In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated spam/ham message dataset.
# A forward slash is used as the separator: '\s' inside a normal string
# literal is an invalid escape sequence (newer Python versions warn about it),
# and a backslash path only resolves on Windows.
df = pd.read_csv('data/spam.tsv', sep='\t')

In [3]:
# Preview the first five rows to check that the columns were parsed.
df.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# (rows, columns) of the raw frame.
df.shape

(5572, 4)

In [5]:
# Partition the rows by class label so the two groups can be handled separately.
ham = df.loc[df['label'].eq('ham')]
spam = df.loc[df['label'].eq('spam')]

In [6]:
# Shape of each class subset, ham first then spam; the row counts show how
# imbalanced the raw data is.
ham.shape, spam.shape

((4825, 4), (747, 4))

In [8]:
# Downsample ham to the spam row count so both classes are equally represented.
# The fixed seed makes the same ham rows get drawn every time this cell runs
# on the same input; without it the balanced dataset changes on each run.
ham = ham.sample(n=spam.shape[0], random_state=0)

In [9]:
# Stack the ham rows on top of the spam rows and renumber the index from 0.
data = pd.concat((ham, spam), axis=0, ignore_index=True)

In [10]:
# First five rows of the combined frame (the ham rows come first).
data.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,Are you free now?can i call now?,32,2
1,ham,"Sorry, I'll call later",22,2
2,ham,I guess that's why you re worried. You must kn...,300,14
3,ham,LOL that would be awesome payback.,34,1
4,ham,Mom wants to know where you at,30,0


In [11]:
# Verify that the two classes now have the same number of rows.
data['label'].value_counts()

ham     747
spam    747
Name: label, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
 data.rename(columns={'label':'Target','message':'Text'},inplace=True)

In [14]:
# Class counts again, read through the renamed 'Target' column.
data['Target'].value_counts()

ham     747
spam    747
Name: Target, dtype: int64

In [15]:
# Hold out 30% of the messages for evaluation; the fixed seed makes the
# split repeatable for the same input frame.
X_train, X_test, y_train, y_test = train_test_split(
    data['Text'],
    data['Target'],
    test_size=0.3,
    random_state=0,
)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# TF-IDF vectorizer left at the library's default settings (no arguments passed).
tfidf = TfidfVectorizer()

In [18]:
# Display the training text Series; pandas abbreviates the middle rows in
# the rendered output.
X_train

1438      You have 1 new message. Please call 08715205273
431     Hey. What happened? U switch off ur cell d who...
194                        I'm in a movie. Call me 4 wat?
240     Ok then no need to tell me anything i am going...
1309    Had your mobile 11 months or more? U R entitle...
                              ...                        
763     Your free ringtone is waiting to be collected....
835     XCLUSIVE@CLUBSAISAI 2MOROW 28/5 SOIREE SPECIAL...
1216    Dorothy@kiefer.com (Bank of Granite issues Str...
559                […] anyway, many good evenings to u! s
684     Good afternoon, my love. How goes your day ? W...
Name: Text, Length: 1045, dtype: object

In [19]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to encode the test text into the same feature space.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

# Dense copies of the two sparse matrices.
# NOTE(review): neither dense array is used by any cell visible here —
# confirm nothing else needs them before dropping this line.
X_train_arr, X_test_arr = X_train_tf.toarray(), X_test_tf.toarray()

In [20]:
# Dimensions of the training feature matrix and its container type
# (a SciPy sparse matrix, per the output below).
X_train_tf.shape, type(X_train_tf)

((1045, 3634), scipy.sparse._csr.csr_matrix)

In [21]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Random forest with default hyperparameters.
# random_state is fixed so that fitting on the same training data yields the
# same forest (and therefore the same predictions) on every run; without it
# each run builds a different model.
classifier = RandomForestClassifier(random_state=0)

In [24]:
# Train on the sparse TF-IDF matrix and the matching training labels.
classifier.fit(X=X_train_tf, y=y_train)

In [25]:
# Container type and shape of the training feature matrix.
type(X_train_tf), X_train_tf.shape

(scipy.sparse._csr.csr_matrix, (1045, 3634))

In [26]:
# Container type and shape of the test feature matrix; its column count
# should equal the training matrix's, since both come from the same vectorizer.
type(X_test_tf), X_test_tf.shape

(scipy.sparse._csr.csr_matrix, (449, 3634))

In [27]:
# Predict a label for every held-out message.
y_pred = classifier.predict(X_test_tf)

In [28]:
# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.955456570155902

In [29]:
# Per-class precision / recall / F1; print() is needed because the report is
# a multi-line string.
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

         ham       0.92      1.00      0.96       227
        spam       1.00      0.91      0.95       222

    accuracy                           0.96       449
   macro avg       0.96      0.96      0.96       449
weighted avg       0.96      0.96      0.96       449



In [30]:
# Hand-written messages for a quick sanity check of the trained model.
# Each one is wrapped in a one-element list because it is passed to
# tfidf.transform() as a collection of documents.
test1 = ['Hello, You are learning natural Language Processing']
test2 = ['Hope you are doing good and learning new things !']
# Prize/lottery wording with a call-to-action number.
test3 = ['Congratulations, You won a lottery ticket worth $1 Million ! To claim call on 446677']
# Gibberish input with no recognisable words.
test4 = ['fgbnlsdkfmdaFJLfbvnlfm;knbdkfvn']

In [31]:
# Encode each sample with the already-fitted vectorizer and print the
# predicted label, one line per sample, in test1..test4 order.
for sample in (test1, test2, test3, test4):
    print(classifier.predict(tfidf.transform(sample)))

['ham']
['ham']
['spam']
['ham']
