In [2]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('spam.csv', sep = ',')

In [4]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will �_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [5]:
df[['label','sentence']] = df[['v1','v2']]

In [6]:
df.drop(columns=['v1','v2','Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace = True)

In [7]:
df

Unnamed: 0,label,sentence
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
df.isnull().sum()

label       0
sentence    0
dtype: int64

In [9]:
df['label'].value_counts() #as it is not in balancing mode , since 'ham' and 'spam' messages are not balanced..

ham     4825
spam     747
Name: label, dtype: int64

## Balance data

In [10]:
filt1 = df['label'] == 'ham'
ham = df[filt1]
ham

Unnamed: 0,label,sentence
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...
...,...,...
5565,ham,Huh y lei...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [11]:
filt2 = df['label'] == 'spam'
spam = df[filt2]
spam

Unnamed: 0,label,sentence
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [12]:
ham.shape,spam.shape

((4825, 2), (747, 2))

In [13]:
#we have to take 747 random samples from 'ham' to balance with the 'spam'....
ham = ham.sample(spam.shape[0])
ham.shape

(747, 2)

In [14]:
data = ham.append(spam, ignore_index = True)  # 'ignore_index = True'  to provide data in index wise but by changing their 
                                             #original index....
data

Unnamed: 0,label,sentence
0,ham,A bloo bloo bloo I'll miss the first bowl
1,ham,Thts god's gift for birds as humans hav some n...
2,ham,K.k.this month kotees birthday know?
3,ham,Frnd s not juz a word.....not merely a relatio...
4,ham,I donno its in your genes or something
...,...,...
1489,spam,Want explicit SEX in 30 secs? Ring 02073162414...
1490,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
1491,spam,Had your contract mobile 11 Mnths? Latest Moto...
1492,spam,REMINDER FROM O2: To get 2.50 pounds free call...


## Data preparation

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
data.head()

Unnamed: 0,label,sentence
0,ham,A bloo bloo bloo I'll miss the first bowl
1,ham,Thts god's gift for birds as humans hav some n...
2,ham,K.k.this month kotees birthday know?
3,ham,Frnd s not juz a word.....not merely a relatio...
4,ham,I donno its in your genes or something


In [18]:
type(data['sentence'])

pandas.core.series.Series

In [19]:
x_train, x_test, y_train, y_test = train_test_split(data['sentence'], data['label'], test_size = 0.3, random_state = 0,
                                                    shuffle = True, stratify = data['label'])

#shuffle : Whether or not to shuffle the data before splitting..
#stratify : used to show same labels present in dataset output..

## Bag of words creation

##### just for visualization, how to create bag of words...

In [20]:
import string
import spacy

nlp = spacy.load('en_core_web_sm')
punct = string.punctuation
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)



def text_data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens =[]
    for token in doc:
        if token.lemma_ !='-PRON-':
            temp = token.lemma_.lower().strip()  #since all our data prepocessing steps are in lower format..
            
        else:
            temp = token.lower_
        tokens.append(temp)
        
    #removing stopwords and punctuations
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens
        

In [44]:
vectorizer = TfidfVectorizer(tokenizer = text_data_cleaning)

In [45]:
x_train = vectorizer.fit_transform(x_train)


In [46]:
x_train.shape  #i.e in training dataset we have 1045 sentences with 3719 unique words in it

(1045, 3266)

In [47]:
x_train # it compresses the total elements to 17851 . i.e if we multiply (1045*3719) we get 3886355 which is very big...
#remove the top box and see result.

<1045x3266 sparse matrix of type '<class 'numpy.float64'>'
	with 11626 stored elements in Compressed Sparse Row format>

## pipeline and randomforest classifier

In [21]:
clf = Pipeline([('tfidf', TfidfVectorizer(tokenizer = text_data_cleaning)), 
                ('clf', RandomForestClassifier(n_estimators = 400, n_jobs = -1))])


In [22]:
clf.fit(x_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x000001CF1A0CB430>)),
                ('clf', RandomForestClassifier(n_estimators=400, n_jobs=-1))])

In [23]:
y_pred = clf.predict(x_test)  #so here pipeline is used by not transforming x_test again as it automatically transform data..

In [24]:
confusion_matrix(y_test, y_pred)

array([[223,   2],
       [ 28, 196]], dtype=int64)

In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.89      0.99      0.94       225
        spam       0.99      0.88      0.93       224

    accuracy                           0.93       449
   macro avg       0.94      0.93      0.93       449
weighted avg       0.94      0.93      0.93       449



In [26]:
accuracy_score(y_test, y_pred)

0.933184855233853

## it's runtime

In [27]:
clf.predict(['hi this is not kartheek'])

array(['ham'], dtype=object)

In [28]:
clf.predict(['Congratulations!, you got free tickets, to go USA ,TEXT "WON" to 19998'])

array(['spam'], dtype=object)

# predict using SVM

In [29]:
clf = Pipeline([('tfidf', TfidfVectorizer(tokenizer = text_data_cleaning)), ('clf', SVC(C=1000, gamma = 'auto'))])

#here 'c' is penalty


In [30]:
clf.fit(x_train, y_train)


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x000001CF1A0CB430>)),
                ('clf', SVC(C=1000, gamma='auto'))])

In [31]:
y_pred = clf.predict(x_test) 

In [32]:

confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         ham       0.92      0.97      0.95       225
        spam       0.97      0.92      0.94       224

    accuracy                           0.95       449
   macro avg       0.95      0.95      0.95       449
weighted avg       0.95      0.95      0.95       449



In [33]:
accuracy_score(y_test, y_pred)


0.9465478841870824

In [34]:
## it's runtime

clf.predict(['hi this is not kartheek'])

array(['ham'], dtype=object)

In [35]:
clf.predict(['you won free tickets, to go USA this summer'])  #try to change 'won' with 'got' and see the result..

array(['spam'], dtype=object)