Install and Import packages

In [60]:
#import basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#check versions
print(pd.__version__)
print(np.__version__)

# Importing WordCloud for text visualization
from wordcloud import WordCloud

# Importing NLTK for natural language processing
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords # For stopwords

#machine learning package
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

# Importing the string module for handling special characters
import string

2.3.0
2.3.1


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Mishti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mishti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Ingestion

In [39]:

#read input data
#the CSV path is relative to the notebook's working directory
df = pd.read_csv("spam_data.csv")
#preview the first five rows to confirm the file loaded and to inspect the column names
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [40]:
#keep only the label column (v1) and the message column (v2),
#and give them descriptive names for the rest of the notebook
df = (
    df.loc[:, ['v1', 'v2']]
      .rename(columns={'v1': 'target', 'v2': 'text'})
)
df.head(5)

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Data Pre-processing

In [41]:
#encode the target labels as integers (in the preview below: ham -> 0, spam -> 1)
encoder = LabelEncoder()
df = df.assign(target=encoder.fit_transform(df['target']))

#report the row count, drop repeated rows (keeping the first occurrence), then report again
print(f"Rows count before remorving duplicate rows= {df.shape[0]}")
df = df.drop_duplicates(keep='first')
print(f"Rows count after remorving duplicate rows= {df.shape[0]}")
df.head()

Rows count before remorving duplicate rows= 5572
Rows count after remorving duplicate rows= 5169


Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Feature Engineering

In [42]:
#create an instance of Porter Stemmer
#it is shared by the text-processing function, which calls ps.stem on each kept token
ps = PorterStemmer()

In [50]:
#load the English stopword list once and keep it as a set:
#calling stopwords.words('english') for every token rebuilds the list each time
#and turns the membership test into a linear scan
_STOPWORDS = frozenset(stopwords.words('english'))

#create function to transform text for efficient processing
def transform_text(txt):
    """Normalise a raw message into a string of stemmed tokens.

    Steps: lower-case the text, tokenize it with NLTK, keep only the
    alphanumeric tokens that are neither English stopwords nor punctuation,
    stem each kept token with the shared Porter stemmer ``ps``, and join the
    result with single spaces.

    Parameters
    ----------
    txt : str
        The raw message text.

    Returns
    -------
    str
        The processed tokens joined by a single space (empty string when no
        token survives the filtering).
    """
    #convert to lower case and tokenize using nltk
    tokens = nltk.tokenize.word_tokenize(txt.lower())

    #remove special characters, stopwords and punctuation in a single pass
    kept = [
        tok for tok in tokens
        if tok.isalnum() and tok not in _STOPWORDS and tok not in string.punctuation
    ]

    #stem using Porter Stemmer, then join the processed tokens back into a string
    return " ".join(ps.stem(tok) for tok in kept)

#Example text to check function
#expected result: 'hey big boy' (the symbols and the stopwords "how", "are", "you" are dropped)
text = "hey big boy's how @ are you how?"
transform_text(text)


'hey big boy'

In [51]:
#add a processed_text column holding the cleaned, stemmed version of every message
df = df.assign(processed_text=df['text'].apply(transform_text))
df.head(5)

Unnamed: 0,target,text,processed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [53]:
#create instance of tf-idf vectorizer
#max_features = 500 limits the vocabulary to the 500 most frequent terms in the corpus
tfid = TfidfVectorizer(max_features = 500)

In [62]:
#get x and y values for model training
#X is the dense TF-IDF matrix built from the processed text; y holds the encoded target labels
#NOTE(review): the vectorizer is fitted on every row before the train/test split,
#so vocabulary and IDF statistics from the eventual test rows influence the training features.
#Consider splitting first and fitting on the training text only - TODO confirm whether this is intended
X = tfid.fit_transform(df['processed_text']).toarray()
y = df['target'].values

Train Test Split

In [65]:
#split data into train and test
#test_size = 0.20 holds out 20% of the rows for evaluation;
#random_state = 2 fixes the shuffle so the same split is produced on every run
X_train, X_test , y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 2)

Model Development

In [66]:
#import ML libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [67]:
#instantiate one estimator per algorithm to be compared

#kernel, instance-based and probabilistic models
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()

#single tree and linear model
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver="liblinear", penalty="l1")

#ensembles: all use 50 estimators and the same seed
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

In [68]:
#collect the estimators under short display names so they can all be
#trained and evaluated in a single loop (insertion order is the evaluation order)
clfs = dict(
    SVC=svc,
    KNN=knc,
    NB=mnb,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    Adaboost=abc,
    Bgc=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

In [69]:
#create classifier model
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    """Fit one classifier and score it on the held-out data.

    Parameters
    ----------
    clfs : estimator
        A single estimator exposing ``fit`` and ``predict``.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and labels used for scoring.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` of the predictions on ``X_test``.
    """
    clfs.fit(X_train, y_train)
    predictions = clfs.predict(X_test)
    return accuracy_score(y_test, predictions), precision_score(y_test, predictions)

Model evaluation

In [70]:
#train every classifier and record its accuracy and precision on the test set
accuracy_scores = []
precision_scores = []
#the loop variable must not be called `clfs`: that would rebind the dictionary to the
#last classifier, so re-running this cell (or any later use of clfs.items()/clfs.keys()) would fail
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)

    #keep the scores in the same order as the dictionary for later comparison
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9671179883945842
Precision:  0.9333333333333333

For:  KNN
Accuracy:  0.9274661508704062
Precision:  1.0

For:  NB
Accuracy:  0.9709864603481625
Precision:  0.9655172413793104

For:  DT
Accuracy:  0.9361702127659575
Precision:  0.9

For:  LR
Accuracy:  0.9632495164410058
Precision:  0.9629629629629629

For:  RF
Accuracy:  0.9700193423597679
Precision:  0.9421487603305785

For:  Adaboost
Accuracy:  0.9235976789168279
Precision:  0.8734177215189873

For:  Bgc
Accuracy:  0.9622823984526112
Precision:  0.9024390243902439

For:  ETC
Accuracy:  0.9709864603481625
Precision:  0.921875

For:  GBDT
Accuracy:  0.9497098646034816
Precision:  0.93

For:  xgb
Accuracy:  0.9690522243713733
Precision:  0.9568965517241379
