In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
 
#Importing WordCloud for text visualization
from wordcloud import WordCloud

#Importing NLTK for natural language processing
import nltk 
from nltk.corpus import stopwords

#Downloading the NLTK resources this notebook relies on
nltk.download('stopwords') #Stopword lists; the English list is read by transform_text
nltk.download('punkt') #Tokenizer data (used by nltk.word_tokenize)


[nltk_data] Downloading package stopwords to /home/amit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/amit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
#Recent NLTK releases make word_tokenize load the 'punkt_tab' resource,
#so it is fetched in addition to 'punkt' (presumably why this cell exists - confirm).
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /home/amit/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [15]:
#Load the labelled messages; 'spam.csv' is resolved relative to the working directory.
#The preview below shows the label in column v1 and the message text in column v2.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [16]:
#Drop the three trailing "Unnamed" columns, keeping only v1 (label) and v2 (text).
#The result is reassigned instead of using inplace=True, so the frame displayed
#by the previous cell is not mutated behind the reader's back.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
#Give the label and text columns descriptive names.
#Reassigned rather than renamed with inplace=True so the step stays chainable
#and does not mutate an object that earlier cells have already displayed.
df = df.rename(columns={'v1':'target', 'v2':'text'})
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
#Data preprocessing
#Encode the string labels as integers; the preview below shows ham -> 0 and spam -> 1.
#The fitted encoder is kept in `encoder` so the mapping can be inspected or inverted later.
from sklearn.preprocessing import LabelEncoder
encoder =LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
#Count rows that repeat an earlier row in every column (same target and same text)
df.duplicated().sum()

np.int64(403)

In [20]:
#(rows, columns) before the duplicates are removed
df.shape


(5572, 2)

In [21]:
#Remove repeated rows, keeping the first occurrence of each.
#The index is not reset, so the original row labels are retained with gaps.
df = df.drop_duplicates(keep= 'first')
df.shape

(5169, 2)

In [22]:
#Feature Engineering
from nltk.stem.porter import PorterStemmer # Importing the Porter Stemmer for text stemming

#Importing the string module for handling special characters
import string

#Creating an instance of the Porter Stemmer
ps = PorterStemmer()


In [23]:
#Lowercase transformation and text preprocessing function
_ENGLISH_STOP_WORDS = None  #Filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        #stopwords.words() returns a plain list; a frozenset gives constant-time
        #membership tests, and caching avoids re-reading it for every token.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def transform_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. lowercase the input;
      2. tokenize it with ``nltk.word_tokenize``;
      3. keep only tokens for which ``str.isalnum()`` is true (this removes
         punctuation and any token containing a non-alphanumeric character);
      4. drop English stopwords;
      5. stem every remaining token with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    stop_words = _english_stop_words()

    #Tokenization using NLTK on the lowercased text
    tokens = nltk.word_tokenize(text.lower())

    #Keep alphanumeric, non-stopword tokens. A separate string.punctuation check
    #is unnecessary: a token that passed isalnum() contains no punctuation.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    #Stem with the Porter stemmer and join the tokens back into a single string
    return " ".join(ps.stem(tok) for tok in kept)

In [24]:
#Quick sanity check of the preprocessing on a sample sentence
transform_text('Go until final point, crazy....Available only in bugss n so what are you thinking!')

'go final point crazi avail bugss n think'

In [25]:
#Run the preprocessing on every message and store the result in a new column.
#NOTE(review): the column name 'tansformed_text' is misspelled ("transformed"); it is left
#unchanged because the vectorizer cell further down reads the same name - rename both together.
df['tansformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text,tansformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [26]:
#TF-IDF vectorizer whose vocabulary is capped at 500 terms (max_features).
#NOTE(review): CountVectorizer is imported but not used in the cells visible here.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tfid = TfidfVectorizer(max_features = 500)


In [27]:
#X: dense TF-IDF matrix built from the preprocessed text; y: the encoded labels as a NumPy array.
#NOTE(review): the vectorizer is fitted on all rows here, before the train/test split in the
#next cell, so test messages influence the vocabulary and IDF weights. Consider fitting on the
#training split only and calling transform() on the test split.
X = tfid.fit_transform(df['tansformed_text']).toarray()
y = df['target'].values


In [28]:
#Train/test split: 20% of the rows are held out for testing; random_state fixes the shuffle
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2, random_state=7)


In [29]:
#Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [37]:
#Candidate classifiers. Every estimator that draws random numbers during fitting is
#seeded with 7 so that a fresh Restart & Run All reproduces the same scores.
svc = SVC(kernel='sigmoid', gamma= 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5, random_state=7) #seeded: equally good splits are chosen at random
lrc = LogisticRegression(solver = 'liblinear', penalty='l1', random_state=7) #seeded: liblinear shuffles the data
rfc = RandomForestClassifier(n_estimators=45, random_state=7)
abc = AdaBoostClassifier(n_estimators=45, random_state=7)
bc = BaggingClassifier(n_estimators=45,random_state=7)
etc = ExtraTreesClassifier(n_estimators=45, random_state=7)
gbc = GradientBoostingClassifier(n_estimators=45, random_state=7)
xgb = XGBClassifier(n_estimators = 45, random_state = 7)

In [38]:
#Display name -> estimator instance. The evaluation loop iterates this dict in
#insertion order and prints each key as the model label.
#NOTE(review): 'ETV' labels the ExtraTreesClassifier (variable `etc`) - possibly meant 'ETC'.
clfs = {
    'SVC':svc,
    'KNN':knc,
    'NB':mnb,
    'DT':dtc,
    'LR':lrc,
    'RF':rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETV': etc,
    'GBC': gbc,
    'xgb':xgb
}

In [39]:
#model evaluation
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs,X_train,y_train,X_test,y_test):
    """Fit one estimator on the training split and score it on the test split.

    Despite the plural name, ``clfs`` is a single estimator object exposing
    ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` as computed by ``accuracy_score`` and
        ``precision_score`` for ``y_test`` against the predictions on ``X_test``.
    """
    clfs.fit(X_train, y_train)
    predictions = clfs.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )

In [41]:
#Train and score every classifier in `clfs`, printing the metrics per model and
#collecting them in two lists that follow the dict's iteration order.
accuracy_scores = []
precision_scores = []
for name, clf in clfs.items():
    #train_classifier fits `clf` on the training split and evaluates it on the test split
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)



For:  SVC
Accuracy:  0.9680851063829787
Precision:  0.9457364341085271

For:  KNN
Accuracy:  0.9303675048355899
Precision:  1.0

For:  NB
Accuracy:  0.9690522243713733
Precision:  0.967741935483871

For:  DT
Accuracy:  0.9284332688588007
Precision:  0.8854166666666666

For:  LR
Accuracy:  0.9622823984526112
Precision:  0.943089430894309

For:  RF
Accuracy:  0.9700193423597679
Precision:  0.9606299212598425

For:  Adaboost
Accuracy:  0.9129593810444874
Precision:  0.8222222222222222

For:  Bgc
Accuracy:  0.9613152804642167
Precision:  0.8913043478260869

For:  ETV
Accuracy:  0.9709864603481625
Precision:  0.9609375

For:  GBC
Accuracy:  0.9410058027079303
Precision:  0.9578947368421052

For:  xgb
Accuracy:  0.9671179883945842
Precision:  0.9672131147540983
