## Basic Imports

In [46]:
from io import IncrementalNewlineDecoder
# importing necessary libraries
import numpy as np            # for numeric operations
import pandas as pd           # for data manipulation and analysis
import matplotlib.pyplot as plt   # for data visualization
%matplotlib inline

# Importing WordCloud for text visualization
from wordcloud import wordcloud

# Importing NLTK for natural language processing
import nltk
from nltk.corpus import stopwords     # for stopwords

# Downloading NLTK data
nltk.download("stopwords")      # Downloading stopwords data
nltk.download("punkt")          # Downloading tokenizer data


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [47]:
# Download the NLTK resources used by this notebook.
# NOTE: "stopwords" and "punkt" are already requested at the end of the imports
# cell above; they are repeated here, and the recorded output shows them
# reported as already up-to-date.
nltk.download("stopwords")      # stop-word lists read through nltk.corpus.stopwords
nltk.download("punkt")          # tokenizer data
nltk.download('punkt_tab')    # punkt_tab tokenizer data (presumably what nltk.word_tokenize loads in recent NLTK releases -- confirm)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [48]:
# Read the dataset; "spam.csv" is resolved relative to the notebook's working directory
df = pd.read_csv("spam.csv")

# Display the first few rows of the dataframe to inspect the raw columns
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [49]:
# Drop the unnecessary "Unnamed" columns.
# The result is reassigned (no inplace mutation) and errors='ignore' skips
# columns that are already gone, so re-running this cell after a first
# execution no longer raises a KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [50]:
# Give the label column (v1) and the message column (v2) descriptive names
df = df.rename(columns={'v1': 'target', 'v2': 'text'})
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [51]:
# Class balance: number of messages per label
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
ham,4825
spam,747


## Data Preprocessing

In [52]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes; in the preview below,
# "ham" is shown as 0 and "spam" as 1.
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

# Preview the encoded target column
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [53]:
# Check duplicate values: number of rows that are exact repeats of an earlier row.
# NOTE(review): none of the visible cells removes these duplicates, so identical
# messages can end up in both the train and the test split further down and
# inflate the reported scores -- consider df.drop_duplicates(keep='first')
# before vectorizing.
df.duplicated().sum()

403

In [54]:
# Total number of rows, for comparison with the duplicate count
len(df)

5572

In [55]:
# Share of duplicated rows, expressed as a percentage of all rows
df.duplicated().sum()/len(df)*100

7.232591529073941

## Feature Engineering

In [56]:
# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

# Importing the string module for handling special characters
import string

# Creating an instance of the Porter stemmer; the text preprocessing function
# defined in the next cell calls ps.stem on every kept token
ps = PorterStemmer()

In [57]:
# Lowercase transformation and text preprocessing function
def transform_text(text):
  """Normalize one message into a space-separated string of stemmed tokens.

  Steps, in order:
    1. lowercase the text,
    2. tokenize it with ``nltk.word_tokenize``,
    3. keep only tokens made entirely of alphanumeric characters,
    4. drop English stop words,
    5. stem every remaining token with the module-level Porter stemmer ``ps``.

  Parameters:
    text: the raw message as a ``str``.

  Returns:
    A single string with the processed tokens joined by one space
    (an empty string when no token survives the filters).
  """
  # Load the stop-word list once per call and keep it in a set. The previous
  # version re-read the list for every token and scanned it linearly.
  stop_words = set(stopwords.words('english'))

  tokens = nltk.word_tokenize(text.lower())

  # A token that passes isalnum() contains no punctuation characters, so a
  # separate string.punctuation check would never reject anything here.
  kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

  # Stem the surviving tokens and join them back into a single string
  return " ".join(ps.stem(tok) for tok in kept)

In [58]:
# Sanity check: run the preprocessing function on a single sample message
transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... cine there got amore wat...')

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [59]:
# Preprocess every message and keep the result next to the raw text
df['transform_text'] = df['text'].map(transform_text)
df.head()

Unnamed: 0,target,text,transform_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [60]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# TF-IDF vectorizer; max_features=500 caps the size of the learned vocabulary
tfid = TfidfVectorizer(max_features= 500)

In [61]:
# Build the dense TF-IDF feature matrix (X) and the label vector (y).
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split that follows, so the vocabulary and IDF weights also reflect the test
# messages. Fitting on the training portion only and calling transform on the
# test portion would avoid that leakage.
X = tfid.fit_transform(df['transform_text']).toarray()
y = df['target'].values

## Train Test Split

In [62]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=2 fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

## Model Training

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [64]:
# Candidate estimators for the comparison.
# Every estimator that uses randomness during fitting receives random_state=2
# so a Restart & Run All reproduces the same scores. The decision tree and the
# liblinear logistic regression were previously left unseeded, unlike the
# ensemble models.
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

In [65]:

# Registry of display name -> estimator instance; the insertion order below is
# the order in which the models are trained and reported.
clfs = dict([
    ('SVC', svc),
    ('KNN', knc),
    ('NB', mnb),
    ('DT', dtc),
    ('LR', lrc),
    ('RF', rfc),
    ('Adaboost', abc),
    ('Bgc', bc),
    ('ETC', etc),
    ('GBDT', gbdt),
    ('xgb', xgb),
])

## Model Evaluation

In [66]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    """Fit one estimator on the training split and score it on the test split.

    Parameters:
        clfs: a single estimator object exposing ``fit`` and ``predict``
            (despite the plural name, the evaluation loop passes one model per call).
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for scoring.

    Returns:
        A tuple ``(accuracy, precision)`` computed with ``accuracy_score`` and
        ``precision_score`` on the test-split predictions.
    """
    clfs.fit(X_train, y_train)
    predicted = clfs.predict(X_test)
    return accuracy_score(y_test, predicted), precision_score(y_test, predicted)

In [67]:
# Per-model metrics, collected in the same order as the entries of clfs
accuracy_scores, precision_scores = [], []

# Train and score each registered model, printing its metrics as it finishes
for name, clf in clfs.items():
    acc, prec = train_classifier(clf, X_train, y_train, X_test, y_test)

    print()
    print("For: ", name)
    print("Accuracy: ", acc)
    print("Precision: ", prec)

    accuracy_scores.append(acc)
    precision_scores.append(prec)


For:  SVC
Accuracy:  0.9632286995515695
Precision:  0.9606299212598425

For:  KNN
Accuracy:  0.9273542600896861
Precision:  0.9873417721518988

For:  NB
Accuracy:  0.9659192825112107
Precision:  0.9838709677419355

For:  DT
Accuracy:  0.9183856502242153
Precision:  0.845360824742268

For:  LR
Accuracy:  0.9596412556053812
Precision:  0.959349593495935

For:  RF
Accuracy:  0.9713004484304932
Precision:  0.9846153846153847

For:  Adaboost
Accuracy:  0.9112107623318386
Precision:  0.8933333333333333

For:  Bgc
Accuracy:  0.9623318385650225
Precision:  0.9142857142857143

For:  ETC
Accuracy:  0.9695067264573991
Precision:  0.9696969696969697

For:  GBDT
Accuracy:  0.9399103139013453
Precision:  0.941747572815534

For:  xgb
Accuracy:  0.9641255605381166
Precision:  0.9682539682539683
