In [146]:
# Importing necessary libraries
import numpy as np # For Numerical Operations
import pandas as pd # For data manipulation and analysis
import matplotlib.pyplot as plt # For data visualization
%matplotlib inline

# Import WordCloud for text visualization
from wordcloud import WordCloud

# Importing NLTK for natural language processing(NLP)
import nltk
from nltk.corpus import stopwords #For stopwords

# Fetch the NLTK resources used further down in this notebook
nltk.download('stopwords') # Stopword lists, read later through stopwords.words('english')
nltk.download('punkt') # Punkt tokenizer data
nltk.download('punkt_tab') # Punkt tables; presumably what nltk.word_tokenize loads on newer NLTK releases -- TODO confirm


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Basics Import

In [147]:
# Load the raw dataset from spam.csv (path is relative to the notebook's working directory)
df = pd.read_csv('spam.csv')

# Preview the first rows to check which columns were parsed
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [148]:
# Discard the three unnamed columns; only v1 (label) and v2 (message) are kept
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [149]:
# Give the two remaining columns descriptive names
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [150]:
# Count the rows that DataFrame.duplicated() flags as repeats
df.duplicated().sum()

403

In [151]:
# Total number of rows before duplicates are removed
len(df)

5572

Feature Engineering

In [152]:
# Remove duplicate rows, keeping the first occurrence of each.
# The explicit .copy() makes the result an independent DataFrame: a new column
# is assigned to it later on, and on pandas versions without copy-on-write,
# assigning into a filtered frame can raise SettingWithCopyWarning.
df = df.drop_duplicates(keep='first').copy()
len(df)

5169

In [153]:
# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

# Importing String modules for handling special characters
import string

# Single Porter stemmer instance, reused by transform_text for every token
ps = PorterStemmer()

In [154]:
# English stop words, materialised once as a set. Looking the list up again for
# every token is loop-invariant work and slows preprocessing considerably.
_STOP_WORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. tokenize it with ``nltk.word_tokenize``;
      3. keep only tokens made up entirely of alphanumeric characters;
      4. drop English stop words;
      5. stem every remaining token with the Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The processed tokens joined by single spaces (an empty string when
        no token survives the filters).
    """
    tokens = nltk.word_tokenize(text.lower())

    # A token that passes isalnum() cannot contain punctuation characters, so a
    # separate check against string.punctuation would never reject anything.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in _STOP_WORDS]

    # Stem the surviving tokens and join them back into one string
    return " ".join(ps.stem(tok) for tok in kept)

In [155]:
# Sanity-check the preprocessing on a single raw message
sample_message = 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
transform_text(sample_message)

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [156]:
# Preprocess every message and store the result next to the original text
transformed_messages = df['text'].apply(transform_text)
df['transformed_text'] = transformed_messages
df.head()

Unnamed: 0,target,text,transformed_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [157]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# TF-IDF vectorizer with max_features=500
tfid = TfidfVectorizer(max_features = 500)

In [158]:
# Fit the vectorizer on the preprocessed messages and densify the result
tfidf_matrix = tfid.fit_transform(df['transformed_text'])
X = tfidf_matrix.toarray()

# Labels as a plain array, in the same row order as X
y = df['target'].values

Train Test Split

In [159]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2,
)

Model Training

In [160]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


In [161]:
# Candidate models. Every estimator that takes a random seed is given
# random_state=2 so that re-running the notebook reproduces the same scores.
# DecisionTree, RandomForest and AdaBoost were previously unseeded, unlike the
# other ensembles.
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(random_state=2)
lrc = LogisticRegression()
rfc = RandomForestClassifier(random_state=2)
abc = AdaBoostClassifier(random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

In [162]:
# Display name -> estimator instance, in the order the models are evaluated
clfs = dict(
    SVC=svc,
    KNN=knc,
    MultinomialNB=mnb,
    DecisionTree=dtc,
    LogisticRegression=lrc,
    RandomForest=rfc,
    AdaBoost=abc,
    Bagging=bc,
    ExtraTrees=etc,
    GradientBoosting=gbdt,
    XGBoost=xgb,
)


In [163]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# TF-IDF features over the preprocessed messages (max_features=500)
tfid = TfidfVectorizer(max_features=500)
X = tfid.fit_transform(df['transformed_text']).toarray()

# Integer-encode the string labels: 'ham' -> 0, 'spam' -> 1
le = LabelEncoder()
y = le.fit_transform(df['target'].values)

# 80/20 train-test split with a fixed seed, now using the encoded labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2
)

Model Evaluation

In [164]:
from sklearn.metrics import accuracy_score, precision_score

def train_classifier(clfs, X_train, y_train, X_test, y_test):
    """Fit one estimator and score it on the held-out data.

    Parameters
    ----------
    clfs : estimator
        A single model exposing ``fit`` and ``predict`` (despite the plural name).
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the labels to score against.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` where precision uses ``average='weighted'``,
        i.e. it is averaged over all classes weighted by their support rather
        than reported for a single class.
    """
    clfs.fit(X_train, y_train)
    predictions = clfs.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, average='weighted'),
    )

In [165]:
# Train and score every model in clfs, collecting the metrics in evaluation order
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(
        clf, X_train, y_train, X_test, y_test
    )

    # Record the scores, then report them as percentages
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

    print()
    print("For: ", name)
    print("Accuracy: ", (current_accuracy*100), "%")
    print("Precision: ", (current_precision*100), "%")


For:  SVC
Accuracy:  96.61508704061895 %
Precision:  96.54577802836897 %

For:  KNN
Accuracy:  92.74661508704062 %
Precision:  93.306866238917 %

For:  MultinomialNB
Accuracy:  97.09864603481626 %
Precision:  97.08553897410658 %

For:  DecisionTree
Accuracy:  95.64796905222437 %
Precision:  95.55248494524223 %

For:  LogisticRegression
Accuracy:  96.61508704061895 %
Precision:  96.60868980617258 %

For:  RandomForest
Accuracy:  97.1953578336557 %
Precision:  97.15524983383828 %

For:  AdaBoost
Accuracy:  96.13152804642166 %
Precision:  96.06436998871347 %

For:  Bagging
Accuracy:  96.5183752417795 %
Precision:  96.43565424961713 %

For:  ExtraTrees
Accuracy:  97.29206963249516 %
Precision:  97.24435087275461 %

For:  GradientBoosting
Accuracy:  95.06769825918762 %
Precision:  95.02063529825506 %

For:  XGBoost
Accuracy:  97.00193423597679 %
Precision:  96.97272572049249 %
