In [1]:
# Core scientific-Python libraries used throughout the notebook
import numpy as np              # for numerical operations
import pandas as pd             # for data manipulation and analysis
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline

# WordCloud is imported for text visualization
from wordcloud import WordCloud

# NLTK provides the stopword list and the tokenizer used for text cleaning
import nltk
from nltk.corpus import stopwords # English stopword list, used in transform_text

# Treebank tokenizer instance; transform_text reads this module-level name
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

# Fetch the NLTK data packages (a no-op when they are already up to date)
# NOTE(review): TreebankWordTokenizer is regex-based, so the 'punkt' download
# looks unnecessary for the cells shown here - confirm no later cell needs it.
nltk.download('stopwords') # stopword corpus read by stopwords.words('english')
nltk.download('punkt')     # punkt tokenizer data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NITHIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NITHIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the raw SMS dataset from the working directory
data = pd.read_csv('spam.csv')

# Preview the top five records (five is the default for head())
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Drop the three unnamed trailing columns in place (they appear empty in the preview above)
data.drop(
    labels=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    axis='columns',
    inplace=True,
)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Give the two remaining columns descriptive names: v1 -> target, v2 -> text
data.rename(
    {'v1': 'target', 'v2': 'text'},
    axis='columns',
    inplace=True,
)
data.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Preprocessing

In [5]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the target column, then apply it.
# In the preview below 'ham' becomes 0 and 'spam' becomes 1.
encoder = LabelEncoder()
encoder.fit(data['target'])
data['target'] = encoder.transform(data['target'])

data.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Count rows that repeat an earlier row (the first occurrence is not counted)
data.duplicated(keep='first').sum()

np.int64(403)

In [7]:
# Number of rows before de-duplication
data.shape[0]

5572

In [8]:
# Keep only the first occurrence of each duplicated row, then report the new row count
data = data.drop_duplicates()
data.shape[0]

5169

### Feature Engineering

In [9]:
# string.punctuation is referenced by transform_text
import string

# Porter stemmer reduces each word to its stem
from nltk.stem.porter import PorterStemmer

# Shared stemmer instance; transform_text reads this module-level name
ps = PorterStemmer()

In [10]:
# Re-create the stemmer and tokenizer that transform_text reads.
# Both names were already bound in earlier cells; this repeats that setup.
ps = PorterStemmer()
tokenizer = TreebankWordTokenizer()

# Holds the English stopword set after the first lookup.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed keywords.

    Steps: lowercase the text, tokenize it with the module-level ``tokenizer``,
    keep only alphanumeric tokens that are not English stopwords, stem each
    remaining token with the module-level ``ps`` stemmer, and join the stems
    with single spaces.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned, stemmed tokens joined by single spaces (an empty string
        when no token survives the filters).
    """
    # The previous version evaluated stopwords.words('english') once per token
    # and searched the resulting list; a cached frozenset is loaded once and
    # gives constant-time membership tests.
    stop_words = _english_stopwords()

    tokens = tokenizer.tokenize(text.lower())

    # A token that passes isalnum() contains no punctuation characters, so the
    # former `token not in string.punctuation` test could never reject anything;
    # the two filters are therefore merged into one pass.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]

    return " ".join(ps.stem(token) for token in kept)

In [11]:
# Sanity-check transform_text on a single sample message
sample_message = 'Go until Journey point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
transform_text(sample_message)

'go journey point avail bugi n great world la e buffet cine got amor wat'

In [12]:
# Run transform_text over every message and store the result in a new column
data['transformed_text'] = data['text'].map(transform_text)
data.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point avail bugi n great world la e ...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF vectorizer with the vocabulary capped at 500 features
tfid = TfidfVectorizer(max_features=500)

In [14]:
# Dense TF-IDF feature matrix built from the cleaned text.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in the next cell, so its vocabulary and IDF weights also see the
# held-out rows - confirm this is acceptable for the reported scores.
x = (
    tfid
    .fit_transform(data['transformed_text'])
    .toarray()
)

# Encoded labels as a NumPy array
y = data['target'].to_numpy()

### Train Test Split

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(x, y, random_state=2, test_size=0.2)

### Model Training

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [17]:
# Settings shared by the five scikit-learn ensemble models below
ensemble_kwargs = dict(n_estimators=50, random_state=2)

# One instance per candidate model
svc = SVC(gamma=1.0, kernel='sigmoid')
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(penalty='l1', solver='liblinear')
rfc = RandomForestClassifier(**ensemble_kwargs)
abc = AdaBoostClassifier(**ensemble_kwargs)
bc = BaggingClassifier(**ensemble_kwargs)
etc = ExtraTreesClassifier(**ensemble_kwargs)
gbdt = GradientBoostingClassifier(**ensemble_kwargs)
# NOTE(review): xgb uses 5 estimators while the other ensembles use 50 - confirm this is intentional.
xgb = XGBClassifier(random_state=2, n_estimators=5)

In [18]:
# Display name -> estimator, in the order the models are evaluated
clfs = dict(
    SVC=svc,
    KNN=knc,
    NB=mnb,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    Adaboost=abc,
    Bgc=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

### Model Evaluation

In [19]:
from sklearn.metrics import accuracy_score,precision_score

def train_classifier(clfs,X_train,y_train,X_test,y_test):
    """Fit one estimator and score it on the held-out data.

    Parameters
    ----------
    clfs : estimator
        A single object exposing ``fit`` and ``predict`` (despite the plural name).
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the labels used for scoring.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` as computed by ``accuracy_score`` and
        ``precision_score`` on the test predictions.
    """
    clfs.fit(X_train, y_train)
    predictions = clfs.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )

In [20]:
# Train and score every candidate model, collecting the metrics in evaluation order.
accuracy_scores = []
precision_scores = []

# The loop variable is named `clf`, not `clfs`: reusing `clfs` rebound the
# dictionary name to the last estimator, so re-running this cell (or calling
# clfs.keys() / clfs.items() afterwards) failed.
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print()
    print("For: ",name)
    print('Accuracy: ',current_accuracy)
    print('Precision: ',current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9671179883945842
Precision:  0.9333333333333333

For:  KNN
Accuracy:  0.925531914893617
Precision:  0.9841269841269841

For:  NB
Accuracy:  0.9690522243713733
Precision:  0.9649122807017544

For:  DT
Accuracy:  0.9352030947775629
Precision:  0.8901098901098901

For:  LR
Accuracy:  0.960348162475822
Precision:  0.9292035398230089

For:  RF
Accuracy:  0.9700193423597679
Precision:  0.9349593495934959

For:  Adaboost
Accuracy:  0.9245647969052224
Precision:  0.8947368421052632

For:  Bgc
Accuracy:  0.9584139264990329
Precision:  0.8740157480314961

For:  ETC
Accuracy:  0.9709864603481625
Precision:  0.9285714285714286

For:  GBDT
Accuracy:  0.9506769825918762
Precision:  0.9484536082474226

For:  xgb
Accuracy:  0.9390715667311412
Precision:  0.9213483146067416
