<a href="https://colab.research.google.com/github/ChahakSaklecha/SMS-Email-Classifier/blob/main/sms_email_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [87]:
import numpy as np
import pandas as pd

In [88]:
df = pd.read_csv('/content/drive/MyDrive/spam-sms-detector/spam.csv')

In [89]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
1762,ham,Sometimes Heart Remembrs someone Very much... ...,,,
5196,spam,Spook up your mob with a Halloween collection ...,,,
3988,ham,Ok lor. Anyway i thk we cant get tickets now c...,,,
5239,ham,"Jay wants to work out first, how's 4 sound?",,,
5474,ham,Where's mummy's boy ? Is he being good or bad ...,,,


#1. Data Cleaning

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [92]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace = True)

In [None]:
df.sample(5)


In [None]:
#renaming the cols
df.rename(columns={'v1':'target','v2':'text' },inplace=True)
df.sample(5)

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
df['target'] = encoder.fit_transform(df['target'])

In [None]:
df.head()

In [None]:
#missing values
df.isnull().sum()

In [None]:
#check for duplicate values
df.duplicated().sum()


In [None]:
#remove dupliactes
df = df.drop_duplicates(keep='first')

df.duplicated().sum()

In [None]:
df.shape

# 2.EDA

In [None]:
#Exploratory data analysis - understanding of analysis

In [None]:
df.head()

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
plt.pie(df['target'].value_counts(), labels=['ham','spam'], autopct="%0.2f")
plt.show()

In [None]:
#data is imbalanced

In [None]:
!pip install nltk

In [None]:
import nltk
nltk.download('punkt')

In [None]:
df['num_characters'] = df['text'].apply(len)

In [None]:
df.head()

In [None]:
#fetching number of words

In [None]:
df['num_words'] = df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
df.head()

In [None]:
df['num_sentences'] = df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
df.head()

In [None]:
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
#ham
df[df['target']==0][['num_characters','num_words','num_sentences']].describe()

In [None]:
#spam
df[df['target']==1][['num_characters','num_words','num_sentences']].describe()

In [None]:
import seaborn as sns

In [None]:
plt.figure(figsize=(12,6))
sns.histplot(df[df['target']==0]['num_characters'])
sns.histplot(df[df['target']==1]['num_characters'],color='red')

In [None]:
plt.figure(figsize=(12,6))
sns.histplot(df[df['target']==0]['num_words'])
sns.histplot(df[df['target']==1]['num_words'],color='red')

In [None]:
sns.pairplot(df,hue='target')

In [None]:
sns.heatmap(df.corr(),annot=True)

# 3.Data Preprocessing
      Lower case
      Tokenization
      Removing special char
      Removing stop words and punctuations
      Stemming

In [None]:
def transform_text(text):
  text=text.lower()
  text=nltk.word_tokenize(text)

  y=[]
  for i in text:
    if i.isalnum():
      y.append(i)


  test=y[:]  #cloning
  y.clear()

  for i in text:
    if i not in stopwords.words('english') and i not in string.punctuation:
      y.append(i)


  text=y[:]
  y.clear()

  for i in text:
    y.append(ps.stem(i))


  return " ".join(y)


In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')


In [None]:
import string
string.punctuation

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('loving')

In [None]:
df['transformed_text'] = df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width=500, height=500, min_font_size=10,background_color='white')

In [None]:
spam_wc = wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(12,6))
plt.imshow(spam_wc)

In [None]:
ham_wc = wc.generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(12,6))
plt.imshow(ham_wc)

In [None]:
df.head()

In [None]:
spam_corpus =[]
for msg in df[df['target']==1]['transformed_text'].tolist():
  for word in msg.split():
    spam_corpus.append(word)

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter

spam_most_common_df = pd.DataFrame(Counter(spam_corpus).most_common(30))
sns.barplot(x=spam_most_common_df[0], y=spam_most_common_df[1])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
ham_corpus =[]
for msg in df[df['target']==0]['transformed_text'].tolist():
  for word in msg.split():
    ham_corpus.append(word)

In [None]:
len(ham_corpus)

In [None]:
from collections import Counter

ham_most_common_df = pd.DataFrame(Counter(ham_corpus).most_common(30))
sns.barplot(x=ham_most_common_df[0], y=ham_most_common_df[1])


In [None]:
df.head()

# 4.Model Building


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)


In [None]:
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [None]:
y = df['target'].values

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()


In [None]:
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

In [None]:
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

In [None]:
#tfidf -> bnb

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
svc = SVC (kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
bnb = BernoulliNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier (n_estimators=50, random_state=2)
bc = BaggingClassifier (n_estimators=50, random_state=2)
etc = ExtraTreesClassifier (n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier (n_estimators=50, random_state=2)
xgb = XGBClassifier (n_estimators=50, random_state=2)

In [None]:
clfs = {
    'SVC' : svc,
    'KN' : knc,
    'NB' : bnb,
    'DT' : dtc,
    'LR' : lrc,
    'RF' : rfc,
    'AdaBoost' : abc,
    'BgC' : bc,
    'ETC' : etc,
    'GBDT' : gbdt,
    'xgb' : xgb
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
  clf.fit(X_train,y_train)
  y_pred = clf.predict(X_test)
  accuracy = accuracy_score(y_test,y_pred)
  precision = precision_score(y_test,y_pred)

  return accuracy,precision

In [None]:
train_classifier(svc,X_train,y_train,X_test,y_test)

In [93]:
accuracy_scores = []
precision_scores = []
for name, clf in clfs.items():
  current_accuracy, current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)

  print("For",name)
  print("Accuracy - ", current_accuracy)
  print("Precision - ",current_precision)

  accuracy_scores.append(current_accuracy)
  precision_scores.append(current_precision)

For SVC
Accuracy -  0.9758220502901354
Precision -  0.959349593495935
For KN
Accuracy -  0.9061895551257253
Precision -  1.0
For NB
Accuracy -  0.988394584139265
Precision -  1.0
For DT
Accuracy -  0.9303675048355899
Precision -  0.83
For LR
Accuracy -  0.971953578336557
Precision -  0.9658119658119658
For RF
Accuracy -  0.97678916827853
Precision -  0.9830508474576272
For AdaBoost
Accuracy -  0.971953578336557
Precision -  0.9504132231404959
For BgC
Accuracy -  0.9584139264990329
Precision -  0.8571428571428571
For ETC
Accuracy -  0.9806576402321083
Precision -  0.9682539682539683
For GBDT
Accuracy -  0.9468085106382979
Precision -  0.946236559139785
For xgb
Accuracy -  0.9729206963249516
Precision -  0.9508196721311475


In [94]:
performance_df = pd.DataFrame({'Algorithm':clfs.keys(), 'Accuracy' : accuracy_scores, 'Precision' : precision_scores}).sort_values('Precision',ascending = False)

In [95]:
performance_df

Unnamed: 0,Algorithm,Accuracy,Precision
1,KN,0.90619,1.0
2,NB,0.988395,1.0
5,RF,0.976789,0.983051
8,ETC,0.980658,0.968254
4,LR,0.971954,0.965812
0,SVC,0.975822,0.95935
10,xgb,0.972921,0.95082
6,AdaBoost,0.971954,0.950413
9,GBDT,0.946809,0.946237
7,BgC,0.958414,0.857143


In [96]:
#model.improve
#1. Change the max_features of tdif

In [97]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(), 'Accuracy_max_ft_3000' : accuracy_scores, 'Precision_max_ft_3000' : precision_scores}).sort_values('Precision_max_ft_3000',ascending = False)

In [98]:
performance_df.merge(temp_df,on='Algorithm')

Unnamed: 0,Algorithm,Accuracy,Precision,Accuracy_max_ft_3000,Precision_max_ft_3000
0,KN,0.90619,1.0,0.90619,1.0
1,NB,0.988395,1.0,0.988395,1.0
2,RF,0.976789,0.983051,0.976789,0.983051
3,ETC,0.980658,0.968254,0.980658,0.968254
4,LR,0.971954,0.965812,0.971954,0.965812
5,SVC,0.975822,0.95935,0.975822,0.95935
6,xgb,0.972921,0.95082,0.972921,0.95082
7,AdaBoost,0.971954,0.950413,0.971954,0.950413
8,GBDT,0.946809,0.946237,0.946809,0.946237
9,BgC,0.958414,0.857143,0.958414,0.857143


In [99]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(), 'Accuracy_scaling' : accuracy_scores, 'Precision_scaling' : precision_scores}).sort_values('Precision_scaling',ascending = False)

In [100]:
new_df_scaled = performance_df.merge(temp_df,on='Algorithm')

In [101]:
new_df_scaled

Unnamed: 0,Algorithm,Accuracy,Precision,Accuracy_scaling,Precision_scaling
0,KN,0.90619,1.0,0.90619,1.0
1,NB,0.988395,1.0,0.988395,1.0
2,RF,0.976789,0.983051,0.976789,0.983051
3,ETC,0.980658,0.968254,0.980658,0.968254
4,LR,0.971954,0.965812,0.971954,0.965812
5,SVC,0.975822,0.95935,0.975822,0.95935
6,xgb,0.972921,0.95082,0.972921,0.95082
7,AdaBoost,0.971954,0.950413,0.971954,0.950413
8,GBDT,0.946809,0.946237,0.946809,0.946237
9,BgC,0.958414,0.857143,0.958414,0.857143


In [102]:
import pickle
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(bnb,open('model.pkl','wb'))