In [5]:
import numpy as np
import pandas as pd

In [6]:
# Load the e-mail spam dataset from 'spam.csv' in the working directory.
# NOTE(review): latin1 is presumably used because the file is not valid UTF-8 — confirm.
df = pd.read_csv('spam.csv', encoding='latin1')

In [7]:
# Peek at five random rows of the freshly loaded frame (unseeded, so rows vary per run).
df.sample(5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
516,3189,ham,Subject: suemar berryman well\r\nthe inlet to ...,0
3570,335,ham,Subject: purchasing computer equipment\r\nin o...,0
2641,2488,ham,Subject: equistar feb - 01\r\ni think i found ...,0
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
4978,3553,ham,Subject: fw : meter 98 - 7266 ; may 2001 texas...,0


In [8]:
# (rows, columns) of the raw frame.
df.shape

(5171, 4)

In [9]:
# 1. Data cleaning
# 2. EDA (Exploratory Data Analysis)
# 3. Text preprocessing
# 4. Model building
# 5. Evaluation
# 6. Improvement
# 7. Website

## 1. Data Cleaning

In [10]:
# Column names, dtypes and non-null counts — used below to decide which columns to keep.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [11]:
# Drop the columns the analysis does not use. df.info() above lists the actual
# schema as: 'Unnamed: 0', 'label', 'text', 'label_num' (there are no
# 'Unnamed: 2/3/4' columns in this file, which is why the old drop raised KeyError).
#   * 'Unnamed: 0' is a per-row integer id; keeping it would make every row
#     unique and hide duplicate messages from df.duplicated() later on.
#   * 'label_num' looks like a numeric copy of 'label' (NOTE(review): confirm);
#     the numeric target is re-derived from the label column further below.
# errors='ignore' keeps the cell idempotent when it is re-run on the already
# trimmed frame.
df = df.drop(columns=['Unnamed: 0', 'label_num'], errors='ignore')

In [None]:
# Spot-check five random rows.
df.sample(5)

In [None]:
# Rename the class-label column to 'target', the name every later cell uses.
# This file's columns are 'label' and 'text' (see the df.info() output), not
# 'v1'/'v2'; renaming those non-existent names was a silent no-op and left the
# frame without a 'target' column. 'text' already has the expected name.
df = df.rename(columns={'label': 'target'})

In [None]:
# Spot-check five random rows.
df.sample(5)

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
df['target'] = encoder.fit_transform(df['target'])

In [None]:
df.head(5)

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
df = df.drop_duplicates(keep = 'first')

In [None]:
df.duplicated().sum()

In [None]:
df.shape

## 2. EDA

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
plt.pie(df['target'].value_counts(), labels=['ham','spam'],autopct="%0.2f")
plt.show()

In [None]:
# data is imbalanced

In [None]:
import nltk

In [None]:
# Fetch the NLTK resources used later in the notebook: 'stopwords' backs
# stopwords.words('english'); 'punkt_tab' is presumably the tokenizer data
# needed by word_tokenize / sent_tokenize — TODO confirm for the installed NLTK.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
df['num_characters']=df['text'].apply(len)

In [None]:
df['num_words'] = df['text'].apply(lambda x: len(nltk.word_tokenize(x)))

In [None]:
df['num_sentences'] = df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
df.head()

In [None]:
df[['num_characters', 'num_words', 'num_sentences']].describe()

In [None]:
df[df['target'] ==0][['num_characters', 'num_words', 'num_sentences']].describe()

In [None]:
df[df['target']==1][['num_characters', 'num_words', 'num_sentences']].describe()

In [None]:
import seaborn as sns   

In [None]:
plt.figure(figsize=(12,6))
sns.histplot(df[df['target']==0]['num_characters'])
sns.histplot(df[df['target']==1]['num_characters'], color='red')

In [None]:
plt.figure(figsize=(12,6))
sns.histplot(df[df['target']==0]['num_words'])
sns.histplot(df[df['target']==1]['num_words'], color='red')

In [None]:
sns.pairplot(df,hue= 'target')

In [None]:
correlation_matrix = df.select_dtypes(include=['number']).corr()
sns.heatmap(correlation_matrix, annot=True)


## 3. Data Preprocessing
1. Convert text to **lowercase**.
2. Perform **tokenization** (split text into individual words or tokens).
3. Remove **special characters**.
4. Eliminate **stop words** and **punctuation**.
5. Apply **stemming** (reduce words to their root form).

In [None]:
from nltk.corpus import stopwords
import string

In [None]:
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; transform_text below calls ps.stem on each token.
ps = PorterStemmer()
# Quick sanity check of the stemmer on a sample word.
ps.stem('laughing')


In [None]:
# Per-language cache so the stop-word corpus is read once, not once per token.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def transform_text(text):
    """Normalise one raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text;
      2. tokenize with ``nltk.word_tokenize``;
      3. keep only tokens for which ``str.isalnum()`` is true;
      4. drop English stop words;
      5. stem every remaining token with the module-level ``ps`` stemmer.

    Args:
        text (str): the raw message.

    Returns:
        str: the surviving stemmed tokens joined by single spaces
        (``""`` when nothing survives).
    """
    stop_words = _english_stopwords()
    tokens = nltk.word_tokenize(text.lower())
    # A purely alphanumeric token can never be a substring of
    # string.punctuation, so the separate punctuation test the loop used to
    # perform could not reject anything once isalnum() had passed.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(ps.stem(token) for token in kept)

In [None]:
df['transformed_text']= df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width = 500, height =500, min_font_size=10, background_color = 'white')

In [None]:
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep = " "))

In [None]:
plt.figure(figsize = (12,6))
plt.imshow(spam_wc)

In [None]:
ham_wc = wc.generate(df[df['target'] == 0]['transformed_text'].str.cat(sep = " "))

In [None]:
plt.figure(figsize = (12,6))
plt.imshow(ham_wc)

In [None]:
df.head()

In [None]:
spam_corpus = []
for msg in df[df['target'] == 1]['transformed_text'].tolist():
    for word in msg.split():
        spam_corpus.append(word)
    

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter
sns.barplot(x =0, y=1, data = pd.DataFrame(Counter(spam_corpus).most_common(30)))
plt.xticks(rotation = 'vertical') 
plt.show()

In [None]:
ham_corpus = []
for msg in df[df['target'] == 0]['transformed_text'].tolist():
    for word in msg.split():
        ham_corpus.append(word)

In [None]:
from collections import Counter
sns.barplot(x =0, y=1, data = pd.DataFrame(Counter(ham_corpus).most_common(30)))
plt.xticks(rotation = 'vertical') 
plt.show()

In [None]:
df.head()

## 4. Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features = 3000)

In [None]:
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
X.shape

In [None]:
y = df['target'].values

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state=2 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [None]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
gnb.fit(X_train, y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test, y_pred1))
print(confusion_matrix(y_test, y_pred1))
print(precision_score(y_test, y_pred1))

In [None]:
mnb.fit(X_train, y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print(precision_score(y_test, y_pred2))

In [None]:
bnb.fit(X_train, y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))
print(precision_score(y_test, y_pred3))

In [None]:
#tfidf -->MNB

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
!pip install xgboost

In [None]:
svc = SVC(kernel = 'sigmoid', gamma= 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver = 'liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state=2)
abc = AdaBoostClassifier(n_estimators =50, random_state =2)
bc = BaggingClassifier(n_estimators=50, random_state =2)
etc = ExtraTreesClassifier(n_estimators =50, random_state =2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
clfs = {
    'SVC' : svc,
    'KN' : knc, 
    'NB': mnb, 
    'DT': dtc, 
    'LR': lrc, 
    'RF': rfc, 
    'AdaBoost': abc, 
    'BgC': bc, 
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb
}

In [None]:
def train_classifier(clf,X_train, y_train,X_test, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for scoring.

    Returns:
        tuple: ``(accuracy, precision)`` as computed by ``accuracy_score`` and
        ``precision_score`` on the test-split predictions.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return accuracy_score(y_test, predicted), precision_score(y_test, predicted)
    

In [None]:
train_classifier(svc,X_train,y_train,X_test,y_test)

In [None]:

accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

In [None]:
performance_df  = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)

In [None]:
performance_df

In [None]:
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")

In [None]:

performance_df1

In [None]:
sns.catplot(x = 'Algorithm', y='value', 
               hue = 'variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()

In [None]:

# model improve
# 1. Change the max_features parameter of TfIdf

In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)

In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores}).sort_values('Precision_scaling',ascending=False)

In [None]:
new_df = performance_df.merge(temp_df,on='Algorithm')

In [None]:
new_df_scaled = new_df.merge(temp_df,on='Algorithm')

In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_num_chars':accuracy_scores,'Precision_num_chars':precision_scores}).sort_values('Precision_num_chars',ascending=False)

In [None]:
new_df_scaled.merge(temp_df,on='Algorithm')