In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
# NOTE(review): the commonly distributed `spam.csv` is not valid UTF-8 and needs
# `encoding='latin-1'` — confirm this copy decodes with the default encoding.
data = pd.read_csv('Dataset/spam.csv')
data.head()

In [None]:
# Discard the columns at positions 2-4 in place, keeping only the first two.
data.drop(data.columns[2:5], axis=1, inplace=True)

In [None]:
# Give the two remaining columns descriptive names (label and message body).
column_names = {'v1': 'target', 'v2': 'text'}
data.rename(columns=column_names, inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes; later cells treat 0 as ham and 1 as spam.
le = LabelEncoder()
le.fit(data['target'])
data['target'] = le.transform(data['target'])

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row; the original index labels are retained.
data = data.drop_duplicates(keep='first')

In [None]:
# Class balance of the deduplicated data.
# value_counts() orders its rows by frequency, so the slice labels are decoded from
# its index instead of being hard-coded: a fixed ['ham', 'spam'] list is only correct
# while ham happens to be the majority class.
target_counts = data['target'].value_counts()
slice_labels = le.inverse_transform(target_counts.index)
plt.pie(target_counts.values, labels=slice_labels, autopct='%1.2f%%', startangle=140)
plt.title('Distribution of Target Values')
plt.show()

In [None]:
# Length of every message in characters.
data['characters'] = [len(message) for message in data['text']]

In [None]:
import nltk

# Tokenizer models used by word_tokenize / sent_tokenize.
# NOTE(review): recent NLTK releases look up a 'punkt_tab' resource instead — confirm
# that 'punkt' alone is sufficient for the installed version.
nltk.download('punkt')

# Number of word-level tokens in every message.
data['words'] = [len(nltk.word_tokenize(message)) for message in data['text']]

In [None]:
# Number of sentences in every message according to NLTK's sentence tokenizer.
data['sentences'] = [len(nltk.sent_tokenize(message)) for message in data['text']]

In [None]:
# Preview the frame with the three length features added.
data.head()

In [None]:
# Summary statistics of the columns at positions 2-4 (the length features) for all messages.
data.iloc[:, 2:5].describe()

In [None]:
# Ham only (target == 0): summary statistics of the columns at positions 2-4.
data.loc[data['target'] == 0, data.columns[2:5]].describe()

In [None]:
# Spam only (target == 1): summary statistics of the columns at positions 2-4.
data.loc[data['target'] == 1, data.columns[2:5]].describe()

In [None]:
# Character counts: ham (target == 0) drawn first, spam (target == 1) overlaid in red.
plt.figure(figsize=(10, 6))
is_ham = data['target'] == 0
is_spam = data['target'] == 1
sns.histplot(data.loc[is_ham, 'characters'])
sns.histplot(data.loc[is_spam, 'characters'], color='red')

In [None]:
# Word counts: ham (target == 0) drawn first, spam (target == 1) overlaid in red.
plt.figure(figsize=(10, 6))
is_ham = data['target'] == 0
is_spam = data['target'] == 1
sns.histplot(data.loc[is_ham, 'words'])
sns.histplot(data.loc[is_spam, 'words'], color='red')

In [None]:
# Sentence counts: ham (target == 0) drawn first, spam (target == 1) overlaid in red.
plt.figure(figsize=(10, 6))
is_ham = data['target'] == 0
is_spam = data['target'] == 1
sns.histplot(data.loc[is_ham, 'sentences'])
sns.histplot(data.loc[is_spam, 'sentences'], color='red')

In [None]:
# Pairwise relationships between the features, coloured by class label.
sns.pairplot(data,hue='target')

In [None]:
# Correlation matrix of columns 0, 2, 3 and 4 (column 1, the raw text, is skipped).
selected_columns = data.iloc[:, [0, 2, 3, 4]]
sns.heatmap(selected_columns.corr(), annot=True)

In [None]:
# Resources for text normalization: the NLTK stop-word corpus and a Porter stemmer.
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from nltk.stem.porter import PorterStemmer
# Shared stemmer instance used by convert().
ps = PorterStemmer()

# English stop words, materialised once as a set. Calling stopwords.words('english')
# inside the token loop rebuilt the whole list for every token of every message.
_STOP_WORDS = frozenset(stopwords.words('english'))


def convert(text):
    """Normalize one message for vectorization.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that are
    not purely alphanumeric, or that are English stop words, are discarded; the
    remaining tokens are stemmed with the shared Porter stemmer ``ps``.

    Args:
        text: Raw message string.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    tokens = nltk.word_tokenize(text.lower())
    # The original condition ended in `and string.punctuation`, which is an
    # always-truthy string rather than a membership test. It is dropped here:
    # str.isalnum() already rejects punctuation tokens, so results are unchanged.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in _STOP_WORDS
    )

In [None]:
# Preprocessed version of every message, produced by convert().
data['converted_text'] = [convert(message) for message in data['text']]

In [None]:
# Preview the frame including the new converted_text column.
data.head()

In [None]:
from wordcloud import WordCloud

# One generator shared by the spam and ham word clouds below.
wc = WordCloud(
    width=500,
    height=500,
    min_font_size=10,
    max_font_size=200,
    colormap='viridis',
)

In [None]:
# Word cloud built from all preprocessed spam messages (target == 1).
spam_text = " ".join(data.loc[data['target'] == 1, 'converted_text'])
spam_msg = wc.generate(spam_text)
plt.figure(figsize=(8, 8))
plt.imshow(spam_msg, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from all preprocessed ham messages (target == 0).
ham_text = " ".join(data.loc[data['target'] == 0, 'converted_text'])
ham_msg = wc.generate(ham_text)
plt.figure(figsize=(8, 8))
plt.imshow(ham_msg, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Flat list of every token appearing in the preprocessed spam messages.
spam_corpus = [
    word
    for msg in data.loc[data['target'] == 1, 'converted_text'].tolist()
    for word in msg.split()
]

In [None]:
from collections import Counter

# Thirty most frequent tokens across the spam corpus, shown as a bar chart.
spam_counter = Counter(spam_corpus)
top_spam_words = spam_counter.most_common(30)
spam_data = pd.DataFrame(top_spam_words, columns=['word', 'count'])

sns.barplot(
    data=spam_data,
    x='word',
    y='count',
    hue='word',
    palette='viridis',
    legend=False,
)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Flat list of every token appearing in the preprocessed ham messages.
ham_corpus = [
    word
    for msg in data.loc[data['target'] == 0, 'converted_text'].tolist()
    for word in msg.split()
]

In [None]:
from collections import Counter

# Thirty most frequent tokens across the ham corpus, shown as a bar chart.
ham_counter = Counter(ham_corpus)
top_ham_words = ham_counter.most_common(30)
ham_data = pd.DataFrame(top_ham_words, columns=['word', 'count'])

sns.barplot(
    data=ham_data,
    x='word',
    y='count',
    hue='word',
    palette='viridis',
    legend=False,
)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Two text vectorizers with default settings: raw token counts and TF-IDF weights.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfv = TfidfVectorizer()

In [None]:
# Feature matrices from the preprocessed text, densified with .toarray():
# X1 holds CountVectorizer counts, X2 holds TfidfVectorizer weights.
# Y1 and Y2 are both the encoded target column.
text_column = data['converted_text']
label_column = data['target']

X1 = cv.fit_transform(text_column).toarray()
Y1 = label_column.values

X2 = tfv.fit_transform(text_column).toarray()
Y2 = label_column.values

In [None]:
from sklearn.model_selection import train_test_split

# Identical 80/20 split settings and seed for both feature matrices.
split_options = {'test_size': 0.2, 'random_state': 2}

X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, **split_options)

X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, **split_options)

In [None]:
# The three Naive Bayes variants compared below, plus the metrics used to score them.
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
# Default-configured models; each is fitted on both feature sets in the next two cells.
gnb = GaussianNB()
bnb = BernoulliNB()
mnb = MultinomialNB()

In [None]:
# Fit and score each Naive Bayes model on the CountVectorizer split.
# Headers and labels are carried per model so the printed text is unchanged.
count_predictions = []
for header, matrix_label, model in (
    ("GaussianNB:", "confusion_matrix:\n", gnb),
    ("\nBernoulliNB:", "confusion_matrix :\n", bnb),
    ("\nMultinomialNB:", "confusion_matrix :\n", mnb),
):
    print(header)
    model.fit(X_train1, Y_train1)
    predicted = model.predict(X_test1)
    print("accuracy_score :", accuracy_score(Y_test1, predicted))
    print(matrix_label, confusion_matrix(Y_test1, predicted))
    print("precision_score :", precision_score(Y_test1, predicted))
    count_predictions.append(predicted)

# Keep the per-model prediction names available to later cells.
y_pred1, y_pred2, y_pred3 = count_predictions

In [None]:
# Refit and score the same three Naive Bayes models on the TfidfVectorizer split.
tfidf_predictions = []
for header, model in (
    ("GaussianNB:", gnb),
    ("\nBernoulliNB:", bnb),
    ("\nMultinomialNB:", mnb),
):
    print(header)
    model.fit(X_train2, Y_train2)
    predicted = model.predict(X_test2)
    print("accuracy_score :", accuracy_score(Y_test2, predicted))
    print("confusion_matrix :\n", confusion_matrix(Y_test2, predicted))
    print("precision_score :", precision_score(Y_test2, predicted))
    tfidf_predictions.append(predicted)

# Keep the per-model prediction names available to later cells.
y_pred1, y_pred2, y_pred3 = tfidf_predictions

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# One instance of every candidate classifier for the benchmark below.
# The ensemble models all use n_estimators=50 and random_state=2.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
# Rebinds `mnb` to a fresh, unfitted MultinomialNB (the earlier instance was already fitted).
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
# Display name -> classifier instance, in the order the benchmark will run them.
clf_names = ['SVC', 'KN', 'NB', 'DT', 'LR', 'RF', 'AdaBoost', 'BgC', 'ETC', 'GBDT', 'xgb']
clf_models = [svc, knc, mnb, dtc, lrc, rfc, abc, bc, etc, gbdt, xgb]
clfs = dict(zip(clf_names, clf_models))

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and labels passed to ``fit``.
        X_test, y_test: Held-out features and labels used for scoring.

    Returns:
        ``(accuracy, precision)`` as computed by ``accuracy_score`` and
        ``precision_score`` on the test labels versus the predictions.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
    )

In [None]:
# Benchmark every classifier in `clfs` on the TF-IDF split (X_train2 / X_test2).
# The call previously passed X_train, y_train, X_test and y_test, names that are
# never defined in this notebook, so the cell raised NameError on a fresh kernel.
# Swap in X_train1 / Y_train1 / X_test1 / Y_test1 to benchmark on raw counts instead.
accuracy_scores = []
precision_scores = []

for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(
        clf, X_train2, Y_train2, X_test2, Y_test2
    )

    print("For ", name)
    print("Accuracy - ", current_accuracy)
    print("Precision - ", current_precision)

    # Appended in clfs order so the lists line up with clfs.keys() later on.
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

In [None]:
# One row per classifier, best precision first.
score_columns = {
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
}
performance_df = pd.DataFrame(score_columns).sort_values('Precision', ascending=False)

In [None]:
# Display the benchmark table.
performance_df

In [None]:
# Long format: one row per (Algorithm, metric) pair.
performance_df1 = pd.melt(frame=performance_df, id_vars=["Algorithm"])
performance_df1
