Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

#Train and Test Splitting
from sklearn.model_selection import train_test_split

#Models
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import tree, svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

#Cross Validation
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.datasets import load_iris

#Score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,ConfusionMatrixDisplay

%matplotlib inline

: 

Import Data

In [None]:
# Load the spam/ham e-mail dataset into a DataFrame. The path is relative to the
# notebook's working directory, so the CSV must exist under ./content/.
data = pd.read_csv("./content/spam_ham_dataset.csv")


: 

Basic Analysis

In [None]:
# Preview the first rows to see which columns the CSV provides.
data.head()

: 

In [None]:
# Preview the last rows as a sanity check that the file was read to the end.
data.tail()

: 

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

: 

In [None]:
# Summary statistics of the raw frame.
data.describe()

: 

In [None]:
# True when no row is an exact duplicate of an earlier row.
not data.duplicated().any()

: 

In [None]:
# Keep only the first occurrence of each fully identical row.
data = data.drop_duplicates()

: 

In [None]:
# Number of missing values in each column.
missing_per_column = data.isna().sum()
print(missing_per_column)

: 

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna()

: 

In [None]:
# Look at a larger sample (first 20 rows) after duplicate/NaN removal.
data.head(20)

: 

In [None]:
# Rows whose numeric label is 1 (the class the prediction cell reports as "spam").
data.loc[data["label_num"].eq(1)]

: 

In [None]:
# Remove the "label" column; "label_num" is the column used as the target later on.
data = data.drop(columns="label")

: 

In [None]:
# Re-check the schema now that the "label" column has been dropped.
data.info()

: 

In [None]:
# Preview the cleaned frame.
data.head()

: 

In [None]:

# Temporary helper column: number of characters in each e-mail body.
text_lengths = data["text"].str.len()
data = data.assign(**{"size text": text_lengths})

# Show the rows whose numeric label is 0, including the new length column.
data.loc[data["label_num"].eq(0)]

: 

In [None]:
# Column-wise minimum within each label class (includes the shortest text length).
data.groupby("label_num").min()

: 

In [None]:
# Column-wise maximum within each label class (includes the longest text length).
data.groupby("label_num").max()

: 

In [None]:
# Rows whose e-mail text repeats an earlier row's text (the first occurrence is not flagged).
data.loc[data["text"].duplicated(keep="first")]

: 

In [None]:
# The length column was only needed for exploration; remove it before modelling.
data = data.drop(columns="size text")

: 

In [None]:
# Final schema check before the train/test split.
data.info()

: 

Train Test Split

In [None]:
# Features are the raw e-mail bodies; the target is the numeric label column.
X, Y = data["text"], data["label_num"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

: 

Feature Extraction

In [None]:
# Learn the TF-IDF vocabulary on the training split only, then reuse that fitted
# vocabulary for the test split so no test-set statistics leak into the features.
featureExtraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_train_features = featureExtraction.fit_transform(X_train)
X_test_features = featureExtraction.transform(X_test)

# Persist the fitted vectorizer. The context manager closes the file handle as
# soon as the dump finishes instead of leaving it open until garbage collection.
with open("featureExtractionVectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(featureExtraction, vectorizer_file)

# Display the resulting training feature matrix.
X_train_features

: 

Models

In [None]:
# Candidate classifiers as (display name, unfitted estimator) pairs. The
# estimators that accept a seed are given a fixed random_state so that repeated
# runs of the notebook produce the same scores.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('DecisionTreeClassifier', tree.DecisionTreeClassifier(random_state=42)),
    ('SVC', svm.SVC(kernel='linear')),
    ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=5)),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('MultinomialNB', MultinomialNB())
]

# Filled in by the evaluation cell: model name -> [test accuracy, train accuracy].
# (A `global` statement is a no-op at module level, so none is needed here.)
accuracies = {}

: 

Classification Report and Confusion Matrix

In [None]:
# Fit every candidate on the training features, then report its test-set
# classification report and confusion matrix. Accuracies are stored as
# [test accuracy, train accuracy] so over-fitting can be spotted by comparing the two.
for name, model in models:
    model.fit(X_train_features, y_train)
    y_pred = model.predict(X_test_features)
    # Optional: persist each fitted model, e.g.
    #   with open(name + '.pkl', 'wb') as model_file: pickle.dump(model, model_file)
    accuracy = accuracy_score(y_test, y_pred)
    y_pred2 = model.predict(X_train_features)
    accuracy2 = accuracy_score(y_train, y_pred2)
    accuracies[name] = [accuracy, accuracy2]

    print('-----------------------------------------------------')
    print(f"{name} Classification Report:\n")
    print(classification_report(y_test, y_pred))

    # Title each matrix with the model it belongs to, set on the display's own
    # axes rather than through the implicit "current axes" of pyplot.
    cm_display = ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred)).plot()
    cm_display.ax_.set_title(f"Confusion Matrix for {name}")
    # Render now so the figure appears directly below this model's report.
    plt.show()

# Save the accuracy table; the context manager guarantees the file is closed.
with open("accuracies.pkl", "wb") as accuracies_file:
    pickle.dump(accuracies, accuracies_file)


: 

K-Fold Cross-Validation

In [None]:
# Shuffle before splitting so the folds do not depend on the row order of the
# CSV, and fix the seed so the reported scores are reproducible (matching the
# seeded, shuffled hold-out split used earlier).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models:
    # Wrapping the vectorizer and the classifier in one pipeline means TF-IDF is
    # re-fitted on the training part of every fold, so validation text never
    # influences the vocabulary or IDF weights.
    pipe = Pipeline([
        ('feature_extraction', TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)),
        ('model', model)
    ])
    scores = cross_val_score(pipe, X, Y, cv=kf)
    accuracy = np.mean(scores)
    print(f"{name} Accuracy: {accuracy*100:.2f}%")

: 

Prediction Using User Input

In [None]:
# Read one e-mail from the user and vectorise it with the already-fitted
# TF-IDF vocabulary (transform only, no re-fitting).
x = input("Enter an email: ")
x_input = [x]
x_features = featureExtraction.transform(x_input)

# Each fitted model classifies the message independently: label 0 is reported
# as ham, anything else as spam.
for name, model in models:
    y_pred = model.predict(x_features)

    if not (y_pred == 0):
        print(f'{name} Predict : spam')
    else:
        print(f'{name} Predict : ham')

: 