In [2]:

""""
Spam Prediction Using Machine Learning

"""



'"\nSpam Prediction Using Machine Learning\n\n'

Importing The Dependencies

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import tkinter as tk
from tkinter import messagebox

# For Visualization
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from wordcloud import WordCloud


# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier
from lightgbm.sklearn import LGBMClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.svm import SVC

#Model Evaluation and Scoring
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, RocCurveDisplay

Data Preprocessing

In [4]:
# Loading the data from csv format to panadas Dataframe
Mail_data_raw = pd.read_csv('C:\BRA FAMOUS\PROGRAMMING\PYTHON\MACHINE LEARNING PROJECTS\PROJECTS\mail_data.csv')


# Replacing Null values with a null string
Mail_data = Mail_data_raw.where((pd.notnull(Mail_data_raw)),'')



Exploratory Data Analysis (EDA) on the Dataset

In [5]:
category_ct = Mail_data_raw['Category'].value_counts()
fig = px.pie(values = category_ct.values,
             names = category_ct.index,
             color_discrete_sequence = px.colors.sequential.OrRd,
             title = 'Pie Graph: Spam of not'

)

fig.update_traces(hoverinfo='label+percent', textinfo='label+value+percent', textfont_size=15,
                  marker=dict(line=dict(color='#000000', width=2)))
fig.show()

Length Distribution of Spam and Ham Message

In [6]:
categories = pd.get_dummies(Mail_data_raw["Category"])
spam_or_ham = pd.concat([Mail_data_raw, categories], axis=1)
spam_or_ham.drop('Category',axis=1,inplace=True)

Mail_data_raw["length"] = Mail_data_raw["Message"].apply(len)

ham = Mail_data_raw.loc[np.where(spam_or_ham['ham'] == 1)].reset_index()
spam = Mail_data_raw.loc[np.where(spam_or_ham['ham'] == 0)].reset_index()

ham.drop('index',axis=1,inplace=True)
spam.drop('index',axis=1,inplace=True)


hist_data = [ham['length'],spam['length']]

group_labels = ['ham','spam']

colors = ['black', 'red']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, colors=colors)

# Add title
fig.update_layout(title_text='Length distribution of ham and spam messages',
                 template = 'simple_white')
fig.show()

Label Encoding

In [7]:
# Label Encoding
Mail_data.loc[Mail_data['Category'] == 'spam',  'Category',] = 0
Mail_data.loc[Mail_data['Category'] == 'ham',  'Category',] = 1
Mail_data.head()

X_data = Mail_data['Message']
y_data = Mail_data['Category']

# Splitting The Data
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=3)


Feature Extraction

In [8]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words = 'english', lowercase= True)

X_train_feat =feature_extraction.fit_transform(X_train)
X_test_feat =feature_extraction.transform(X_test)

# Making the prediction label integers
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [9]:
Model_LR = LogisticRegression()
Model_LR.fit(X_train_feat, y_train)


Training with different Models

In [10]:
#Logistric Regression
Model_LR = LogisticRegression()
Model_RFC = RandomForestClassifier(random_state = 8888)
Model_LGBM = LGBMClassifier(boosting_type='gbdt', objective='binary', random_state=8888)
Model_Catboost = CatBoostClassifier(random_state=8888, logging_level='Silent')
Model_xgbr = xgb.XGBClassifier(objective='binary:hinge',random_state=8888)
Model_SVC = SVC(probability=True,random_state=8888)


#Fitting the models
#Model.fit(X_train_feat, y_train)

Model_LR.fit(X_train_feat, y_train)
Model_RFC.fit(X_train_feat, y_train)
Model_LGBM.fit(X_train_feat, y_train)
Model_Catboost.fit(X_train_feat, y_train, verbose=0)
Model_xgbr.fit(X_train_feat, y_train)
Model_SVC.fit(X_train_feat, y_train)

Classifiers = []
Classifiers.append(Model_LR)
Classifiers.append(Model_RFC)
Classifiers.append(Model_LGBM)
Classifiers.append(Model_Catboost)
Classifiers.append(Model_xgbr)
Classifiers.append(Model_SVC)

model_name = ['Logistric Regression', 'Random Forest', 'LGBMClassifier', 'CatBoostClassifier', 'XGBClassifier', 'SVC']

# Model Evaluation
# Prediction on the Training data

Prediction_model = Model_SVC.predict(X_train_feat)
Accuracy_training_data = accuracy_score(y_train, Prediction_model)

[LightGBM] [Info] Number of positive: 3865, number of negative: 592
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5590
[LightGBM] [Info] Number of data points in the train set: 4457, number of used features: 320
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.867175 -> initscore=1.876210
[LightGBM] [Info] Start training from score 1.876210


MODEL EVALUATION

In [11]:
accuracy_list = []
auc_list=[]        #Area under the curve
recall_list = []
f1_list = []

for classifier in Classifiers :
    y_pred=classifier.predict(X_test_feat)
    y_pred_proba=classifier.predict_proba(X_test_feat)[:,1]
    accuracy_list.append(accuracy_score(y_test,y_pred))
    auc_list.append(roc_auc_score(y_test, y_pred_proba))
    recall_list.append(recall_score(y_test, y_pred))
    f1_list.append(f1_score(y_test, y_pred))

In [12]:
def plot_model_score(model_name, accuracy_list, auc_list, recall_list, f1_list, title):

    fig = go.Figure(data=[
        go.Bar(name='Accuracy', x=model_name, y=np.round(accuracy_list,3), text=np.round(accuracy_list,3), marker_color='#97bad9'),    
        go.Bar(name='AUC',x=model_name, y=np.round(auc_list,3), text=np.round(auc_list,3), marker_color= '#bcd6ef'), 
        go.Bar(name='Recall',   x=model_name, y=np.round(recall_list,3), text=np.round(recall_list,3), marker_color='#ebcad9'),
        go.Bar(name='F1',       x=model_name, y=np.round(f1_list,3), text=np.round(f1_list,3), marker_color='#d5a6bd')
    ])
    
    fig.update_layout(template = 'simple_white', title = title)
    fig.update_layout(xaxis_title="Models", yaxis_title="Score", font = dict(size=17, family = 'Franklin Gothic'))
    fig.update_layout(yaxis_range=[0.7,1])
        
    fig.show()
    
plot_model_score(model_name, accuracy_list, auc_list, recall_list, f1_list,
                 'Accuracy, AUC, Recall Score & F1 Score')

In [13]:
#Prediction on Test Data
Prediction_model_test = Model_RFC.predict(X_test_feat)
Accuracy_training_data = accuracy_score(y_test, Prediction_model_test)


BUILIDNG A PREDICTIVE SYSTEM

In [14]:
user_mail = ['Text & meet someone sexy today. U can find a date or even flirt its up to U. Join 4 just 10p. REPLY with NAME & AGE eg Sam 25. 18 -msg recd@thirtyeight pence']

#Vectorize input
input_mail_feat = feature_extraction.transform(user_mail)

#Making the prediction
prediction2 = Model_LR.predict(input_mail_feat)
print(prediction2)

if prediction2[0]==1:
    print('Ham Mail')
    
else:
 print('Spam Mail')   


[0]
Spam Mail


In [15]:
def classify_email():
    user_mail1 = email_entry.get('1.0', 'end-1c')
    input_mail= feature_extraction.transform(user_mail1)

    #Making the prediction
    prediction2 = Model_LR.predict(input_mail_feat)
    if prediction2[0]==1:
        print('Ham Mail')
    else:
        print('Spam Mail') 
 
window = tk.Tk()
window.title("Email Classifier")

# Create the email input box
email_label = tk.Label(window, text="Enter the email text:")
email_label.pack()
email_entry = tk.Text(window, height=10, width=50)
email_entry.pack()

# Create the classify button
classify_button = tk.Button(window, text="Classify", command=classify_email)
classify_button.pack()

In [16]:
'''import tkinter as tk
from tkinter import messagebox

# Function to handle the user input and classify the email
def classify_email():
    email_text = email_entry.get("1.0", "end-1c")  # Get the email text from the input box
    
    # Preprocess the email text (apply necessary cleaning techniques)
    # Vectorize the preprocessed text
    
    # Pass the vectorized input to your trained model for prediction
    # prediction = model.predict(vectorized_input)
    
    # Display the prediction to the user
    messagebox.showinfo("Email Classification Result", f"The email is classified as: {prediction}")

# Create the main window
window = tk.Tk()
window.title("Email Classifier")

# Create the email input box
email_label = tk.Label(window, text="Enter the email text:")
email_label.pack()
email_entry = tk.Text(window, height=10, width=50)
email_entry.pack()

# Create the classify button
classify_button = tk.Button(window, text="Classify", command=classify_email)
classify_button.pack()

# Run the main event loop
window.mainloop()

'''

'import tkinter as tk\nfrom tkinter import messagebox\n\n# Function to handle the user input and classify the email\ndef classify_email():\n    email_text = email_entry.get("1.0", "end-1c")  # Get the email text from the input box\n    \n    # Preprocess the email text (apply necessary cleaning techniques)\n    # Vectorize the preprocessed text\n    \n    # Pass the vectorized input to your trained model for prediction\n    # prediction = model.predict(vectorized_input)\n    \n    # Display the prediction to the user\n    messagebox.showinfo("Email Classification Result", f"The email is classified as: {prediction}")\n\n# Create the main window\nwindow = tk.Tk()\nwindow.title("Email Classifier")\n\n# Create the email input box\nemail_label = tk.Label(window, text="Enter the email text:")\nemail_label.pack()\nemail_entry = tk.Text(window, height=10, width=50)\nemail_entry.pack()\n\n# Create the classify button\nclassify_button = tk.Button(window, text="Classify", command=classify_email)\

In [17]:
import tkinter as tk

def get_sentence():
    user_sentence = entry.get()
    print("You entered:", user_sentence)

# Create the main application window
root = tk.Tk()
root.title("Sentence Input")

# Create a label
label = tk.Label(root, text="Enter a sentence:")
label.pack()

# Create an entry widget to get the sentence
entry = tk.Entry(root)
entry.pack()

# Create a button to submit the sentence
submit_button = tk.Button(root, text="Submit", command=get_sentence)
submit_button.pack()

# Start the main event loop
root.mainloop()


Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Program Files\Python311\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\linco\AppData\Local\Temp\ipykernel_5684\624201454.py", line 3, in classify_email
    input_mail= feature_extraction.transform(user_mail1)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\linco\AppData\Roaming\Python\Python311\site-packages\sklearn\feature_extraction\text.py", line 2157, in transform
    X = super().transform(raw_documents)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\linco\AppData\Roaming\Python\Python311\site-packages\sklearn\feature_extraction\text.py", line 1427, in transform
    raise ValueError(
ValueError: Iterable over raw text documents expected, string object received.
Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Program Files\Python311\Lib\tkinter\__init__.py", line 1948, i