In [23]:
# Download the SMS dataset. The remote file is tab-separated (sms.tsv) but is
# saved locally as spam.csv, so it must later be read with a tab separator.
!wget https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv -O spam.csv


--2025-06-11 17:56:42--  https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 477907 (467K) [text/plain]
Saving to: ‘spam.csv’


2025-06-11 17:56:42 (2.34 MB/s) - ‘spam.csv’ saved [477907/477907]



In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import plotly.graph_objects as go

In [25]:

class spam_detector:
    """Small stateful pipeline for a bag-of-words spam classifier.

    Each step stores its result on the instance and later steps read it back,
    so the methods are meant to be called in this order:

    1. ``xdata_preprocessing`` and ``ydata_preprocessing``
    2. ``train_test_split``
    3. ``model_train``
    4. ``model_predict``, then ``model_accuracy``, then ``model_plot_accuracy``

    ``model_user_predict`` needs steps 1 (features) and 3 only.
    Calling a step before its prerequisites raises ``AttributeError`` with a
    message naming the method that has to run first.
    """

    def __init__(self, data_path=None):
        """Optionally load a tab-separated file into ``self.data_emails``.

        Args:
            data_path: Path passed to ``pd.read_csv``. When ``None`` nothing
                is loaded. Columns are named ``Category`` and ``Message``.

        Loading is best-effort: on failure a message is printed and
        ``self.data_emails`` stays ``None``.
        """
        self.data_emails = None
        if data_path is not None:
            try:
                self.data_emails = pd.read_csv(data_path, sep='\t', names=["Category", "Message"])
                print("Dataset loaded successfully.")
            except FileNotFoundError as e:
                print(f"File not found: {e}")
            except Exception as e:
                print(f"Error loading file: {e}")

    def _require(self, attribute, producer):
        """Raise a descriptive ``AttributeError`` if a pipeline step was skipped."""
        if not hasattr(self, attribute):
            raise AttributeError(
                f"'{attribute}' is not set; call {producer}() first."
            )

    def xdata_preprocessing(self, xdata, xpreprocessing_method):
        """Fit ``xpreprocessing_method`` on ``xdata`` and return the transformed data.

        Args:
            xdata: Raw inputs handed to ``fit_transform``.
            xpreprocessing_method: Object exposing ``fit_transform`` and
                ``transform`` (the latter is used by ``model_user_predict``).

        Returns:
            The result of ``fit_transform``, also stored as
            ``self.xdata_preprocessed``.

        NOTE: the transformer is fitted on all of ``xdata`` before
        ``train_test_split`` runs, so whatever it learns also comes from rows
        that later end up in the test portion.
        """
        # Attribute name keeps its historical spelling ("preprocesing") so
        # code that reads it from outside the class keeps working.
        self.xpreprocesing_method = xpreprocessing_method
        self.xdata_preprocessed = xpreprocessing_method.fit_transform(xdata)
        return self.xdata_preprocessed

    def ydata_preprocessing(self, ydata):
        """Encode labels as integers: ``'spam'`` -> 1, anything else -> 0.

        Args:
            ydata: pandas Series of label values.

        Returns:
            Integer Series, also stored as ``self.ydata_preprocessed``.
        """
        # Vectorised comparison instead of a per-row Python lambda.
        self.ydata_preprocessed = (ydata == 'spam').astype('int64')
        return self.ydata_preprocessed

    def train_test_split(self, test_size):
        """Split the preprocessed features and labels with a fixed seed (42).

        Args:
            test_size: Forwarded to scikit-learn's ``train_test_split``.

        Returns:
            The split as returned by scikit-learn, in the order
            ``[X_train, X_test, y_train, y_test]``; also stored as
            ``self.data_split``.

        Raises:
            AttributeError: If features or labels have not been preprocessed.
        """
        self._require('xdata_preprocessed', 'xdata_preprocessing')
        self._require('ydata_preprocessed', 'ydata_preprocessing')
        # Inside a method body this name resolves to the module-level
        # scikit-learn function, not to this method.
        self.data_split = train_test_split(
            self.xdata_preprocessed, self.ydata_preprocessed,
            test_size=test_size, random_state=42
        )
        return self.data_split

    def model_train(self, model_type):
        """Fit ``model_type`` on the training portion of the split.

        Args:
            model_type: Estimator exposing ``fit`` and ``predict``.

        Returns:
            The same estimator, also stored as ``self.model_type``.

        Raises:
            AttributeError: If ``train_test_split`` has not been called.
        """
        self._require('data_split', 'train_test_split')
        self.model_type = model_type
        # Index 0 is X_train, index 2 is y_train.
        model_type.fit(self.data_split[0], self.data_split[2])
        return self.model_type

    def model_predict(self, data_to_predict):
        """Predict labels for already-transformed features.

        Args:
            data_to_predict: Features in the same representation the model
                was trained on.

        Returns:
            The predictions, also stored as ``self.model_prediction`` (which
            is what ``model_accuracy`` scores afterwards).

        Raises:
            AttributeError: If ``model_train`` has not been called.
        """
        self._require('model_type', 'model_train')
        self.model_prediction = self.model_type.predict(data_to_predict)
        return self.model_prediction

    def model_accuracy(self):
        """Score the last predictions against the held-out labels.

        The last call to ``model_predict`` must have been made on the test
        features (``data_split[1]``), because the stored predictions are
        compared against ``data_split[3]``.

        Returns:
            The accuracy, also stored as ``self.accuracy`` and printed.

        Raises:
            AttributeError: If the split or the predictions are missing.
        """
        self._require('data_split', 'train_test_split')
        self._require('model_prediction', 'model_predict')
        self.accuracy = accuracy_score(self.data_split[3], self.model_prediction)
        print("Model accuracy:", self.accuracy)
        return self.accuracy

    def model_user_predict(self, user_input):
        """Classify a single raw message and print the verdict.

        Args:
            user_input: One message as a string.

        Returns:
            ``1`` if the message was classified as spam, otherwise ``0``.

        Raises:
            AttributeError: If the transformer or the model is missing.
        """
        self._require('xpreprocesing_method', 'xdata_preprocessing')
        self._require('model_type', 'model_train')
        # transform (not fit_transform): reuse what was learned earlier.
        self.user_input_preprocessed = self.xpreprocesing_method.transform([user_input])
        self.model_user_prediction = self.model_type.predict(self.user_input_preprocessed)
        is_spam = self.model_user_prediction[0] == 1
        if is_spam:
            print(' This mail was classified as **SPAM**')
        else:
            print(' This mail is **NOT SPAM**')
        return 1 if is_spam else 0

    def model_plot_accuracy(self):
        """Show a pie chart of accuracy versus error share (in percent).

        Raises:
            AttributeError: If ``model_accuracy`` has not been called.
        """
        self._require('accuracy', 'model_accuracy')
        self.fig = go.Figure(data=[go.Pie(
            values=[self.accuracy * 100, (1 - self.accuracy) * 100],
            textposition='inside',
            marker=dict(colors=['lightgreen', '#FF9999']),
            textinfo='text',
            text=['Accuracy', 'Errors']
        )])

        # The plotted quantity is accuracy; "precision" is a different metric.
        self.fig.update_layout(title_text='Model Accuracy', title_x=0.5, width=400, height=400)
        self.fig.show()


In [26]:

# Load the tab-separated SMS file fetched by the download cell.
data = spam_detector('spam.csv')
emails = data.data_emails

# Bag-of-words features from the message text.
data.xdata_preprocessing(emails.Message, CountVectorizer())

# Encode labels: spam -> 1, ham -> 0.
data.ydata_preprocessing(emails.Category)

# Hold out 25% of the rows for evaluation.
data_split = data.train_test_split(test_size=0.25)

# Fit a multinomial Naive Bayes classifier on the training portion.
data.model_train(model_type=MultinomialNB())

# Predict on the held-out features (second element of the split),
# then report the accuracy of those predictions.
data.model_predict(data_to_predict=data_split[1])
data.model_accuracy()

# Try the classifier on a hand-written example message.
data.model_user_predict(user_input="Congratulations! You've won a free Bahamas cruise!")

# Pie chart of accuracy vs. errors.
data.model_plot_accuracy()


Dataset loaded successfully.
Model accuracy: 0.9834888729361091
 This mail was classified as **SPAM**
