In [6]:
import argparse
import io
import os

import chardet as cd
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [7]:
# Load the raw SMS dataset: download the file once, detect its text encoding
# from the downloaded bytes, and parse those same bytes into a dataframe.

# GitHub URL of the dataset raw file
url = "https://github.com/BSantos04/SMS-Spam-Detection/raw/refs/heads/main/datasets/spam.csv"

# Fetch the file a single time. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() turns an HTTP error response
# into an exception instead of letting an error page be parsed as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
rawdata = response.content

# Detect the type of encoding of the dataset from the raw bytes
result = cd.detect(rawdata)
encoding = result["encoding"]

# Convert the downloaded bytes into a Pandas dataframe using the detected encoding.
# Parsing the in-memory bytes (rather than passing the URL again) avoids a second
# download and guarantees the encoding was detected on exactly the data being parsed.
df = pd.read_csv(io.BytesIO(rawdata), encoding=encoding)

# Display the first 5 rows of the raw dataframe
print("-"*211)
print("Give a glimpse of the raw dataset:\n")
print(df.head().to_string(index=False))

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Give a glimpse of the raw dataset:

  v1                                                                                                                                                          v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
 ham                                             Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...        NaN        NaN        NaN
 ham                                                                                                                               Ok lar... Joking wif u oni...        NaN        NaN        NaN
spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's        Na

In [8]:
# Pre-process the raw dataset: keep the two informative columns, drop duplicate
# and incomplete rows, normalise the text and encode the label as an integer.
# The result is stored back in `df`, which is what the following cell reads.

# The raw columns are required. They are no longer present once this cell has
# already replaced `df`, so fail with an actionable message in that case.
missing_cols = [col for col in ("v1", "v2") if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Raw column(s) {missing_cols} not found in `df`; "
        "re-run the loading cell before running the pre-processing again."
    )

# Ordinal codification for the 'SPAM/HAM' column
ord_spam = {"spam": 0, "ham": 1}

# NOTE(review): duplicates are dropped before lowercasing/stripping, so messages
# that differ only by case or surrounding whitespace remain distinct rows.
# The original order of the steps is kept here; confirm whether that is intended.
cleaned = (
    # Restrict the dataset to the first 2 columns; the others are filler columns
    df.loc[:, ["v1", "v2"]]
    # Remove duplicate rows
    .drop_duplicates()
    # A row needs data in both columns, so remove any row with NaN values
    .dropna(axis=0, how="any")
    # Rename the columns for more intuitive work from now on
    .rename(columns={"v1": "SPAM/HAM", "v2": "SMS"})
    # Remove surrounding whitespace and lowercase the content of both columns
    .assign(
        **{
            "SPAM/HAM": lambda frame: frame["SPAM/HAM"].str.lower().str.strip(),
            "SMS": lambda frame: frame["SMS"].str.lower().str.strip(),
        }
    )
)

# Any label missing from `ord_spam` would be turned into NaN by `.map`, after the
# NaN rows have already been dropped. Reject such labels explicitly instead.
unknown_labels = (
    cleaned.loc[~cleaned["SPAM/HAM"].isin(list(ord_spam)), "SPAM/HAM"].unique().tolist()
)
if unknown_labels:
    raise ValueError(
        f"Unexpected label(s) in column 'v1': {unknown_labels}; "
        f"expected one of {list(ord_spam)}."
    )

# Replace the text labels by their integer codes
df = cleaned.assign(**{"SPAM/HAM": cleaned["SPAM/HAM"].map(ord_spam)})

# Display the first 5 rows of the now pre-processed dataset
print("-"*211)
print("Give a glimpse of the pre-processed dataset:\n")
print(df.head().to_string(index=False))

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Give a glimpse of the pre-processed dataset:

 SPAM/HAM                                                                                                                                                         SMS
        1                                             go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...
        1                                                                                                                               ok lar... joking wif u oni...
        0 free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's
        1                                                                     

In [None]:
try:
    # Train a TF-IDF + Logistic Regression classifier on the pre-processed
    # messages, evaluate it on a held-out split and classify three sample messages.

    # Independent variable (X): the message text. Dependent variable (y): the encoded label.
    X = df["SMS"]
    y = df["SPAM/HAM"]

    # Hold out 30% of the rows for testing (70% for training), stratified on the
    # label, with a fixed random state of 42
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
        stratify=y,
    )

    # Build a scikit-learn pipeline: vectorize the text, then apply Logistic Regression
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, stop_words=None, min_df=2)
    classifier = LogisticRegression(class_weight="balanced", max_iter=1000)
    pipeline = make_pipeline(vectorizer, classifier)
    pipeline.fit(X_train, y_train)

    # Predicted labels for the test set, plus the second column of the predicted probabilities
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # Evaluate with a confusion matrix, a classification report and the AUC-ROC score
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    auc_roc_score = roc_auc_score(y_test, y_proba)

    # Display the results of the evaluations
    print("-"*211)
    print("Confusion Matrix:\n", conf_matrix)
    print("\n\nClassification Report:\n", class_report)
    print(f"\n\nAUC-ROC Score: {auc_roc_score}")

    # Sample messages to classify as SPAM or HAM
    sms_one = "Hey johnny, wassup!?"
    sms_two = "WINNER!! This is the secret code to get a brand new iPhone: IPHONE4U."
    sms_three = "It's a me, Mario"

    # Each message is stripped and lowercased before being passed to the pipeline
    pred_one, pred_two, pred_three = [
        pipeline.predict([message.strip().lower()])
        for message in (sms_one, sms_two, sms_three)
    ]

    # Display the results of the predictions (label 0 is reported as SPAM, anything else as HAM)
    for message, prediction in (
        (sms_one, pred_one),
        (sms_two, pred_two),
        (sms_three, pred_three),
    ):
        verdict = "SPAM" if prediction == 0 else "HAM"
        print("-"*211)
        print(f"The message: {message}")
        print(f"\nThe verdict: {verdict}!!!")
    print("-"*211)
except Exception as e:
    raise e

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Confusion Matrix:
 [[ 207   13]
 [  19 1312]]


Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       220
           1       0.99      0.99      0.99      1331

    accuracy                           0.98      1551
   macro avg       0.95      0.96      0.96      1551
weighted avg       0.98      0.98      0.98      1551



AUC-ROC Score: 0.9946349293081074
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
The message: Hey johnny, wassup!?

The verdict: HAM!!!
--------------------------------------------------------------------------------------