Building a Model that detects the messages a user enters as spam or not spam

Import necessary libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

Load dataset 

In [34]:
df = pd.read_csv(r"C:\Users\HP\Desktop\python\ML\personal learn\logistic regression\spam_ham_log_reg\spam.csv", encoding='latin1')

Select needed columns

In [35]:
df = df.iloc[:, :2]

Rename the columns

In [36]:
df.columns = ['label', 'message']
df.head(3)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [37]:
# convert spam to ham
df['label'] = df['label'].map({'spam':1, 'ham': 0})

In [38]:
df.head(3)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


Check info for null values

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   label            5572 non-null   int64 
 1   message          5572 non-null   object
 2   cleaned message  5572 non-null   object
dtypes: int64(1), object(2)
memory usage: 130.7+ KB


Function for preprocessing the data to remove noise (common words, punctuations, and numbers)

In [39]:
def preprocess_message(text):
  text = text.lower() # converts text to lowercase
  text = re.sub(r'\d+', '', text)  # remove numbers
  text = text.translate(str.maketrans('', '', string.punctuation))  # removes punctuation
  words = word_tokenize(text)  # splits text into words
  words = [word for word in words if word not in stopwords.words('english')]
  return " ".join(words)

df['cleaned message'] = df['message'].apply(preprocess_message)

print(df.head())


   label                                            message  \
0      0  Go until jurong point, crazy.. Available only ...   
1      0                      Ok lar... Joking wif u oni...   
2      1  Free entry in 2 a wkly comp to win FA Cup fina...   
3      0  U dun say so early hor... U c already then say...   
4      0  Nah I don't think he goes to usf, he lives aro...   

                                     cleaned message  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts st ...  
3                u dun say early hor u c already say  
4        nah dont think goes usf lives around though  


Import libraries for scaling,training, testing, and performance checking

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [44]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned message'], df['label'], test_size=0.25, random_state = 42)

Scale X values to fit TfidfVectorizer() that converts letters to numbers

In [45]:
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

Train model

In [46]:
model = LogisticRegression()

model.fit(X_train_tfidf, y_train)

Test Model and accuracy

In [47]:
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9468772433596554


Function to predict spam or not spam for user input

In [66]:
def predict_spam(message):
  text = preprocess_message(message)
  text_df = [text]
  text_tfidf = vectorizer.transform(text_df)
  prediction = model.predict(text_tfidf)

  if prediction == 1:
    result = "The message is 0.95% Spam"
  elif prediction == 0:
    result = "The message is 0.95% not spam"

  print(result)

Function for letting the user input messages

In [None]:
while True:
  user_input = input("\nEnter an SMS message (or type 'exit' to quit): ")

  if user_input.lower() == "exit":
    print("Exiting the program. Have a nice day!")
    break

  predict_spam(user_input)

The message is 0.95% Spam
The message is 0.95% not spam
Exiting the program.
