In [3]:
# Import libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Read the dataset from spam.csv; the encoding is passed explicitly as latin-1.
csv_path = 'spam.csv'
data = pd.read_csv(csv_path, encoding='latin-1')

In [4]:
# Build both summaries first, then print them: the first rows of the frame
# followed by the number of occurrences of each value in column 'v1'.
first_rows = data.head()
label_counts = data['v1'].value_counts()
print(first_rows)
print(label_counts)

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
ham     4825
spam     747
Name: v1, dtype: int64


In [5]:
#Preprocessing
# Chat-style abbreviations and their expansions (only a few are covered).
# Built once at definition time instead of on every call.
_ABBREVIATIONS = {
    "u": "you",
    "r": "are",
    "ur": "your",
    "n": "and",
    "plz": "please",
}

# Matches any single character that is not an ASCII letter.
_NON_LETTER = re.compile('[^a-zA-Z]')


def preprocess_text(text, abbreviations=None):
    """Normalise one message for bag-of-words vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. replace every character that is not an ASCII letter (digits,
         punctuation, non-ASCII characters) with a space;
      3. split on whitespace and expand whole-word abbreviations;
      4. re-join the words with single spaces.

    Parameters
    ----------
    text : str
        The raw message. ``None`` and float NaN are treated as an empty
        message; any other non-string value is converted with ``str()``.
    abbreviations : dict, optional
        Mapping from abbreviation to expansion. Defaults to the small
        built-in table (``u``, ``r``, ``ur``, ``n``, ``plz``).

    Returns
    -------
    str
        The cleaned message; an empty string if no letters remain.
    """
    # A missing value would otherwise fail on .lower(); NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    mapping = _ABBREVIATIONS if abbreviations is None else abbreviations

    lowered = str(text).lower()
    letters_only = _NON_LETTER.sub(' ', lowered)

    # split() with no argument collapses the runs of spaces left behind by
    # the substitution, so the result never contains double spaces.
    return ' '.join(mapping.get(word, word) for word in letters_only.split())

# Run the cleaner over every entry of column 'v2', store the result back in
# the same column, then preview the first rows.
cleaned_messages = data['v2'].apply(preprocess_text)
data['v2'] = cleaned_messages
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  go until jurong point crazy available only in ...        NaN   
1   ham                          ok lar joking wif you oni        NaN   
2  spam  free entry in a wkly comp to win fa cup final ...        NaN   
3   ham    you dun say so early hor you c already then say        NaN   
4   ham  nah i don t think he goes to usf he lives arou...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [6]:
# Features are the cleaned text in 'v2'; targets are the labels in 'v1'.
X, y = data['v2'], data['v1']

# Hold out 20% of the rows for testing; random_state is fixed at 42.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Preview the training text that is about to be vectorized.
print(X_train.head())

# Token counts with English stop words removed. The vocabulary is fitted on
# the training text only and then reused to transform the test text.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

1978    no i m in the same boat still here at my moms ...
3989    bank of granite issues strong buy explosive pi...
3935      they are giving a second chance to rahul dengra
4078              o i played smash bros lt gt religiously
4086    private your account statement for shows un re...
Name: v2, dtype: object


In [8]:
# Train a prediction model
# MultinomialNB is created with its default arguments and fitted on the
# vectorized training text (X_train_vec) and the training labels (y_train).
model = MultinomialNB()
# fit() is the last expression of the cell, so the notebook displays its
# return value as the cell output.
model.fit(X_train_vec, y_train)

In [9]:
# Predict labels for the training matrix first, then for the test matrix.
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train_vec, X_test_vec)
)

In [10]:
# Accuracy of the test-set predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Accuracy:", accuracy)

Accuracy: 0.9829596412556054


In [11]:
# Build the classification report for the test set, then print the heading
# and the report on separate lines.
report = classification_report(y_test, y_pred_test)
print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.95      0.93      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

