In [34]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string

In [35]:
# Load the dataset
def load_data(filepath):
    """Read the dataset CSV into a DataFrame and print a short preview.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file. The file is decoded as ISO-8859-1.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(filepath, encoding='ISO-8859-1')
    preview = frame.head()
    print("Dataset loaded successfully.")
    print(preview)
    return frame

In [36]:
# Preprocess the text data
def preprocess_text(text):
    """Normalize a raw message into lowercase text without URLs, punctuation or digits.

    Parameters
    ----------
    text : str or None or float
        The raw message. ``None`` and float ``NaN`` (what pandas stores for
        an empty cell) are treated as an empty message. Any other non-string
        value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text, with runs of whitespace collapsed to single spaces
        and no leading/trailing whitespace.
    """
    # Missing values have no text to clean. Without this guard, `.lower()`
    # raises AttributeError on None / NaN. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace: deleting URLs and digits leaves gaps inside the
    # text, so collapse every whitespace run rather than only trimming the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [37]:
# Prepare data for modeling
def prepare_data(df, text_col='message', label_col='class', test_size=0.2, random_state=42):
    """Clean the text column and split the data into train and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. NOTE: a ``cleaned_message`` column is added to this
        frame in place.
    text_col : str, default 'message'
        Name of the column holding the raw text.
    label_col : str, default 'class'
        Name of the column holding the target labels.
    test_size : float, default 0.2
        Passed through to ``train_test_split``.
    random_state : int, default 42
        Passed through to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the X values are the
        cleaned text and the y values are the labels. The split is
        stratified on the labels.

    Raises
    ------
    KeyError
        If ``text_col`` or ``label_col`` is not a column of ``df``.
    """
    # Fail early with a message that names the missing columns, instead of
    # the bare KeyError raised by the column lookup.
    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        raise KeyError(
            f"Missing required column(s) {missing}; available columns: {list(df.columns)}"
        )

    # Apply preprocessing to the text column
    df['cleaned_message'] = df[text_col].apply(preprocess_text)

    # Split the data into training and testing sets
    X = df['cleaned_message']
    y = df[label_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    return X_train, X_test, y_train, y_test

In [38]:
# Feature extraction using TfidfVectorizer
def extract_features(X_train, X_test):
    """Turn the train and test text into TF-IDF feature matrices.

    The vectorizer is fitted on the training text only; the test text is
    transformed with that same fitted vectorizer.

    Returns
    -------
    tuple
        ``(X_train_tfidf, X_test_tfidf, vectorizer)``.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    train_matrix = tfidf.fit_transform(X_train)
    test_matrix = tfidf.transform(X_test)
    print("Feature extraction complete.")
    return train_matrix, test_matrix, tfidf

In [39]:
# Train a Naive Bayes classifier
def train_model(X_train_tfidf, y_train):
    """Fit a multinomial Naive Bayes classifier and return it.

    Parameters
    ----------
    X_train_tfidf
        Training feature matrix.
    y_train
        Training labels aligned with the rows of ``X_train_tfidf``.
    """
    # `fit` returns the estimator itself, so construction and fitting chain.
    classifier = MultinomialNB().fit(X_train_tfidf, y_train)
    print("Model training complete.")
    return classifier

In [40]:
# Evaluate the model
def evaluate_model(model, X_test_tfidf, y_test):
    """Predict on the test features and print the evaluation metrics.

    Prints, in order, the accuracy, the classification report and the
    confusion matrix. Returns nothing.
    """
    predictions = model.predict(X_test_tfidf)
    # Each entry pairs the printed heading with the metric that follows it.
    reports = (
        ("Accuracy:", accuracy_score),
        ("\nClassification Report:\n", classification_report),
        ("\nConfusion Matrix:\n", confusion_matrix),
    )
    for heading, metric in reports:
        print(heading, metric(y_test, predictions))

In [41]:
def main(filepath="C:\\Users\\asaha\\Downloads\\spam.csv"):
    """Run the whole pipeline: load, prepare, vectorize, train, evaluate.

    Parameters
    ----------
    filepath : str or path-like, optional
        Location of the dataset CSV. The default is the original author's
        local path; pass your own path rather than editing this function.
    """
    df = load_data(filepath)
    X_train, X_test, y_train, y_test = prepare_data(df)
    # The fitted vectorizer is not needed for evaluation, so it is discarded.
    X_train_tfidf, X_test_tfidf, _ = extract_features(X_train, X_test)
    model = train_model(X_train_tfidf, y_train)
    evaluate_model(model, X_test_tfidf, y_test)

In [42]:
# Run the pipeline only when this code is executed directly (as a script or
# in a notebook kernel), not when it is imported from another module.
if __name__ == "__main__":
    main()

Dataset loaded successfully.
  class                                            message Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Feature extraction complete.
Model training complete.
Accuracy: 0.9623318385650225

Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       0.99      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1