<a href="https://colab.research.google.com/github/Adhishtalakshmibharathi/mfc-task2/blob/main/NM_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Set up environment (run in Colab)
!pip install -q scikit-learn pandas matplotlib nltk joblib

# Cell 2: Download dataset (UCI SMS Spam Collection)
!wget -q --show-progress https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip


!unzip -o smsspamcollection.zip

# Cell 3: Imports and NLTK setup
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
import nltk
# Fetch the NLTK data used further down: the stop-word list, the tokenizer
# models used by nltk.word_tokenize, and WordNet for the lemmatizer.
# NOTE(review): 'punkt_tab' was added because a resource was reported missing;
# presumably newer NLTK releases look it up instead of 'punkt' -- confirm.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Cell 4: Load dataset
# The file is tab-separated with no header row: column 0 is the label
# ('ham'/'spam') and column 1 is the raw message text.
# Quote handling is switched off (quoting=3 is csv.QUOTE_NONE) because the
# messages contain stray double quotes; with the default quoting the parser
# treats them as field delimiters and can merge neighbouring lines into a
# single record, silently dropping messages.
# NOTE(review): the dataset readme reportedly lists 5574 messages -- confirm
# the row count printed below after this change.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=3,
)

# Numeric target for scikit-learn: ham -> 0, spam -> 1.
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
# .map() yields NaN for any label outside the dictionary; fail here rather
# than later inside the train/test split or the model fit.
assert df['label_num'].notna().all(), 'unexpected label value in SMSSpamCollection'
print('Dataset shape:', df.shape)
print(df.head())

# Cell 5: Basic EDA
# Class balance, then message length in characters.
print(df['label'].value_counts())
df['msg_len'] = df['message'].str.len()
print('Message length — mean:', df['msg_len'].mean(), 'median:', df['msg_len'].median())

# Cell 6: Preprocessing function
# Built once at module level so preprocess_text() does not rebuild them for
# every message.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership tests when filtering tokens.
stop_words = {*stopwords.words('english')}
def preprocess_text(text):
    """Normalise one raw SMS into a space-separated string of lemmas.

    Steps: lowercase, strip URLs, replace every character that is not a
    lowercase letter, digit or whitespace with a space, tokenize, drop
    stop words and non-alphabetic tokens, lemmatize, and re-join.

    Args:
        text: the raw message as a ``str``.

    Returns:
        The cleaned message; an empty string when no token survives.
    """
    text = text.lower()
    # Inside a raw string the escape is a single backslash: r'\S' is
    # "non-whitespace". The previous r'\\S' meant "a literal backslash followed
    # by the letter S", so URLs were never actually removed.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Same fix for the character class: r'\s' is whitespace (previously the
    # class contained a literal backslash and the letter 's').
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    # isalpha() also discards purely numeric tokens, so digits kept by the
    # regex above only matter when they are glued to letters.
    return ' '.join(
        lemmatizer.lemmatize(t)
        for t in tokens
        if t.isalpha() and t not in stop_words
    )

# Clean every raw message once; the cleaned text is what gets vectorised.
df['clean'] = df['message'].map(preprocess_text)

# Cell 7: Train-test split and feature extraction
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# ham/spam ratio the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean'],
    df['label_num'],
    test_size=0.2,
    random_state=42,
    stratify=df['label_num'],
)

# Unigram + bigram TF-IDF capped at 5000 features. The vocabulary is fitted on
# the training texts only and then reused unchanged for the test texts.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Cell 8: Train baseline model (MultinomialNB)
# Baseline classifier: fit() returns the estimator itself, so construction and
# training can be chained.
mnb = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = mnb.predict(X_test_tfidf)

# Per-class precision/recall/F1 on the held-out set.
print('MultinomialNB Classification Report:')
mnb_report = classification_report(y_test, y_pred, target_names=['ham','spam'])
print(mnb_report)

# Rows are true classes, columns are predicted classes (ham first, spam second).
print('Confusion Matrix:')
mnb_confusion = confusion_matrix(y_test, y_pred)
print(mnb_confusion)

# Cell 9: Train Logistic Regression for comparison
# Second model on the same TF-IDF features, for comparison with the baseline.
# max_iter is raised to 1000 to give the solver more iterations to converge.
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

print('Logistic Regression Classification Report:')
lr_report = classification_report(y_test, y_pred_lr, target_names=['ham','spam'])
print(lr_report)

# Cell 10: Save model and vectorizer


# Persist the Naive Bayes model together with the fitted vectorizer: inference
# needs both, since the model only understands this vectorizer's feature space.
for artifact, artifact_path in (
    (mnb, 'mnb_sms_spam_model.joblib'),
    (tfidf, 'tfidf_vectorizer.joblib'),
):
    joblib.dump(artifact, artifact_path)
print('Saved model and vectorizer to current directory.')

# Cell 11: Inference example
def predict_message(msg, threshold=0.5):
    """Classify one raw SMS with the fitted TF-IDF vectorizer and NB model.

    The message goes through the same ``preprocess_text`` cleaning used for
    training, is vectorised with ``tfidf`` and scored with ``mnb``.

    Args:
        msg: the raw message text.
        threshold: spam probability strictly above which the message is
            labelled 'spam'. Defaults to 0.5, the previously hard-coded value.

    Returns:
        A ``(label, prob)`` tuple where ``label`` is 'spam' or 'ham' and
        ``prob`` is the model's probability for the spam class.
    """
    clean = preprocess_text(msg)
    vect = tfidf.transform([clean])
    # Find the spam column through classes_ instead of assuming it is index 1;
    # predict_proba orders its columns the same way as classes_.
    spam_col = list(mnb.classes_).index(1)
    prob = mnb.predict_proba(vect)[0][spam_col]
    label = 'spam' if prob > threshold else 'ham'
    return label, prob

# One obviously promotional message and one ordinary conversational message,
# to eyeball the end-to-end inference path.
examples = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate).",
    "Hey, are we meeting today?",
]
for sample_msg in examples:
    prediction = predict_message(sample_msg)
    print(sample_msg, "->", prediction)

smsspamcollection.z     [ <=>                ] 198.65K  1.29MB/s    in 0.2s    
Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Dataset shape: (5572, 3)
  label                                            message  label_num
0   ham  Go until jurong point, crazy.. Available only ...          0
1   ham                      Ok lar... Joking wif u oni...          0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...          1
3   ham  U dun say so early hor... U c already then say...          0
4   ham  Nah I don't think he goes to usf, he lives aro...          0
label
ham     4825
spam     747
Name: count, dtype: int64
Message length — mean: 80.48994974874371 median: 62.0
MultinomialNB Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       1.00      0.81      0.89       149

    accuracy                           0.97      1115
   macro avg       0.99      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
[[966   0]
 [ 29 120]]
Logistic Regression Classification Rep