In [1]:
import pandas as pd

df = pd.read_csv(
    "sms_spam_collection/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"],
    encoding="latin-1"
)

df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [3]:
df.isnull().sum()


label      0
message    0
dtype: int64

In [4]:
df.duplicated().sum()


np.int64(403)

In [5]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Extract features (X) and labels (y)
X = df['message']
y = df['label']

# Convert text to TF-IDF (text → numbers)
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)


In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReport:\n", classification_report(y_test, y_pred))


Accuracy: 0.979372197309417

Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [8]:
msg = ["WINNER!! You have won a free iPhone! Click here now"]
msg_tfidf = vectorizer.transform(msg)
print(model.predict(msg_tfidf))


['spam']


In [9]:
msg = ["Ok bro I'm heading to the gym, text me later"]
msg_tfidf = vectorizer.transform(msg)
print(model.predict(msg_tfidf))


['ham']


In [10]:
import nltk

nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading punkt_tab: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

In [11]:
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


Downloader>  q


True

In [12]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")


[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

In [13]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")


[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading punkt_tab: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

In [14]:
stop_words = {
    "the","a","to","and","or","is","in","at","on","for","of","this","that","it",
    "be","are","with","as","an","by","from","i","you","me","my","we","our","your",
    "but","so","if","not","was","were","very","too","just","rt"
}


In [15]:
def simple_tokenize(text):
    return text.split()


In [16]:
import re

def preprocess(text):

    # 1. lowercase
    text = text.lower()

    # 2. remove everything EXCEPT a–z, spaces, $ and !
    text = re.sub(r"[^a-z\s$!]", "", text)

    # 3. tokenize
    tokens = simple_tokenize(text)

    # 4. remove stopwords
    tokens = [t for t in tokens if t not in stop_words]

    # 5. join back into string
    return " ".join(tokens)


In [17]:
df["message"] = df["message"].apply(preprocess)
df.head()


Unnamed: 0,label,message
0,ham,go until jurong point crazy available only bug...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already then say
4,ham,nah dont think he goes usf he lives around her...


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["message"])


In [19]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X, df["label"])


In [20]:
msg = ["WIN a FREE $1000 prize!!! click now!!!"]
msg_clean = preprocess(msg[0])
msg_tfidf = vectorizer.transform([msg_clean])
model.predict(msg_tfidf)


array(['spam'], dtype='<U4')

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer with bigrams, min_df, and max_df to focus on relevant terms
vectorizer = CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 2))

# Fit and transform the message column
X = vectorizer.fit_transform(df["message"])

# Labels (target variable)
y = df["label"].apply(lambda x: 1 if x == "spam" else 0)  # Converting labels to 1 and 0

In [22]:
import joblib
joblib.dump(best_model, "spam_model.joblib")


NameError: name 'best_model' is not defined

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", MultinomialNB())
])

param_grid = {
    "classifier__alpha": [0.01, 0.1, 0.2, 0.5, 1.0]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="f1"
)

grid_search.fit(df["message"], y)

best_model = grid_search.best_estimator_
print("BEST PARAMS:", grid_search.best_params_)


BEST PARAMS: {'classifier__alpha': 0.5}


In [24]:
import joblib
joblib.dump(best_model, "spam_model.joblib")


['spam_model.joblib']