In [1]:
import pandas as pd

df = pd.read_csv(
    "sms_spam_collection/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"],
    encoding="latin-1"
)

# Remove duplicates
df = df.drop_duplicates()
print(df.shape)
df.head()


(5169, 2)


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess(msg):
    msg = msg.lower()
    msg = re.sub(r"[^a-z\s$!]", "", msg)
    tokens = word_tokenize(msg)
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [stemmer.stem(w) for w in tokens]
    return " ".join(tokens)

df["clean"] = df["message"].apply(preprocess)
df.head()


[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


Unnamed: 0,label,message,clean
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

X = df["clean"]
y = df["label"].apply(lambda x: 1 if x == "spam" else 0)

pipeline = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1,2), max_df=0.9)),
    ("classifier", MultinomialNB())
])

param_grid = {
    "classifier__alpha": [0.01, 0.1, 0.2, 0.5, 1.0]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="f1"
)

grid.fit(X, y)

best_model = grid.best_estimator_
print("Best params:", grid.best_params_)


Best params: {'classifier__alpha': 0.2}


In [4]:
import joblib
joblib.dump(best_model, "spam_model.joblib")
print("MODEL SAVED ✔️")


MODEL SAVED ✔️


In [5]:
import requests, json

url = "http://localhost:8000/api/upload"

with open("spam_model.joblib", "rb") as f:
    response = requests.post(url, files={"model": f})

print(json.dumps(response.json(), indent=4))


{
    "accuracy": 0.9130434782608695,
    "flag": "HTB{sp4m_cla55if13r_3v4lu4t0r}"
}
