# 02 – Feature Engineering & Model Training

**Objectives**
- Load cleaned data 
- Convert text to numerical features using TF-IDF
- Train a Naive Bayes model
- Evaluate model performance
- Save trained model & vectorizer


In [10]:


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Apply seaborn's "whitegrid" style to figures drawn after this point.
sns.set_style("whitegrid")


## Load cleaned data 


In [11]:
DATA_PATH = "../data/processed.pkl"
# Columns the rest of this notebook reads from the loaded frame.
REQUIRED_COLUMNS = ("clean_message", "label_num")

# SECURITY: unpickling can execute arbitrary code. Only load a file that was
# produced by this project's own preprocessing step, never an untrusted one.
df = pd.read_pickle(DATA_PATH)

# Fail fast with a readable message instead of an obscure error further down:
# the split and the vectorizer both need these columns and cannot handle nulls.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
assert not missing_columns, f"{DATA_PATH} is missing expected columns: {missing_columns}"
assert df[list(REQUIRED_COLUMNS)].notna().all().all(), (
    f"{DATA_PATH} contains missing values in {list(REQUIRED_COLUMNS)}"
)

print("Dataset shape:", df.shape)
# Keep the preview as the last expression so the notebook renders it richly.
df.head()


Dataset shape: (5572, 2)


Unnamed: 0,clean_message,label_num
0,go jurong point crazi avail bugi n great world...,0
1,ok lar joke wif u oni,0
2,free entri wkli comp win fa cup final tkt st m...,1
3,u dun say earli hor u c alreadi say,0
4,nah dont think goe usf live around though,0


## Train-test split


In [12]:
# Model input is the cleaned message text; the target is the numeric label.
X, y = df["clean_message"], df["label_num"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both parts, and the fixed random_state makes the
# split repeatable across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", X_train.shape)
print("Test size :", X_test.shape)


Train size: (4457,)
Test size : (1115,)


## TF-IDF Vectorization


In [13]:
# Build features from unigrams and bigrams, with the vocabulary capped at
# 3000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF shape:", X_train_tfidf.shape)


TF-IDF shape: (4457, 3000)


## Train Naive Bayes Model


In [14]:
# fit() returns the estimator itself, so the classifier can be created and
# trained on the TF-IDF training matrix in a single expression.
model = MultinomialNB().fit(X_train_tfidf, y_train)

print("Model training completed")


Model training completed


## Model Evaluation


In [15]:
# Predict labels for the held-out messages and compare them with the truth.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Shown as a percentage rounded to two decimal places.
print("Accuracy:", round(accuracy * 100, 2), "%")


Accuracy: 97.22 %
