In [2]:
# Cell 1: Imports & downloads
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

# NLP imports
import nltk

# quiet=True keeps the downloader's progress log (which echoes the local
# nltk_data directory) out of the saved cell output. A missing resource still
# surfaces later as a LookupError when the corpus is first accessed.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aarav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aarav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Cell 2: Load data
# Keep only the label/message columns of the CSV and give them clearer names.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Cell 3: Preprocessing
# Shared helpers for preprocess_text: one stemmer instance, and the English
# stopword list held as a set so membership checks are constant-time.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Convert one raw message into a list of stemmed, stopword-free tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, words found in
    ``stop_words`` are dropped, and each remaining word is stemmed with ``ps``.

    Args:
        text: The message; it is passed through ``str()`` first.

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', str(text))
    stems = []
    for word in letters_only.lower().split():
        if word in stop_words:
            continue
        stems.append(ps.stem(word))
    return stems

# Tokenise every message once; the result is reused for embedding training
# and for building the document vectors.
corpus_tokens = list(map(preprocess_text, df['text'].tolist()))

# quick checks
n_docs = len(corpus_tokens)
first_doc_preview = corpus_tokens[0][:20]
print("n_docs:", n_docs)
print("example tokens:", first_doc_preview)


n_docs: 5572
example tokens: ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']


In [5]:
# Cell 4: Train Word2Vec
# (adjust vector_size / min_count / epochs as needed)
#
# Reproducibility: gensim only produces repeatable vectors when the RNG is
# seeded AND training runs on a single worker thread, because multi-threaded
# training applies updates in OS-scheduling order. Across interpreter restarts
# PYTHONHASHSEED must be fixed as well.
#
# NOTE(review): corpus_tokens covers every message and the train/test split is
# only made in a later cell, so these embeddings are fitted on text that ends
# up in the test set. Consider fitting on the training documents only if a
# strictly held-out evaluation is required.
model = Word2Vec(
    sentences=corpus_tokens,
    vector_size=100,
    window=5,
    min_count=1,   # keep every token so no document loses words to pruning
    workers=1,     # single thread: required for deterministic training
    epochs=10,
    seed=42,
)

# optional: inspect vocabulary size
print("Vocab size:", len(model.wv.key_to_index))


Vocab size: 6239


In [6]:
# Cell 5: Build document vectors and labels
def avg_word2vec(token_list, model, vector_size=100):
    """Represent a document as the mean of its in-vocabulary word vectors.

    Args:
        token_list: Iterable of tokens for one document.
        model: Object exposing ``model.wv[word]`` lookups and a
            ``model.wv.key_to_index`` vocabulary mapping.
        vector_size: Length of the zero vector returned when none of the
            tokens is in the vocabulary.

    Returns:
        numpy.ndarray: Element-wise mean of the matched word vectors, or
        ``np.zeros(vector_size)`` if there were no matches.
    """
    vocabulary = model.wv.key_to_index
    known_tokens = [token for token in token_list if token in vocabulary]
    if not known_tokens:
        return np.zeros(vector_size)
    return np.mean([model.wv[token] for token in known_tokens], axis=0)

# Take the fallback vector length from the trained model instead of repeating
# the literal, so documents with no in-vocabulary tokens always get a zero
# vector of the same length as the averaged ones.
X = np.array([
    avg_word2vec(tokens, model, vector_size=model.wv.vector_size)
    for tokens in corpus_tokens
])

# Series.map turns any label that is missing from the dict into NaN; fail
# loudly here rather than passing NaN targets to the classifier.
label_codes = df['label'].map({'ham': 0, 'spam': 1})
assert label_codes.notna().all(), "unexpected value in df['label']"
y = label_codes.values

assert X.shape[0] == y.shape[0], "X and y must describe the same documents"

print("X.shape:", X.shape)   # (n_samples, vector_size)
print("y.shape:", y.shape)   # (n_samples,)


X.shape: (5572, 100)
y.shape: (5572,)


In [13]:
# Count missing values per column.
df.isna().sum()


label    0
text     0
dtype: int64

In [16]:
# Show the first five document vectors as a table.
pd.DataFrame(X[:5])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.233866,0.376486,0.07926,0.069643,0.133246,-0.57344,0.19684,0.759292,-0.343324,-0.194294,...,0.485478,0.278894,0.087018,0.124224,0.605339,0.356998,0.18693,-0.396586,0.190427,0.004394
1,-0.243304,0.361958,0.080044,0.09243,0.138488,-0.542858,0.185578,0.774873,-0.345033,-0.201142,...,0.486132,0.251219,0.120469,0.134572,0.60442,0.364473,0.146069,-0.395197,0.2017,0.012949
2,-0.212985,0.427219,0.05999,-0.002086,0.006428,-0.71966,0.298697,0.700986,-0.32129,-0.160672,...,0.472918,0.415997,-0.010213,0.07452,0.669911,0.277473,0.372487,-0.438484,0.139463,-0.061512
3,-0.339651,0.495946,0.115944,0.133304,0.19517,-0.762935,0.260536,1.078695,-0.488662,-0.276703,...,0.681654,0.351189,0.160306,0.196928,0.838827,0.510123,0.208402,-0.559703,0.287071,0.015798
4,-0.224881,0.361386,0.100362,0.063252,0.132294,-0.581946,0.197526,0.74783,-0.338442,-0.20687,...,0.505197,0.262612,0.038562,0.112072,0.595911,0.367193,0.197669,-0.40749,0.192347,-0.002908


In [7]:
# Cell 6: Train / test split
# Hold out 20% of the documents. Stratifying on y keeps the class ratio the
# same in both parts, and the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for caption, part in (
    ("X_train.shape:", X_train),
    ("X_test.shape:", X_test),
    ("y_train.shape:", y_train),
    ("y_test.shape:", y_test),
):
    print(caption, part.shape)


X_train.shape: (4457, 100)
X_test.shape: (1115, 100)
y_train.shape: (4457,)
y_test.shape: (1115,)


In [9]:
# Peek at the training labels; the 0/1 codes follow the ham/spam mapping used
# when y was built.
y_train

array([0, 0, 0, ..., 1, 1, 0], shape=(4457,))

In [10]:
## apply random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: bootstrap sampling and per-split feature sub-sampling are
# random, so an unseeded model reports different metrics on every run.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict once and derive every metric from the same predictions.
y_pred = classifier.predict(X_test)

# Print accuracy explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print("accuracy:", accuracy_score(y_test, y_pred))

# Rows are true classes, columns are predicted classes (0 = ham, 1 = spam).
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))


[[951  15]
 [ 24 125]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       966
           1       0.89      0.84      0.87       149

    accuracy                           0.97      1115
   macro avg       0.93      0.91      0.92      1115
weighted avg       0.96      0.97      0.96      1115

