In [26]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

In [27]:
df = pd.read_csv('spam.csv', sep='\t', header=None, names=['label', 'message'], encoding='latin1')

In [28]:
def preprocess_text(text):
    if not isinstance(text, str):
        return []  # Return empty list for non-string (e.g., NaN) values

    # 1. Lowercase
    text = text.lower()

    # 2. Tokenize
    tokens = word_tokenize(text)

    # 3. Remove punctuation and stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    return tokens


In [29]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Loading dataset
df = pd.read_csv('spam.csv', sep='\t', header=None, encoding='latin1')

# Manually split the first column into 'label' and 'message'
df[['label', 'message']] = df[0].str.split(',', n=1, expand=True)

# Drop the original combined column
df = df.drop(columns=[0])

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return []  # Return empty list for non-string (e.g., NaN) values

    # 1. Lowercase
    text = text.lower()

    # 2. Tokenize
    tokens = word_tokenize(text)

    # 3. Remove punctuation and stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    return tokens

# Apply preprocessing to each message
df['tokens'] = df['message'].apply(preprocess_text)

# View the first few rows
print(df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


  label                                            message  \
0    v1                                              v2,,,   
1   ham  "Go until jurong point, crazy.. Available only...   
2   ham                   Ok lar... Joking wif u oni...,,,   
3  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
4   ham  U dun say so early hor... U c already then say...   

                                              tokens  
0                                               [v2]  
1  [``, go, jurong, point, crazy, .., available, ...  
2           [ok, lar, ..., joking, wif, u, oni, ...]  
3  [free, entry, 2, wkly, comp, win, fa, cup, fin...  
4  [u, dun, say, early, hor, ..., u, c, already, ...  


In [30]:
pip install gensim




In [31]:
import gensim.downloader as api
word2vec_model = api.load("word2vec-google-news-300")

In [32]:
def get_avg_word2vec(tokens, model, vector_size=300):
    valid_vectors = [model[word] for word in tokens if word in model]
    if valid_vectors:
        return np.mean(valid_vectors, axis=0)
    else:
        return np.zeros(vector_size)  # Return a zero vector if none are in vocab

# Appling dataset
df['embedding'] = df['tokens'].apply(lambda tokens: get_avg_word2vec(tokens, word2vec_model))


In [33]:
df = df[df['label'].isin(['spam', 'ham'])]  # Remove anything unexpected
df = df.dropna(subset=['label', 'embedding'])  # Drop missing labels/embeddings

In [34]:
# Preparing feature matrix (X) and labels (y)
X = np.stack(df['embedding'].values)  # 2D array of shape (n_samples, 300)

# Encode 'spam' = 1, 'ham' = 0
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])  # Converts ['ham', 'spam'] → [0, 1]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

#  Evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.9390134529147982
              precision    recall  f1-score   support

         ham       0.95      0.98      0.97       965
        spam       0.83      0.69      0.75       150

    accuracy                           0.94      1115
   macro avg       0.89      0.83      0.86      1115
weighted avg       0.94      0.94      0.94      1115



In [35]:
# Function to preprocess and vectorize a single message
def preprocess_and_vectorize(message, w2v_model, vector_size=300):
    stop_words = set(stopwords.words('english'))

    # Lowercase and tokenize
    tokens = word_tokenize(message.lower())

    # Remove stopwords and punctuation
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # Get average Word2Vec vector
    valid_vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if valid_vectors:
        return np.mean(valid_vectors, axis=0)
    else:
        return np.zeros(vector_size)

# Main prediction function
def predict_message_class(model, w2v_model, message):
    vec = preprocess_and_vectorize(message, w2v_model)
    vec = vec.reshape(1, -1)  # Reshape for sklearn input
    prediction = model.predict(vec)[0]
    return 'spam' if prediction == 1 else 'ham'


In [43]:
predict_message_class(clf, word2vec_model, "win a dollar.")

'spam'