In [11]:
# ✅ STEP 1: Install Gensim
!pip install gensim --quiet

# ✅ STEP 2: Import Libraries (no nltk this time!)
import pandas as pd
import numpy as np
import re
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# ✅ STEP 3: Preprocessing Without NLTK
def preprocess_text(text):
    """Normalize *text* and return its tokens with English stop words removed.

    The text is lowercased, then every character that is not ``a``-``z`` or
    whitespace is deleted outright (not replaced by a space), so ``"don't"``
    becomes ``"dont"``. The remainder is split on whitespace and any token
    found in ``ENGLISH_STOP_WORDS`` is dropped. Token order is preserved.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept = (
        token
        for token in letters_only.split()
        if token not in ENGLISH_STOP_WORDS
    )
    return list(kept)

# ✅ STEP 4: Load Word2Vec
# Fetch the pretrained "word2vec-google-news-300" embeddings through gensim's
# downloader API. The returned object is used below for `word in model`
# membership tests and `model[word]` vector lookups.
w2v_model = api.load("word2vec-google-news-300")  # ⏳ loads 1.5 GB

# ✅ STEP 5: Vectorize function
def vectorize_message(msg, model):
    """Return the mean word embedding of *msg* as a 1-D NumPy array.

    The message is tokenized with ``preprocess_text``; tokens that are not in
    *model* are ignored and the remaining vectors are averaged element-wise.

    Args:
        msg: Raw message text. A non-string value (e.g. ``None`` or the NaN
            pandas uses for an empty CSV cell) is treated as having no words.
        model: Embedding lookup supporting ``word in model`` and
            ``model[word]``. Its ``vector_size`` attribute, when present,
            sets the length of the fallback vector; otherwise 300 is assumed.

    Returns:
        The averaged vector, or an all-zero vector when *msg* contributes no
        token known to *model*.
    """
    # Take the fallback length from the model rather than hard-coding 300, so
    # the zero vector always has the same length as the real embeddings.
    dim = getattr(model, "vector_size", 300)

    # Non-text input would otherwise crash in the tokenizer's `.lower()`.
    if not isinstance(msg, str):
        return np.zeros(dim)

    word_vectors = [model[word] for word in preprocess_text(msg) if word in model]
    if not word_vectors:
        return np.zeros(dim)
    return np.mean(word_vectors, axis=0)

# ✅ STEP 6: Load spam.csv (already uploaded)
# Keep only the label/text columns and give them descriptive names.
df = pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']].rename(
    columns={'v1': 'Label', 'v2': 'Message'}
)

# ✅ STEP 7: Vectorize Messages
# One averaged embedding per row, stored as an array in the 'vector' column.
df['vector'] = df['Message'].apply(vectorize_message, args=(w2v_model,))

# ✅ STEP 8: Prepare Training and Testing Data
# Stack the per-row arrays into a 2-D feature matrix; encode labels as 0/1.
label_ids = {'ham': 0, 'spam': 1}
X = np.vstack(df['vector'].tolist())
y = df['Label'].map(label_ids)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ STEP 9: Train Model
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# ✅ STEP 10: Evaluate
# Report held-out accuracy as a percentage rounded to two decimals.
y_pred = clf.predict(X_test)
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print("✅ Spam Classification Accuracy:", accuracy_pct, "%")

# ✅ STEP 11: Prediction Function
def predict_message_class(message):
    """Classify a single message, returning ``'spam'`` or ``'ham'``.

    The message is embedded with ``vectorize_message`` using the module-level
    ``w2v_model``, reshaped to a one-row matrix, and passed to the module-level
    classifier ``clf``. A predicted label of 1 maps to ``'spam'``; any other
    label maps to ``'ham'``.
    """
    features = vectorize_message(message, w2v_model)
    predicted_label = clf.predict(features.reshape(1, -1))[0]
    if predicted_label == 1:
        return 'spam'
    return 'ham'

# ✅ STEP 12: Try Example Predictions
# One promotional-sounding message and one conversational message.
promo_example = "Congratulations! You have won a free iPhone. Click here to claim."
casual_example = "Hi, let's catch up tomorrow after class."
print("Example 1:", predict_message_class(promo_example))
print("Example 2:", predict_message_class(casual_example))


✅ Spam Classification Accuracy: 94.35 %
Example 1: spam
Example 2: ham
