In [21]:
import pandas as pd
from google.colab import drive

# Attach Google Drive to the Colab filesystem under /content/drive.
drive.mount("/content/drive")

# Location of the dataset on Drive; edit this if spam.csv is not in the root of My Drive.
file_path = "/content/drive/My Drive/spam.csv"

# Parse the CSV (decoded as ISO-8859-1) into a DataFrame.
df = pd.read_csv(file_path, encoding="ISO-8859-1")

# Print the leading rows as a quick sanity check of the load.
print(df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [29]:
# Give the two useful columns readable names and keep only those two.
column_names = {"v1": "label", "v2": "message"}
df = df.rename(columns=column_names)[list(column_names.values())]

In [33]:
# Remove exact duplicate rows before modelling so a repeated message cannot end
# up in both the training and the test split.
# NOTE(review): the recorded result of 0 suggests duplicates were dropped in a
# cell that is no longer part of this notebook — confirm. On an already
# duplicate-free frame this line only renumbers the index.
df = df.drop_duplicates().reset_index(drop=True)

# Sanity check: number of duplicate rows remaining (expected to be 0).
df.duplicated().sum()

0

In [28]:
import re
import string

import gensim.downloader as api
import nltk
import numpy as np
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# Fetch the NLTK "stopwords" corpus so that nltk.corpus.stopwords can be used
# in later cells.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
# Load the English stopword list from NLTK and hold it as a set; preprocess()
# below tests every token for membership in it.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Turn a raw message into a list of stopword-free tokens.

    The text is tokenized and lowercased by gensim's ``simple_preprocess``;
    every token found in the module-level ``stop_words`` set is then dropped.

    Args:
        text: The raw message string.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    return [tok for tok in simple_preprocess(text) if tok not in stop_words]

# Tokenize every message; each cell of "Tokens" holds that message's token list.
df["Tokens"] = df["message"].map(preprocess)


In [38]:
# Load the pretrained "word2vec-google-news-300" embeddings through gensim's
# downloader API (imported as `api`).
# NOTE(review): the model name suggests 300-dimensional vectors, matching
# vectorize()'s default vector_size — confirm.
w2v_model = api.load("word2vec-google-news-300")

In [40]:
# Convert messages to fixed-length vectors
def vectorize(tokens, model, vector_size=300):
    """Average the embeddings of ``tokens`` into one fixed-length vector.

    Args:
        tokens: Iterable of word strings.
        model: Embedding lookup that supports ``word in model`` and
            ``model[word]`` (for example gensim ``KeyedVectors``).
        vector_size: Length of the all-zero fallback vector. Used only when
            ``model`` does not expose its own ``vector_size`` attribute.

    Returns:
        The element-wise mean of the embeddings of every token found in
        ``model``, or an all-zero vector when none of the tokens is found.
    """
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        # Size the fallback from the model itself when possible, so it always
        # has the same length as the real embeddings and the later np.stack()
        # cannot fail for a model that is not 300-dimensional.
        return np.zeros(getattr(model, "vector_size", vector_size))
    return np.mean(vectors, axis=0)

# Embed every token list; each cell of "Vector" holds one averaged embedding.
df["Vector"] = df["Tokens"].map(lambda toks: vectorize(toks, w2v_model))

In [43]:
# Turn the textual class into a numeric target: ham -> 0, spam -> 1.
label_to_id = {"ham": 0, "spam": 1}
df["Label_encoded"] = df["label"].map(label_to_id)

In [45]:
# Assemble the model inputs: X stacks the per-message vectors into one array,
# y holds the encoded labels.
X = np.stack(df["Vector"].to_numpy())
y = df["Label_encoded"].to_numpy()

In [48]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier (up to 1000 solver iterations) on the
# training portion.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [49]:
# Evaluate on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.4f}")

# Accuracy is a single number across both classes and can look high even when
# one class is predicted poorly (NOTE(review): class balance is not shown in
# this notebook — confirm). Report precision/recall/F1 per class as well.
# Labels follow the encoding used for "Label_encoded": 0 = ham, 1 = spam.
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=["ham", "spam"],
        zero_division=0,
    )
)

Accuracy on test set: 0.9497


In [51]:
# Prediction function
def predict_message_class(model, w2v_model, message):
    """Classify one raw message as ``"spam"`` or ``"ham"``.

    The message goes through the same ``preprocess`` and ``vectorize`` steps
    used for the training data, and the resulting vector is passed to
    ``model.predict`` as a single-row 2-D array.

    Args:
        model: Fitted classifier exposing ``predict``.
        w2v_model: Embedding lookup handed to ``vectorize``.
        message: The raw message string.

    Returns:
        ``"spam"`` when the predicted label equals 1, otherwise ``"ham"``.
    """
    features = vectorize(preprocess(message), w2v_model)
    label = model.predict(features.reshape(1, -1))[0]
    if label == 1:
        return "spam"
    return "ham"

# Quick smoke test on a hand-written message.
msg = "Your Recharge is going to end this month!"
predicted = predict_message_class(model, w2v_model, msg)
print("Predicted class:", predicted)

Predicted class: ham
