# Assignment 2: NLP Classification with Word2Vec

This notebook contains solutions to:
- **Problem 1**: SMS Spam Detection
- **Problem 2**: Twitter Sentiment Classification

Both problems represent each text as the average of its pre-trained Word2Vec (Google News, 300-dimensional) word embeddings and classify that vector with Logistic Regression.

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used below. 'punkt_tab' is the tokenizer data that
# word_tokenize loads on recent NLTK releases (3.9 and later); 'punkt' is kept
# so that older releases continue to work.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

## Problem 1: SMS Spam Classification

In [None]:
# Read the SMS corpus, keep the label/text columns and give them readable names
df = (
    pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)

# English stop words, held in a set for fast membership tests in preprocess()
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

def preprocess(text):
    """Lowercase ``text``, tokenize it, and drop non-alphabetic and stop-word tokens.

    Args:
        text: Raw message string.

    Returns:
        List of lowercase tokens that are purely alphabetic and not in
        ``stop_words``, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize every message once and store the token lists next to the raw text
message_tokens = df['Message'].apply(preprocess)
df['tokens'] = message_tokens

# Pre-trained embeddings, read from the gzipped binary word2vec file
w2v_model = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

def vectorize(tokens, model=None):
    """Average the embeddings of the in-vocabulary tokens into a single vector.

    Args:
        tokens: Iterable of word strings.
        model: Embedding lookup supporting ``word in model``, ``model[word]``
            and ``model.vector_size``. Defaults to the notebook-level
            ``w2v_model``.

    Returns:
        The element-wise mean of the vectors found for ``tokens``, or an
        all-zero vector of length ``model.vector_size`` when none of the
        tokens is in the vocabulary (including when ``tokens`` is empty).
    """
    if model is None:
        model = w2v_model
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        # Size the fallback from the model so it always lines up with real
        # embeddings when the rows are stacked, whatever model is loaded.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# One averaged embedding per message
df['vector'] = df['tokens'].apply(vectorize)

# Stack the per-message vectors into a 2-D feature matrix (one row per message)
X = np.vstack(df['vector'].values)

# Encode the labels, failing loudly on anything other than 'ham'/'spam':
# Series.map would otherwise turn an unexpected label into NaN and the error
# would only surface later, inside the classifier.
label_codes = df['Label'].map({'ham': 0, 'spam': 1})
if label_codes.isna().any():
    unknown_labels = sorted(df.loc[label_codes.isna(), 'Label'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Label' column: {unknown_labels}")
y = label_codes.astype(int).values

# Stratify so train and test keep the same ham/spam proportions; this matters
# if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

print("Test Accuracy:", accuracy_score(y_test, clf.predict(X_test)))

def predict_message_class(model, w2v_model, message):
    """Classify a single SMS message as ``'spam'`` or ``'ham'``.

    Args:
        model: Fitted classifier exposing ``predict``; a predicted class of
            ``1`` is reported as spam.
        w2v_model: Embedding lookup supporting ``word in w2v_model``,
            ``w2v_model[word]`` and ``w2v_model.vector_size``; used to embed
            the message.
        message: Raw message text.

    Returns:
        ``'spam'`` if the model predicts class 1, otherwise ``'ham'``.
    """
    tokens = preprocess(message)
    # Embed with the embeddings passed in rather than the notebook-level
    # global, so the ``w2v_model`` argument is actually honored.
    vectors = [w2v_model[token] for token in tokens if token in w2v_model]
    if vectors:
        vector = np.mean(vectors, axis=0)
    else:
        # No known token: fall back to a zero vector of the embedding size.
        vector = np.zeros(w2v_model.vector_size)
    prediction = model.predict([vector])[0]
    return 'spam' if prediction == 1 else 'ham'

## Problem 2: Twitter US Airline Sentiment Classification

In [None]:
# Read the tweets file, keeping only the sentiment label and the tweet text
tweet_columns = ['airline_sentiment', 'text']
tweets_df = pd.read_csv("Tweets.csv")[tweet_columns]

# Single lemmatizer instance reused for every tweet
lemmatizer = WordNetLemmatizer()

# Contraction -> expansion table, applied in this order to lowercased text
contractions = {
    "don't": "do not",
    "can't": "cannot",
    "i'm": "i am",
    "it's": "it is",
}

def clean_tweet(text):
    """Normalize a tweet into a list of lemmatized, alphabetic tokens.

    Steps, in order: lowercase; expand the contractions listed in
    ``contractions``; strip URLs, @mentions and #hashtags; delete ASCII
    punctuation; tokenize; keep alphabetic tokens and lemmatize them.

    Args:
        text: Raw tweet text.

    Returns:
        List of lemmatized lowercase tokens.
    """
    text = text.lower()
    # Fold the typographic apostrophe (U+2019) to ASCII so that the
    # contraction table, which is keyed on "'", still matches.
    text = text.replace("\u2019", "'")
    for short_form, expanded in contractions.items():
        # Match whole words only: a plain substring replace would also rewrite
        # e.g. "spirit's" into "spirit is". A function replacement keeps the
        # expansion literal (no backslash/group interpretation).
        pattern = r"\b" + re.escape(short_form) + r"\b"
        text = re.sub(pattern, lambda _match, rep=expanded: rep, text)
    text = re.sub(r"http\S+|@\w+|#\w+", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]

# Tokenize each tweet, then average its word embeddings into one vector
tweets_df['tokens'] = tweets_df['text'].apply(clean_tweet)
tweets_df['vector'] = tweets_df['tokens'].apply(vectorize)

# Stack the per-tweet vectors into a 2-D feature matrix (one row per tweet)
X = np.vstack(tweets_df['vector'].values)

# Encode the sentiment labels, failing loudly on unexpected values: Series.map
# would otherwise turn them into NaN and the error would only surface later.
sentiment_codes = tweets_df['airline_sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})
if sentiment_codes.isna().any():
    unknown_sentiments = sorted(
        tweets_df.loc[sentiment_codes.isna(), 'airline_sentiment'].astype(str).unique()
    )
    raise ValueError(f"Unexpected values in 'airline_sentiment' column: {unknown_sentiments}")
y = sentiment_codes.astype(int).values

# Stratify so train and test keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `multi_class` is deprecated in recent scikit-learn releases. With the default
# solver and more than two classes, the default setting already fits a
# multinomial model (scikit-learn 0.22+), so the argument is simply dropped.
clf_sentiment = LogisticRegression(max_iter=1000)
clf_sentiment.fit(X_train, y_train)
print("Sentiment Test Accuracy:", accuracy_score(y_test, clf_sentiment.predict(X_test)))

def predict_tweet_sentiment(model, w2v_model, tweet):
    """Classify a single tweet as ``'negative'``, ``'neutral'`` or ``'positive'``.

    Args:
        model: Fitted classifier exposing ``predict`` and returning the
            integer codes 0, 1 or 2.
        w2v_model: Embedding lookup supporting ``word in w2v_model``,
            ``w2v_model[word]`` and ``w2v_model.vector_size``; used to embed
            the tweet.
        tweet: Raw tweet text.

    Returns:
        The sentiment name for the predicted code.
    """
    tokens = clean_tweet(tweet)
    # Embed with the embeddings passed in rather than the notebook-level
    # global, so the ``w2v_model`` argument is actually honored.
    vectors = [w2v_model[token] for token in tokens if token in w2v_model]
    if vectors:
        vector = np.mean(vectors, axis=0)
    else:
        # No known token: fall back to a zero vector of the embedding size.
        vector = np.zeros(w2v_model.vector_size)
    pred = model.predict([vector])[0]
    # List position doubles as the integer class code: 0, 1, 2.
    return ['negative', 'neutral', 'positive'][pred]