# Problem 1: Spam message classification with averaged Word2Vec embeddings

In [3]:
import pandas as pd
import numpy as np
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [11]:
# Fetch the NLTK resources requested here: the 'punkt_tab' tokenizer data and
# the 'stopwords' corpus. The second call is the cell's last expression, so
# its return value is what the cell displays.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Read the spam dataset, keep the label/text columns ('v1', 'v2'), give them
# readable names, and encode the label as an integer (ham -> 0, spam -> 1).
df = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
    .assign(Label=lambda frame: frame['Label'].map({'ham': 0, 'spam': 1}))
)

In [7]:
# English stop words from NLTK, stored as a set; preprocess() tests each token
# for membership in it.
stop_words = set(stopwords.words('english'))

In [12]:
def preprocess(text):
    """Tokenize a message into lower-case, alphabetic, non-stop-word tokens.

    Args:
        text: The raw message string.

    Returns:
        A list of tokens, in their original order, that are purely
        alphabetic and are not in the module-level ``stop_words`` set.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip numbers, punctuation and mixed alphanumeric tokens.
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize every message; each row of 'Tokens' holds the list returned by preprocess().
df['Tokens'] = df['Message'].apply(preprocess)

In [13]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader API. The Problem 2 cells further down reuse this same object, so
# this cell must have run before them.
# NOTE(review): presumably a large download on first use — confirm.
w2v_model = api.load("word2vec-google-news-300")



In [14]:
def vectorize(tokens, model, vector_size=300):
    """Average the embedding vectors of the tokens that ``model`` knows.

    Args:
        tokens: Iterable of token strings.
        model: Embedding lookup supporting ``word in model`` and
            ``model[word]``. If it exposes a ``vector_size`` attribute, that
            value sets the length of the all-zero fallback vector.
        vector_size: Length of the fallback vector, used only when ``model``
            has no ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all known tokens, or an
        all-zero vector when none of the tokens is in ``model``.
    """
    # Take the dimensionality from the model when it provides one, so the
    # zero fallback always has the same length as the averaged vectors and
    # the per-row results can be stacked into one matrix.
    dim = getattr(model, "vector_size", vector_size)
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

# Embed each token list as one vector by passing it to vectorize() with the loaded model.
df['Vector'] = df['Tokens'].apply(lambda tokens: vectorize(tokens, w2v_model))

In [15]:
# Stack the per-message vectors into one feature matrix and pull out the labels.
X = np.vstack(df['Vector'].to_numpy())
y = df['Label'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised to 1000 — presumably to give the solver room to
# converge (TODO confirm).
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [17]:
# Predict on the held-out split and report plain accuracy against the true labels.
y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.9417040358744395


In [18]:
def predict_message_class(model, w2v_model, message):
    """Classify a single raw message as ``'spam'`` or ``'ham'``.

    Args:
        model: Fitted classifier exposing ``predict``.
        w2v_model: Embedding lookup passed through to ``vectorize``.
        message: The raw message string.

    Returns:
        ``'spam'`` when the classifier predicts label 1, otherwise ``'ham'``.
    """
    # predict() is given a 2-D array, so reshape the single vector into one row.
    features = vectorize(preprocess(message), w2v_model).reshape(1, -1)
    label = model.predict(features)[0]
    if label == 1:
        return 'spam'
    return 'ham'

# Problem 2: Airline tweet sentiment classification with averaged Word2Vec embeddings

In [19]:
import pandas as pd
import numpy as np
import re
import string
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
# Fetch the NLTK resources used in this section: tokenizer data ('punkt',
# 'punkt_tab'), the stop-word corpus, and the WordNet data ('wordnet',
# 'omw-1.4') requested alongside the WordNetLemmatizer import.
# The final call is the cell's last expression, so its return value is displayed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [20]:
# Load the tweets, keep only the label and text columns, and drop any row
# with a missing value. Note: this rebinds `df`, replacing the spam frame
# from the earlier section.
df = pd.read_csv("Tweets.csv")[['airline_sentiment', 'text']].dropna()

In [21]:
# Lower-case contraction -> expansion pairs. clean_text() applies each one as
# a plain substring replacement after lower-casing the tweet, so the keys must
# stay lower-case and use the straight apostrophe.
contractions_dict = {
    "can't": "cannot", "won't": "will not", "don't": "do not",
    "didn't": "did not", "it's": "it is", "i'm": "i am",
    "they're": "they are", "we're": "we are", "isn't": "is not",
    "aren't": "are not", "wasn't": "was not", "weren't": "were not",
    "couldn't": "could not", "shouldn't": "should not"
}

# Shared lemmatizer and stop-word set read by clean_text().
# This rebinds the `stop_words` name defined earlier in the notebook, using
# the same expression.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a tweet into a list of lemmatized tokens.

    Steps, in order: lower-case; expand the contractions listed in
    ``contractions_dict``; remove URLs; remove @mentions and #hashtags
    (including the tag word itself); strip ASCII punctuation; tokenize;
    keep alphabetic tokens that are not in ``stop_words``; lemmatize.

    Args:
        text: The raw tweet string.

    Returns:
        A list of lemmatized token strings.
    """
    lowered = text.lower()
    for short_form, expansion in contractions_dict.items():
        lowered = lowered.replace(short_form, expansion)

    # URLs and tags are removed before punctuation stripping, while the
    # ':', '/', '@' and '#' characters that mark them are still present.
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', lowered)
    without_tags = re.sub(r"@\w+|#\w+", '', without_urls)

    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = without_tags.translate(punctuation_table)

    # NOTE(review): the stop-word filter runs after contraction expansion, so
    # expanded words that are themselves stop words (presumably "not", "is",
    # "will", ...) are dropped again — confirm this is intended.
    lemmas = []
    for token in word_tokenize(stripped):
        if token.isalpha() and token not in stop_words:
            lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [22]:
def vectorize_tweet(tokens, model):
    """Average the embedding vectors of the tokens that ``model`` knows.

    Args:
        tokens: Iterable of token strings.
        model: Embedding lookup supporting ``word in model``, ``model[word]``
            and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length ``model.vector_size`` when no token is in ``model``.
    """
    known = []
    for token in tokens:
        if token in model:
            known.append(model[token])
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Clean every tweet into a token list, then embed each list as one vector.
# `w2v_model` is the embedding model loaded in the Problem 1 section; it is
# not reloaded here.
df['tokens'] = df['text'].apply(clean_text)
df['vector'] = df['tokens'].apply(lambda x: vectorize_tweet(x, w2v_model))

In [23]:
# Build the feature matrix from the per-tweet vectors; the sentiment strings
# are used directly as class labels.
X = np.stack(df['vector'].to_numpy())
y = df['airline_sentiment'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so `clf` is the fitted classifier.
# Note: this rebinds `clf`, replacing the spam classifier from Problem 1.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.7718579234972678


In [24]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    """Predict the sentiment label of a single raw tweet.

    Args:
        model: Fitted classifier exposing ``predict``.
        w2v_model: Embedding lookup passed through to ``vectorize_tweet``.
        tweet: The raw tweet string.

    Returns:
        The first (and only) element of ``model.predict`` for this tweet.
    """
    tweet_tokens = clean_text(tweet)
    tweet_vector = vectorize_tweet(tweet_tokens, w2v_model)
    # predict() is given a 2-D array, so reshape the single vector into one row.
    features = tweet_vector.reshape(1, -1)
    predictions = model.predict(features)
    return predictions[0]