In [1]:
!pip install gensim


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [2]:
# Install required packages (uncomment and run once if needed)
# !pip install pandas nltk scikit-learn gensim

import pandas as pd
import numpy as np
import nltk
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the NLTK stop-word lists (the regex tokenizer below needs no 'punkt' data)
nltk.download('stopwords')

# Read the labelled messages, keeping only the two relevant columns under readable names
df = (
    pd.read_csv('/content/ham.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)

# Shared preprocessing resources: English stop words and a word-character tokenizer
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

def preprocess(text):
    """Lower-case ``text``, tokenize it and drop stop words.

    Uses the module-level ``tokenizer`` and ``stop_words``.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    kept = []
    for token in tokenizer.tokenize(text.lower()):
        if token not in stop_words:
            kept.append(token)
    return kept

# Tokenise every message once, ahead of the embedding step
df['Tokens'] = df['Message'].map(preprocess)

# Fetch the pretrained Google News Word2Vec vectors through gensim's downloader
print("Loading Word2Vec model...")
w2v_model = api.load("word2vec-google-news-300")

# Convert each message into a fixed-length vector
def vectorize(tokens, model, size=300):
    """Average the embeddings of the in-vocabulary tokens into a single vector.

    Parameters
    ----------
    tokens : iterable of str
        Pre-processed words of one message.
    model : mapping-like
        Any object supporting ``word in model`` and ``model[word]``
        (for example the object returned by ``api.load``).
    size : int, default 300
        Length of the all-zero fallback vector. Only used when ``model``
        does not expose a ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known tokens, or a zero
        vector when none of the tokens is present in ``model``.
    """
    # Look each word up once instead of filtering first and indexing again.
    vectors = [model[word] for word in tokens if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    # Prefer the dimensionality the model itself reports, so the fallback always
    # stacks cleanly with real embeddings even when `size` does not match the model.
    return np.zeros(getattr(model, 'vector_size', size))

# Embed every tokenised message, then stack the per-message vectors into a 2-D feature matrix
df['Vector'] = df['Tokens'].apply(lambda message_tokens: vectorize(message_tokens, w2v_model))
X = np.vstack(df['Vector'].to_numpy())
y = df['Label'].map({'ham': 0, 'spam': 1})

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the logistic regression classifier (fit() returns the estimator itself)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows and report accuracy
y_pred = clf.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Prediction function
def predict_message_class(model, w2v_model, message):
    """Classify one raw message as ``'spam'`` or ``'ham'``.

    The text goes through ``preprocess`` and ``vectorize``; the resulting
    vector is reshaped to a single row before being passed to
    ``model.predict``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    w2v_model : embedding lookup forwarded to ``vectorize``
    message : str
        Raw message text.

    Returns
    -------
    str
        ``'spam'`` when the predicted label equals 1, otherwise ``'ham'``.
    """
    features = vectorize(preprocess(message), w2v_model).reshape(1, -1)
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'spam'
    return 'ham'

# Smoke-test the classifier on two hand-written messages
for sample_text in (
    "Congratulations! You've won a free ticket. Reply YES to claim.",
    "Are we still meeting today at 5?",
):
    print(predict_message_class(clf, w2v_model, sample_text))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading Word2Vec model...
Test Accuracy: 0.9453
spam
ham


In [18]:
# Optional: Uncomment to install required packages
# !pip install pandas nltk scikit-learn gensim

import pandas as pd
import numpy as np
import nltk
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 📥 Fetch the NLTK stop-word lists
nltk.download('stopwords')

# 📂 Load the tweets, tidy the header names, keep the two needed columns and drop incomplete rows
df = (
    pd.read_csv('twitter.csv')  # Update the path as needed
    .rename(columns=str.strip)
    .loc[:, ['text', 'airline_sentiment']]
    .dropna()
)

# 🧼 Shared preprocessing resources: English stop words and a word-character tokenizer
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

def preprocess(text):
    """Lower-case ``text``, tokenize it and remove stop words.

    Relies on the module-level ``tokenizer`` and ``stop_words``.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    words = tokenizer.tokenize(text.lower())
    return list(filter(lambda word: word not in stop_words, words))

df['Tokens'] = df['text'].apply(preprocess)

# 🌐 Load Word2Vec model
# An earlier cell of this notebook loads this same pretrained model into `w2v_model`.
# Loading it a second time is redundant work, so only load when the kernel does not
# already hold it. NOTE: this assumes any existing `w2v_model` is this model.
if 'w2v_model' not in globals():
    print("Loading Word2Vec model...")
    w2v_model = api.load("word2vec-google-news-300")

# 🔢 Vectorize each tweet
def vectorize(tokens, model, size=300):
    """Average the embeddings of the in-vocabulary tokens into a single vector.

    Parameters
    ----------
    tokens : iterable of str
        Pre-processed words of one tweet.
    model : mapping-like
        Any object supporting ``word in model`` and ``model[word]``
        (for example the object returned by ``api.load``).
    size : int, default 300
        Length of the all-zero fallback vector. Only used when ``model``
        does not expose a ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known tokens, or a zero
        vector when none of the tokens is present in ``model``.
    """
    # Look each word up once instead of filtering first and indexing again.
    vectors = [model[word] for word in tokens if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    # Prefer the dimensionality the model itself reports, so the fallback always
    # stacks cleanly with real embeddings even when `size` does not match the model.
    return np.zeros(getattr(model, 'vector_size', size))

# 🔢 Embed every tokenised tweet
df['Vector'] = df['Tokens'].apply(lambda tweet_tokens: vectorize(tweet_tokens, w2v_model))

# 🎯 Feature matrix (one row per tweet) and the label column
X = np.vstack(df['Vector'].to_numpy())
y = df['airline_sentiment']  # Sentiment labels: typically 'positive', 'neutral', 'negative'

# 🧪 Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 🏋️ Fit the logistic regression classifier (fit() returns the estimator itself)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# ✅ Score the held-out rows and report accuracy
y_pred = clf.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# 🔍 Prediction function
def predict_tweet_sentiment(model, w2v_model, message):
    """Return the label ``model`` predicts for one raw tweet.

    The text goes through ``preprocess`` and ``vectorize``; the resulting
    vector is reshaped to a single row before being passed to
    ``model.predict``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    w2v_model : embedding lookup forwarded to ``vectorize``
    message : str
        Raw tweet text.

    Returns
    -------
    The first (and only) element of ``model.predict``'s output.
    """
    cleaned_tokens = preprocess(message)
    features = vectorize(cleaned_tokens, w2v_model)
    single_row = features.reshape(1, -1)
    return model.predict(single_row)[0]

# 💬 Smoke-test the classifier on two hand-written tweets
for sample_tweet in (
    "This airline is the worst. Never flying again!",
    "Really happy with the flight service today.",
):
    print(predict_tweet_sentiment(clf, w2v_model, sample_tweet))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading Word2Vec model...
Test Accuracy: 0.7804
negative
positive
