In [4]:
# twitter_sentiment_classifier.py

import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from gensim.models import KeyedVectors

# NLTK resources needed for tokenising and lemmatising.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on
# NLTK >= 3.8.2; 'punkt' is kept for older releases. With the default
# raise_on_error=False, a package unknown to the installed NLTK is reported
# and nltk.download returns False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to C:\Users\divya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\divya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\divya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
# 1. Load data, keep only the sentiment and text columns, and rename them
#    (airline_sentiment → 'Label', text → 'Tweet').
df = (
    pd.read_csv('Tweets.csv')
    .loc[:, ['airline_sentiment', 'text']]
    .rename(columns={'airline_sentiment': 'Label', 'text': 'Tweet'})
)
# Keep only rows carrying one of the three known classes. The explicit .copy()
# makes `df` an independent frame, so adding columns to it afterwards
# (e.g. df['tokens']) cannot trigger pandas' SettingWithCopyWarning when this
# filter actually drops rows.
df = df[df['Label'].isin(['positive', 'negative', 'neutral'])].copy()

# 2. Preprocess each tweet
lemmatizer = WordNetLemmatizer()
# Contractions expanded by clean_tweet. Keys are lower-case because the tweet
# text is lower-cased before the lookup happens.
contraction_replacements = {
    "don't": "do not",
    "can't": "can not",
    "it's": "it is",
    "i'm": "i am",
}

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_tweet(text):
    """Normalise a raw tweet into a list of lemmatised alphabetic tokens.

    Steps, in order: lower-case the text, expand the contractions listed in
    ``contraction_replacements``, blank out URLs / @mentions / #hashtags,
    delete ASCII punctuation, tokenise with ``nltk.word_tokenize`` and
    lemmatise each remaining alphabetic token.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example the
            NaN pandas stores for a missing cell) is treated as an empty tweet.

    Returns:
        list[str]: Lemmatised tokens made only of alphabetic characters; an
        empty list when nothing usable remains.
    """
    if not isinstance(text, str):
        return []

    # Fold the typographic apostrophe (U+2019) to ASCII: it is not part of
    # string.punctuation and would otherwise never match a contraction key.
    text = text.lower().replace("\u2019", "'")

    for contraction, expansion in contraction_replacements.items():
        # Whole-word match only, so "rabbit's" is not rewritten through "it's".
        # Assumes keys start and end with a word character, as the current
        # ones do. The callable keeps the expansion literal (no backslash
        # interpretation by re.sub).
        text = re.sub(
            r"\b" + re.escape(contraction) + r"\b",
            lambda _match, expansion=expansion: expansion,
            text,
        )

    # Replace links, mentions and hashtags with a space so neighbours stay split.
    text = re.sub(r"http\S+|@\w+|#\w+", ' ', text)
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = nltk.word_tokenize(text)
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok.isalpha()]

# Run the cleaning pipeline over every tweet; each cell of 'tokens' holds the
# list returned by clean_tweet for that row.
df['tokens'] = df['Tweet'].apply(clean_tweet)

In [None]:
import gensim.downloader as api

# 3. Load the pre-trained Google News Word2Vec model, looked up by name
#    through gensim's downloader API.
W2V_MODEL_NAME = "word2vec-google-news-300"
w2v_model = api.load(W2V_MODEL_NAME)

In [14]:
# 4. Tweet → average embeddings
def tokens_to_vec(tokens, model, size=300):
    """Average the embeddings of the in-vocabulary tokens of one tweet.

    Args:
        tokens: Iterable of token strings.
        model: Embedding lookup supporting ``token in model`` and
            ``model[token]`` (e.g. a gensim ``KeyedVectors``).
        size: Length of the zero vector returned when no token is in the
            vocabulary. Only used when ``model`` has no ``vector_size``
            attribute; otherwise the model's own dimensionality is used so the
            fallback always has the same length as a real averaged embedding.

    Returns:
        numpy.ndarray: 1-D element-wise mean of the vectors that were found,
        or an all-zero vector when none of the tokens are known.
    """
    vecs = [model[w] for w in tokens if w in model]
    if vecs:
        return np.mean(vecs, axis=0)
    # Nothing to average: fall back to zeros whose length matches the model,
    # so rows can still be stacked into one feature matrix.
    return np.zeros(getattr(model, 'vector_size', size))

# Embed every tokenised tweet as a single averaged vector
df['vec'] = df['tokens'].map(lambda toks: tokens_to_vec(toks, w2v_model))

# 5. Train/test split
X = np.vstack(df['vec'].values)  # one row per tweet
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train & evaluate
# `multi_class` is left at its default: passing it explicitly (even 'auto') is
# deprecated since scikit-learn 1.5 and emits a FutureWarning, while the
# default gives the same behaviour.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
acc2 = clf.score(X_test, y_test)  # mean accuracy on the held-out 20 %
print(f"Twitter Sentiment Accuracy: {acc2:.4f}")



Twitter Sentiment Accuracy: 0.7859


In [15]:
# 7. Prediction function
def predict_tweet_sentiment(model, w2v_model, tweet):
    """Predict the sentiment label of a single raw tweet.

    The tweet goes through the same pipeline as the training data:
    ``clean_tweet`` for tokenising, then ``tokens_to_vec`` for the averaged
    embedding.

    Args:
        model: Fitted classifier exposing ``predict``.
        w2v_model: Embedding lookup handed to ``tokens_to_vec``.
        tweet: Raw tweet text.

    Returns:
        The first (and only) element of ``model.predict`` for this tweet.
    """
    embedding = tokens_to_vec(clean_tweet(tweet), w2v_model)
    # The classifier expects a 2-D batch, so present the vector as one row.
    features = embedding.reshape(1, -1)
    predicted_labels = model.predict(features)
    return predicted_labels[0]

In [16]:
# Three hand-written demo tweets, one per sentiment class.
# Example 1: Negative Sentiment Tweet
tweet1 = "This is the worst experience I've ever had with an airline. Never flying again!"
# Example 2: Positive Sentiment Tweet
tweet2 = "Amazing service and very friendly crew. Loved flying with you!"
# Example 3: Neutral Sentiment Tweet
tweet3 = "I will be flying with United Airlines tomorrow."

# Classify the examples in order with the trained classifier.
prediction1, prediction2, prediction3 = [
    predict_tweet_sentiment(clf, w2v_model, example)
    for example in (tweet1, tweet2, tweet3)
]

# Report each tweet next to its predicted label.
print(f"Tweet: {tweet1}\nPredicted Sentiment: {prediction1}\n")
print(f"Tweet: {tweet2}\nPredicted Sentiment: {prediction2}\n")
print(f"Tweet: {tweet3}\nPredicted Sentiment: {prediction3}")


Tweet: This is the worst experience I've ever had with an airline. Never flying again!
Predicted Sentiment: negative

Tweet: Amazing service and very friendly crew. Loved flying with you!
Predicted Sentiment: positive

Tweet: I will be flying with United Airlines tomorrow.
Predicted Sentiment: neutral
