In [20]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

In [21]:
df = pd.read_csv('Tweets.csv')

In [34]:
df.head()

Unnamed: 0,airline_sentiment,text,tokens,embedding
0,neutral,@VirginAmerica What @dhepburn said.,"[virginamerica, dhepburn, said]","[-0.009094238, -0.044189453, 0.099609375, -0.0..."
1,positive,@VirginAmerica plus you've added commercials t...,"[virginamerica, plus, 've, added, commercials,...","[0.025349936, 0.013427734, -0.07526652, 0.1082..."
2,neutral,@VirginAmerica I didn't today... Must mean I n...,"[virginamerica, n't, today, ..., must, mean, n...","[-0.016059875, 0.08164978, -0.015533447, 0.039..."
3,negative,@VirginAmerica it's really aggressive to blast...,"[virginamerica, 's, really, aggressive, blast,...","[0.012815857, 0.09603272, -0.012805176, 0.0520..."
4,negative,@VirginAmerica and it's a really big bad thing...,"[virginamerica, 's, really, big, bad, thing]","[0.11010742, 0.06271362, 0.0031738281, 0.13183..."


In [13]:
def preprocess_text(text):
    if not isinstance(text, str):
        return []  # Return empty list for non-string (e.g., NaN) values

    # 1. Lowercase
    text = text.lower()

    # 2. Tokenize
    tokens = word_tokenize(text)

    # 3. Remove punctuation and stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    return tokens


In [25]:
nltk.download('punkt')
nltk.download('stopwords')
# nltk.download('punkt_tab') # This download is not needed and can be removed


# Apply preprocessing to each message
df['tokens'] = df['text'].apply(preprocess_text)

# View the first few rows
print(df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  airline_sentiment                                               text  \
0           neutral                @VirginAmerica What @dhepburn said.   
1          positive  @VirginAmerica plus you've added commercials t...   
2           neutral  @VirginAmerica I didn't today... Must mean I n...   
3          negative  @VirginAmerica it's really aggressive to blast...   
4          negative  @VirginAmerica and it's a really big bad thing...   

                                              tokens  
0                    [virginamerica, dhepburn, said]  
1  [virginamerica, plus, 've, added, commercials,...  
2  [virginamerica, n't, today, ..., must, mean, n...  
3  [virginamerica, 's, really, aggressive, blast,...  
4       [virginamerica, 's, really, big, bad, thing]  


In [28]:
pip install gensim



In [29]:
import gensim.downloader as api
word2vec_model = api.load("word2vec-google-news-300")

In [30]:
import numpy as np

def get_avg_word2vec(tokens, model, vector_size=300):
    valid_vectors = [model[word] for word in tokens if word in model]
    return np.mean(valid_vectors, axis=0) if valid_vectors else np.zeros(vector_size)

# Apply to entire dataset
df['embedding'] = df['tokens'].apply(lambda tokens: get_avg_word2vec(tokens, word2vec_model))


In [31]:
df = df[df['embedding'].apply(lambda x: isinstance(x, np.ndarray) and x.size > 0)]


In [38]:
# Step 1: Keep only necessary columns
df_cleaned = df[['text', 'airline_sentiment']].copy()

# Step 2: Encode sentiment labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df_cleaned['label'] = label_encoder.fit_transform(df_cleaned['airline_sentiment'])

# Step 3: Check if label was added
print(df_cleaned.columns)
print(df_cleaned.head())


Index(['text', 'airline_sentiment', 'label'], dtype='object')
                                                text airline_sentiment  label
0                @VirginAmerica What @dhepburn said.           neutral      1
1  @VirginAmerica plus you've added commercials t...          positive      2
2  @VirginAmerica I didn't today... Must mean I n...           neutral      1
3  @VirginAmerica it's really aggressive to blast...          negative      0
4  @VirginAmerica and it's a really big bad thing...          negative      0


In [44]:
from sklearn.preprocessing import LabelEncoder

X = np.stack(df['embedding'].values)  # shape (n_samples, 300)

# Encode 'airline_sentiment' labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['airline_sentiment'])  # Encodes string labels to integers

print("Class mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

Class mapping: {'negative': 0, 'neutral': 1, 'positive': 2}


In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [47]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)


In [48]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.7882513661202186
              precision    recall  f1-score   support

    negative       0.83      0.92      0.87      1889
     neutral       0.61      0.46      0.53       580
    positive       0.78      0.65      0.71       459

    accuracy                           0.79      2928
   macro avg       0.74      0.68      0.70      2928
weighted avg       0.78      0.79      0.78      2928



In [59]:
# Function to preprocess and vectorize a single message
def preprocess_and_vectorize(message, w2v_model, vector_size=300):
    stop_words = set(stopwords.words('english'))

    # Lowercase and tokenize
    tokens = word_tokenize(message.lower())

    # Remove stopwords and punctuation
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # Get average Word2Vec vector
    valid_vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if valid_vectors:
        return np.mean(valid_vectors, axis=0)
    else:
        return np.zeros(vector_size)

# Main prediction function
def predict_message_class(model, w2v_model, message):
    vec = preprocess_and_vectorize(message, w2v_model)
    vec = vec.reshape(1, -1)  # Reshape for sklearn input
    prediction = model.predict(vec)[0]

    # Map integer prediction to sentiment string
    if prediction == 2:
        return 'positive'
    elif prediction == 0:
        return 'negative'
    elif prediction ==1:
        return 'neutral'
    else:
        return 'unknown' # Handle unexpected predictions

In [74]:
predict_message_class(clf,word2vec_model, " worst experiance of my life")

'negative'