In [1]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the path to your CSV file in Google Drive
file_path = '/content/drive/My Drive/Tweets.csv' # Adjust the path if your file is not in the root of My Drive

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path,encoding='ISO-8859-1')

# Display the first few rows of the DataFrame to verify
print(df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_g

In [2]:
df.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [3]:
# required columns
df = df[['text' , 'airline_sentiment']]

df['text'] = df['text'].apply(lambda x: x.lower())

df.head()

Unnamed: 0,text,airline_sentiment
0,@virginamerica what @dhepburn said.,neutral
1,@virginamerica plus you've added commercials t...,positive
2,@virginamerica i didn't today... must mean i n...,neutral
3,@virginamerica it's really aggressive to blast...,negative
4,@virginamerica and it's a really big bad thing...,negative


In [4]:
!pip install contractions



In [5]:
import re, string, contractions
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [6]:
import nltk
nltk.download('punkt_tab')  # Tokenizer models
nltk.download('wordnet')  # Lemmas
nltk.download('averaged_perceptron_tagger_eng')  # POS tags
nltk.download('stopwords')  # Stop words


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
def preprocess_tweet(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", '', text)       # URLs
    text = re.sub(r"@\w+", '', text)                 # Mentions
    text = re.sub(r"#(\w+)", r"\1", text)            # Hashtags: keep the word
    text = contractions.fix(text)                    # Expand contractions
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.encode('ascii', 'ignore').decode('ascii')             # Remove emojis
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                  for word, tag in tagged if word not in stop_words]
    return ' '.join(lemmatized)

In [10]:
df['preprocess'] = df['text'].apply(lambda x: preprocess_tweet(x))

df = df[df['airline_sentiment'].isin(["positive", "negative", "neutral"])]  # drop unknowns

In [11]:
df['airline_sentiment'].isnull().sum()

0

In [12]:
!pip install gensim



In [13]:
import gensim.downloader as api

In [14]:
w2v_model = api.load("word2vec-google-news-300")



In [15]:
import numpy as np

In [16]:
def vectorize(tokens, model, size=300):
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        return np.zeros(size)
    return np.mean(vectors, axis=0)

In [17]:
df['vector'] = df['preprocess'].apply(lambda tokens: vectorize(tokens, w2v_model))
X = np.stack(df['vector'].values)
y = df['airline_sentiment']

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

model = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")



Test Accuracy: 0.6578


In [23]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    tokens = preprocess_tweet(tweet)
    vector = vectorize(tokens, w2v_model).reshape(1, -1)
    return model.predict(vector)[0]

example_tweet = "The flight was not good but the staff was really kind and helpful!"
print("Predicted Sentiment:", predict_tweet_sentiment(model, w2v_model, example_tweet))


Predicted Sentiment: negative
