In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import string
import numpy as np
import re
import gensim.downloader as api

In [2]:
# Read only the two columns the notebook uses: the sentiment label and the
# raw tweet text.
data = pd.read_csv(
    'Tweets.csv',
    usecols=['airline_sentiment', 'text'],
)
data

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


#### Preprocessing

In [3]:
# Lowercase contraction -> expansion. Keys are written with the ASCII
# apostrophe; matching also accepts the typographic one (see below).
contraction_map = {
    "won't": "will not",
    "can't": "can not",
    "i'm": "i am",
    "ain't": "is not",
    "don't": "do not",
    "didn't": "did not",
    "it's": "it is",
    "i've": "i have",
    "you're": "you are",
    "they're": "they are",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "doesn't": "does not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "you've": "you have"
}

# Character class standing in for the apostrophe of each key: ASCII ' or the
# typographic apostrophe U+2019 that phones and word processors insert.
_APOSTROPHE_CLASS = "['\u2019]"

# Compiled once instead of on every call. It is derived from contraction_map,
# so re-run this cell after editing the map.
_contraction_pattern = re.compile(
    r'\b('
    + '|'.join(
        _APOSTROPHE_CLASS.join(re.escape(part) for part in key.split("'"))
        for key in contraction_map
    )
    + r')\b'
)


def expand_contractions(tweet):
    """Replace every known contraction in ``tweet`` with its expansion.

    Matching is case-sensitive and the keys of ``contraction_map`` are
    lowercase, so the text should be lowercased first. Contractions written
    with either the ASCII apostrophe or the typographic apostrophe (U+2019)
    are expanded; all other text, including other uses of either apostrophe,
    is returned unchanged.

    Args:
        tweet: Text to process.

    Returns:
        str: ``tweet`` with each matched contraction replaced.
    """
    return _contraction_pattern.sub(
        # Normalise the apostrophe so the lookup hits the ASCII-keyed map.
        lambda match: contraction_map[match.group().replace('\u2019', "'")],
        tweet,
    )

def preprocess(tweet):
    """Normalise a raw tweet and return its lemmatized tokens.

    Steps, in order: lowercase; remove URLs, @mentions and #hashtags (the
    hashtag word is dropped together with the '#'); expand known
    contractions; delete every character that is neither a word character
    nor whitespace; delete characters in U+10000-U+10FFFF; split on
    whitespace; drop tokens made only of punctuation; lemmatize each token
    with ``WordNetLemmatizer``.

    Args:
        tweet: Raw tweet text. A non-string value (for example the NaN that
            pandas uses for a missing cell) is treated as an empty tweet.

    Returns:
        list[str]: The processed tokens; empty when nothing survives.
    """
    # Missing or non-text cells would otherwise fail on .lower().
    if not isinstance(tweet, str):
        return []

    tweet = tweet.lower()
    # Remove URLs ('http\S+' already covers https links)
    tweet = re.sub(r'http\S+|www\S+','',tweet)
    # Remove mentions
    tweet = re.sub(r'@\w+','',tweet)
    # Remove hashtags, including the tag text
    tweet = re.sub(r'#\w*','',tweet)
    # Expand contractions while their apostrophes are still present
    tweet = expand_contractions(tweet)
    # Remove everything except word characters (letters, digits, '_') and whitespace.
    # NOTE(review): characters are deleted, not replaced by a space, so text
    # such as "late...again" becomes the single token "lateagain".
    tweet = re.sub(r'[^\w\s]','',tweet)
    # Remove remaining astral-plane characters (the emoji range)
    tweet = re.sub(r'[\U00010000-\U0010ffff]','',tweet)
    # Tokenize on whitespace
    words = tweet.split()
    # Drop tokens consisting solely of punctuation. After the substitution
    # above only '_' can remain, so this removes '_', '__', ... tokens; a
    # per-character test is used because `word in string.punctuation` is a
    # substring test and would let '__' through.
    tokens = [
        word for word in words
        if not all(char in string.punctuation for char in word)
    ]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

# Show one raw tweet next to its processed token list.
sample_tweet = data.iloc[1,1]
print(sample_tweet)
preprocess(sample_tweet)

@VirginAmerica plus you've added commercials to the experience... tacky.


['plus',
 'you',
 'have',
 'added',
 'commercial',
 'to',
 'the',
 'experience',
 'tacky']

#### Loading Word2Vec Model

In [4]:
# Load the 'word2vec-google-news-300' vectors through gensim.downloader; the
# result is the embedding lookup used by every later cell.
w2v_model = api.load('word2vec-google-news-300')

In [5]:
def avgWord2Vec(tweet,model):
    """Return the mean embedding of the tokens in ``tweet`` known to ``model``.

    Args:
        tweet: Iterable of tokens.
        model: Embedding lookup supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of the known tokens,
        or a zero vector of length ``model.vector_size`` when no token is
        known (including an empty ``tweet``).
    """
    known_vectors = []
    for token in tweet:
        # Out-of-vocabulary tokens are skipped.
        if token in model:
            known_vectors.append(model[token])

    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors,axis=0)
    
# Tokenize every tweet once. The text column is addressed by name so the
# result does not depend on the column order of the CSV.
tweets = [preprocess(text) for text in data['text']]

# One row per tweet, one column per embedding dimension. The width comes from
# the loaded model rather than a hard-coded 300, so a different embedding
# model can be used without editing this cell.
X = np.zeros((len(tweets), w2v_model.vector_size))
for row, tokens in enumerate(tweets):
    X[row] = avgWord2Vec(tokens, w2v_model)

Y = data['airline_sentiment']
print('\n=== Feature Vector ===\n')
print(X)
print('\n=== Labels ===\n')
print(Y)


=== Feature Vector ===

[[ 0.0652771  -0.025177    0.15722656 ...  0.10083008  0.14013672
  -0.16064453]
 [ 0.02523804 -0.00747681 -0.0210495  ... -0.10122681 -0.00774956
  -0.02825165]
 [-0.01674028  0.03279252  0.05932617 ... -0.07518422  0.01525879
  -0.07044566]
 ...
 [-0.01654053 -0.00634766  0.08666992 ...  0.00134277  0.0660553
  -0.15252686]
 [ 0.01665982 -0.01029275  0.04662392 ... -0.07168579 -0.01131439
  -0.03199317]
 [ 0.03410181  0.00179948  0.04938612 ... -0.04442938  0.02539852
  -0.04221081]]

=== Labels ===

0         neutral
1        positive
2         neutral
3        negative
4        negative
           ...   
14635    positive
14636    negative
14637     neutral
14638    negative
14639     neutral
Name: airline_sentiment, Length: 14640, dtype: object


#### Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

#### Training & Accuracy of the Model

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic-regression classifier with default settings on the training
# features, then score its predictions on the held-out rows.
classifier_model = LogisticRegression()
classifier_model.fit(X_train, Y_train)

Y_Pred = classifier_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_Pred)
print("Accuracy of the Model : ", test_accuracy)

Accuracy of the Model :  0.7858606557377049


In [8]:
def predict_tweet_class(model,w2v_model,tweet):
    """Predict the class label of a single raw tweet.

    The tweet goes through the same ``preprocess`` and ``avgWord2Vec`` steps
    used to build the training features.

    Args:
        model: Fitted classifier exposing ``predict``.
        w2v_model: Embedding lookup passed on to ``avgWord2Vec``.
        tweet: Raw tweet text.

    Returns:
        The first (and only) element of ``model.predict`` for this tweet.
    """
    tokens = preprocess(tweet)
    features = avgWord2Vec(tokens, w2v_model)
    # predict() is given a 2-D array holding one sample.
    prediction = model.predict(features.reshape(1, -1))
    return prediction[0]

#### Testing on New Inputs

In [9]:
# Example with upbeat wording, emoji and hashtags.
tweet1 = "Just landed my dream job! So grateful for this opportunity. 💼✨ #blessed #careergoals"
predict_tweet_class(
    model=classifier_model,
    w2v_model=w2v_model,
    tweet=tweet1,
)

'positive'

In [10]:
# Example of a complaint.
tweet2 = "Customer service was absolutely terrible. Wasted an hour and got no help. 😡 #disappointed"
predict_tweet_class(
    model=classifier_model,
    w2v_model=w2v_model,
    tweet=tweet2,
)

'negative'

In [11]:
# Example of a plain factual statement.
tweet3 = "Today I arrived at New York City !!"
predict_tweet_class(
    model=classifier_model,
    w2v_model=w2v_model,
    tweet=tweet3,
)

'neutral'