In [1]:
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
from sklearn.manifold import TSNE
nltk.download('wordnet')
import string
from  nltk import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.collocations import *

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eduar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Cleaning the data:

In [2]:
# Load the raw tweet-sentiment CSV (decoded as latin1) and display it
data = pd.read_csv(
    'Data/Twitter_sentiment.csv',
    encoding='latin1',
)
data

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [3]:
# Swap the long survey-style headers for short column names
data = data.rename(
    columns={
        "tweet_text": "Tweet",
        "emotion_in_tweet_is_directed_at": "Subject_of_tweet",
        "is_there_an_emotion_directed_at_a_brand_or_product": "Emotion",
    }
)
data

Unnamed: 0,Tweet,Subject_of_tweet,Emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [4]:
# Count how many tweets carry each Emotion label (class balance check)
data['Emotion'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: Emotion, dtype: int64

In [5]:
# Replace the Emotion strings with numeric codes so the classifiers can use them
ord_enc = OrdinalEncoder().fit(data[["Emotion"]])
data['Emotion'] = ord_enc.transform(data[["Emotion"]])

In [6]:
# Cast the encoded Emotion codes to int64. This column is used as the target (y)
# further down; it is the Tweet column, not this one, that gets tokenized.
data['Emotion'] = data['Emotion'].astype('int64')

In [7]:
# Display the frame again, now with Emotion stored as integer codes
data

Unnamed: 0,Tweet,Subject_of_tweet,Emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,3
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,3
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,3
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,3
9089,"Wave, buzz... RT @mention We interrupt your re...",,2
9090,"Google's Zeiger, a physician never reported po...",,2
9091,Some Verizon iPhone customers complained their...,,2


In [8]:
# data.to_csv(r'Data/Twitter_sentiment_processed.csv', index = False)

In [9]:
# Emotion codes after encoding (matched against the value counts of the
# original string labels):
#   0 = I can't tell
#   1 = Negative emotion
#   2 = No emotion toward brand or product
#   3 = Positive emotion
data['Emotion'].value_counts()

2    5389
3    2978
1     570
0     156
Name: Emotion, dtype: int64

In [10]:
# Count missing values in each column
data.isna().sum()

Tweet                  1
Subject_of_tweet    5802
Emotion                0
dtype: int64

In [11]:
# Make every Tweet a string so it can be passed through the cleaning function.
# Missing tweets are replaced with '' first: astype('str') on its own would turn
# NaN into the literal text 'nan', which would then be tokenized like a real word.
data['Tweet'] = data['Tweet'].fillna('').astype('str')

## Train-Test Split:

In [12]:
# Features are the tweet text and the target is the encoded Emotion column;
# Subject_of_tweet is left out for the time being
X = data['Tweet']
y = data['Emotion']

In [13]:
# Hold out 20% of the tweets for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [14]:
# Preview the training tweets produced by the split
X_train

7678    For any friends coming to Austin for SXSW: {li...
4019    Survival kit provided to folks in line at the ...
3772    RT ÛÏ@mention Google to Launch Major New Soci...
7465    Attended preso on living simply #100tc at #sxs...
8220    deviantART buys 3 iPad 2's in Austin, tests Mu...
                              ...                        
5734    RT @mention For many of you that asked last ni...
5191    RT @mention &quot;So if you Google &quot;refri...
5390    RT @mention Android developers and friends: le...
860     So geeky! Love it! RT @mention Apple is openin...
7270    Anyone know of iPhone developers that will be ...
Name: Tweet, Length: 7274, dtype: object

## Tokenization:

In [15]:
# Tokens to discard: English stop words, every punctuation character, plus a few
# quote/ellipsis tokens and the 'link' placeholder
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``', 'link',
]

In [16]:
def process_tweet(tweet):
    """Tokenize a tweet, drop stop words, and lemmatize what remains.

    The tweet is split with nltk.word_tokenize. Each token is lower-cased,
    skipped if it appears in stopwords_list, and otherwise passed through
    WordNetLemmatizer.lemmatize.

    Returns the list of lower-cased, lemmatized tokens in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []
    for token in nltk.word_tokenize(tweet):
        lowered = token.lower()
        if lowered in stopwords_list:
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(lowered))
    return cleaned_tokens

In [17]:
# Run the cleaning function over every training tweet
processed_data = [process_tweet(tweet) for tweet in X_train]

In [18]:
# Spot-check the tokens produced for one training tweet
processed_data[2]

['rt',
 '\x89ûï',
 'mention',
 'google',
 'launch',
 'major',
 'new',
 'social',
 'network',
 'called',
 'circle',
 'possibly',
 'today',
 'sxsw\x89û\x9d']

In [19]:
# Vocabulary size: number of distinct tokens across all processed tweets
total_vocab = set().union(*processed_data)
len(total_vocab)

8910

In [20]:
# Flatten the processed tweets into one list of tokens for frequency counting
tweet_concat = [token for tweet in processed_data for token in tweet]

In [21]:
# Frequency distribution over all tokens; list the 200 most common
tweet_freqdist = FreqDist(tweet_concat)
tweet_freqdist.most_common(200)

[('sxsw', 7608),
 ('mention', 5703),
 ('rt', 2331),
 ('google', 2059),
 ('ipad', 1948),
 ('apple', 1839),
 ('quot', 1322),
 ('iphone', 1230),
 ('store', 1209),
 ("'s", 988),
 ('2', 915),
 ('new', 872),
 ('austin', 784),
 ('amp', 678),
 ('app', 656),
 ('launch', 562),
 ('circle', 533),
 ('social', 524),
 ('today', 467),
 ('android', 455),
 ('network', 380),
 ('ipad2', 372),
 ("n't", 365),
 ('line', 349),
 ('pop-up', 341),
 ('get', 338),
 ('via', 333),
 ('party', 315),
 ('free', 309),
 ('called', 293),
 ('sxswi', 278),
 ('mobile', 276),
 ('one', 252),
 ('major', 249),
 ('time', 240),
 ('like', 234),
 ("'m", 227),
 ('map', 222),
 ('day', 221),
 ("'re", 205),
 ('temporary', 203),
 ('u', 198),
 ('open', 198),
 ('opening', 196),
 ('win', 196),
 ('possibly', 192),
 ('downtown', 183),
 ('need', 182),
 ('go', 180),
 ('see', 180),
 ('great', 175),
 ('apps', 174),
 ('popup', 171),
 ('come', 171),
 ('check', 168),
 ('people', 168),
 ('going', 165),
 ('mayer', 164),
 ('know', 155),
 ('got', 154),
 

## TF-IDF:

In [22]:
# Create the TF-IDF vectorizer with default settings; it is fitted in the next cell.
# TfidfVectorizer is already imported in the first cell, so it is not re-imported here.
vectorizer = TfidfVectorizer()

In [23]:
# Fit the vectorizer on the raw training tweets (X_train) and transform them.
# NOTE(review): processed_data from process_tweet is not used here, so the custom
# stop-word list and lemmatization do not shape these features — confirm this is intended.
tf_idf_data_train = vectorizer.fit_transform(X_train)

In [24]:
# Vectorize X_test with the already-fitted vectorizer (transform only, no refit)
tf_idf_data_test = vectorizer.transform(X_test)

In [25]:
# Check the shape of the TF-IDF training matrix
tf_idf_data_train.shape

(7274, 8849)

In [26]:
# Gauge how sparse the TF-IDF training matrix is.
# Mean number of stored (non-zero) entries per row
non_zero_cols = tf_idf_data_train.nnz / tf_idf_data_train.shape[0]
print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

# Share of a row's columns that are zero, on average (a fraction between 0 and 1)
percent_sparse = 1 - non_zero_cols / tf_idf_data_train.shape[1]
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Articles: 16.261616717074514
Percentage of columns containing 0: 0.9981623215372274


## Baseline Model:

In [27]:
# Baseline classifiers to compare: a random forest and multinomial Naive Bayes.
# random_state is fixed (same seed as the train/test split) because the forest is
# randomized; without it the reported accuracies change from run to run.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
nb_classifier = MultinomialNB()

In [28]:
# Fit the random forest on the TF-IDF training matrix, then predict on both the
# training and test matrices so the two accuracies can be compared
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

In [29]:
# Accuracy of the random forest on the training and test sets
rf_train_score = accuracy_score(y_true=y_train, y_pred=rf_train_preds)
rf_test_score = accuracy_score(y_true=y_test, y_pred=rf_test_preds)

print('Random Forest')
print(
    "Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(
        rf_train_score,
        rf_test_score,
    )
)

Random Forest
Training Accuracy: 0.9962 		 Testing Accuracy: 0.6701


In [30]:
# Fit multinomial Naive Bayes on the TF-IDF training matrix, then predict on both
# the training and test matrices
nb_classifier.fit(tf_idf_data_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

In [31]:
# Accuracy of Naive Bayes on the training and test sets
nb_train_score = accuracy_score(y_true=y_train, y_pred=nb_train_preds)
nb_test_score = accuracy_score(y_true=y_test, y_pred=nb_test_preds)

print("Multinomial Naive Bayes")
print(
    "Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(
        nb_train_score,
        nb_test_score,
    )
)

Multinomial Naive Bayes
Training Accuracy: 0.7242 		 Testing Accuracy: 0.6427
