# Elena's EDA notebook

In [251]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,\
RandomForestClassifier, BaggingClassifier, ExtraTreesRegressor, VotingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# Transformers
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Modeling Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, recall_score, f1_score
from sklearn.metrics import classification_report, accuracy_score

# Pipelines
from imblearn.pipeline import Pipeline, make_pipeline

import pickle
import sys

sys.path.append("../../src")

from model_evaluation import plot_confusion_matrix

# Never truncate long cell values, so complete tweet text shows up in displayed frames.
pd.options.display.max_colwidth = None
# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
np.random.seed(seed=111)

In [252]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dropout, Activation, Bidirectional
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import load_model

In [253]:
# Read the cleaned tweet data and discard every row that has a missing value in any column.
clean_df = pd.read_csv('../../data/targeted_combined.csv').dropna()
clean_df.head()

Unnamed: 0,emotion,txt_cleaned
0,0.0,product_target hr tweet riseaustin dead need upgrade plugin station sxsw
1,2.0,know awesome ipadiphone product_target youll likely appreciate design also theyre give free t sxsw
2,2.0,wait product_target also sale sxsw
3,0.0,hope year festival isnt crashy year iphone product_target sxsw
4,2.0,great stuff fri sxsw marissa mayer product_target tim oreilly tech booksconferences amp matt mullenweg wordpress


In [254]:
# Target: the `emotion` column exactly as stored in the CSV (no label encoding is applied here).
y = clean_df['emotion']
# Features: the cleaned tweet text, cast to a NumPy array of unicode strings.
X = clean_df['txt_cleaned'].values.astype('U')

# Hold out 25% of the rows for testing; the fixed random_state keeps the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


# Logistic Regression

In [257]:
# Bag-of-words counts over unigrams and bigrams. The vocabulary is learned from the
# training split only and then reused, unchanged, for the test split.
# NOTE(review): the pickled model loaded later is applied to these features, so it must have
# been trained with an identically fitted vectorizer -- confirm.
cv = CountVectorizer(ngram_range=(1, 2))

X_train_vec = cv.fit_transform(X_train)
X_test_vec = cv.transform(X_test)

In [258]:
# Load the pickled logistic-regression model.
# The context manager closes the file even when unpickling raises, which the manual
# open()/close() pair did not guarantee.
# NOTE(security): pickle.load can execute arbitrary code -- only load files from a trusted source.
with open("../../src/final_model_targeted.pkl", "rb") as LR_file:  # "rb" means "read as bytes"
    LR = pickle.load(LR_file)

In [259]:
# Predict on the vectorised held-out tweets and report plain accuracy against the true labels.
LR_predictions = LR.predict(X_test_vec)
accuracy_score(y_true=y_test, y_pred=LR_predictions)

0.6597664543524416

## CNN

In [260]:
# The Keras models consume fixed-length integer sequences rather than raw text.
X_train_list, X_test_list = list(X_train), list(X_test)

# Every space-separated token of the training tweets; used only to size the vocabulary.
X_train_corpus = [token for tweet in X_train_list for token in tweet.split(' ')]
X_train_unique = len(set(X_train_corpus))

# Length every tweet is padded (or cut) to before it is fed to the networks.
sequence_length = 100

# NOTE(review): Keras documents that Tokenizer keeps only the most common `num_words - 1`
# words -- confirm that this cap matches what the saved models were trained with.
full_tokenizer = text.Tokenizer(num_words=X_train_unique)
full_tokenizer.fit_on_texts(X_train_list)  # word index is built from the training tweets only

X_train_tokenized = full_tokenizer.texts_to_sequences(X_train_list)
X_test_tokenized = full_tokenizer.texts_to_sequences(X_test_list)

X_train_CNN = sequence.pad_sequences(X_train_tokenized, maxlen=sequence_length)
X_test_CNN = sequence.pad_sequences(X_test_tokenized, maxlen=sequence_length)

In [261]:
# Restore the saved CNN; the path is relative to the notebook's working directory.
CNN = load_model(filepath='CNN_final_model.h5')

In [262]:
# Take, for each test tweet, the index of the largest value along axis 1 of the model output.
# assumes the output is (n_tweets, n_classes) -- TODO confirm against the saved model
CNN_scores = CNN.predict(X_test_CNN)
CNN_predictions = CNN_scores.argmax(axis=1)

## RNN

In [263]:
# Restore the saved recurrent model; the path is relative to the notebook's working directory.
RNN = load_model(filepath='BiLSTMsmall64.h5')

In [264]:
# The RNN is fed the same padded sequences that were prepared for the CNN (X_test_CNN).
# As for the CNN, the predicted class is the index of the largest value along axis 1.
RNN_scores = RNN.predict(X_test_CNN)
RNN_predictions = RNN_scores.argmax(axis=1)

## Concatenating df and predictions

In [267]:
# Put each piece into a single-column, 0..n-1 indexed object so they line up row by row.
tweets = pd.DataFrame(X_test, columns=['Tweets'])
# y_test still carries the row labels from clean_df; drop them so it aligns with the others.
true_labels = y_test.reset_index(drop=True)
LR_predictions = pd.DataFrame(LR_predictions, columns=['LR Predictions'])
CNN_predictions = pd.DataFrame(CNN_predictions, columns=['CNN Predictions'])
RNN_predictions = pd.DataFrame(RNN_predictions, columns=['RNN Predictions'])

# Side by side: tweet text, true label, then one prediction column per model.
comparison_parts = [tweets, true_labels, LR_predictions, CNN_predictions, RNN_predictions]
compare_predictions = pd.concat(comparison_parts, axis='columns')


In [268]:
# Preview the first ten rows of the comparison table.
compare_predictions.head(n=10)

Unnamed: 0,Tweets,emotion,LR Predictions,CNN Predictions,RNN Predictions
0,thing would go festival,2.0,0.0,2,2
1,good stuff happen america englandsucks,2.0,2.0,2,1
2,dash accepts product_targetpay thanks make happen mobilepayments paywithdash httptcooxwmavcscn,2.0,2.0,2,0
3,pretty sure make zero sense product_target,0.0,0.0,1,1
4,who buy new product_targets anyway aapl,1.0,1.0,1,2
5,cat really worked product_target nad deflatedballs,1.0,0.0,0,2
6,please bit alec baldwin talk football bill belichick style peteschweddy product_target deflatedballs,0.0,1.0,2,2
7,whyd u,0.0,1.0,1,2
8,even begin product_target win sxsw,2.0,2.0,2,0
9,last month discovr ipad midem time discovr iphone sxsw climb product_target store chart,2.0,2.0,2,0


In [269]:
# Save the comparison table; the row index is left out of the file.
compare_predictions.to_csv(path_or_buf='../../data/compare_top_model_predictions.csv', index=False)

## Creating sample tweets for presentation

In [270]:
# Read the original tweet file (latin-1 encoded) and discard every row that has a
# missing value in any column.
sample_tweets = pd.read_csv(
    '../../data/judge-1377884607_tweet_product_company.csv',
    encoding='latin-1',
).dropna()
sample_tweets.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion


In [271]:
# Score the sample tweets with the logistic regression and attach readable emotion names.
# NOTE(review): `cv` was fitted on the pre-cleaned `txt_cleaned` text, while `tweet_text` here is
# the raw tweet -- confirm whether the same cleaning should be applied before vectorising.
sample_X = sample_tweets['tweet_text']
sample_vec = cv.transform(sample_X)
sample_y = LR.predict(sample_vec)

# Translate the numeric classes to names BEFORE they enter the frame. Writing strings into an
# existing float column through .loc relied on silent upcasting, which pandas 2.1 deprecated.
# A class value that is not in the mapping is kept unchanged, exactly as before.
emotion_names = {2.0: 'Positive emotion', 1.0: 'Neutral emotion', 0.0: 'Negative emotion'}
sample_labels = [emotion_names.get(label, label) for label in sample_y]

# Both pieces get a fresh 0..n-1 index so the predictions line up with their tweets.
pred_samples = pd.concat([sample_tweets.reset_index(drop = True),
                          pd.Series(sample_labels, name = 'LR predicted emotions')], axis = 1)
pred_samples.columns = ['tweets','product','true emotions','LR predicted emotions']
pred_samples.head()

Unnamed: 0,tweets,product,true emotions,LR predicted emotions
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion,Positive emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion,Positive emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion,Positive emotion


In [272]:
# Tweets labelled positive that the logistic regression also called positive.
lr_says_positive = pred_samples['LR predicted emotions'] == 'Positive emotion'
truth_is_positive = pred_samples['true emotions'] == 'Positive emotion'
pred_samples[lr_says_positive & truth_is_positive].head()

Unnamed: 0,tweets,product,true emotions,LR predicted emotions
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion,Positive emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion,Positive emotion
5,"#SXSW is just starting, #CTIA is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan",Android,Positive emotion,Positive emotion
6,Beautifully smart and simple idea RT @madebymany @thenextweb wrote about our #hollergram iPad app for #sxsw! http://bit.ly/ieaVOB,iPad or iPhone App,Positive emotion,Positive emotion


In [273]:
# Tweets labelled negative that the logistic regression also called negative.
lr_says_negative = pred_samples['LR predicted emotions'] == 'Negative emotion'
truth_is_negative = pred_samples['true emotions'] == 'Negative emotion'
pred_samples[lr_says_negative & truth_is_negative].head()

Unnamed: 0,tweets,product,true emotions,LR predicted emotions
39,attending @mention iPad design headaches #sxsw {link},iPad,Negative emotion,Negative emotion
266,"I thought I would use my iPad a lot during #SXSW, but I haven't even touched it once. Hmmzies.",iPad,Negative emotion,Negative emotion
288,the first iPad didn't even exist here last year and I already feel like I'm pulling out an antique everytime I use my iPad #sxsw #ipad2,iPad,Negative emotion,Negative emotion
333,"Google to launch product! Wait, no launch, but product exists. Wait, product does not exist! #sxsw {link}",Google,Negative emotion,Negative emotion
488,Do i know u? Why u selling to me! Weve never talked. I now Hate your product! RT @mention check out Heyo 4 iPhone #SXSW #SXSWi,iPhone,Negative emotion,Negative emotion


In [274]:
# Misses: tweets labelled negative that the logistic regression called positive (last five shown).
lr_says_positive = pred_samples['LR predicted emotions'] == 'Positive emotion'
truth_is_negative = pred_samples['true emotions'] == 'Negative emotion'
pred_samples[lr_says_positive & truth_is_negative].tail()

Unnamed: 0,tweets,product,true emotions,LR predicted emotions
3192,Dear #SXSW goer... Please look up from your fucking iPhone when walking the halls. Thanks Hipsters. Hilarious!,iPhone,Negative emotion,Positive emotion
3203,"If there was a popup store in Austin that sold nothing but iPhone battery extenders, it would make so much money. #sxsw",iPhone,Negative emotion,Positive emotion
3251,I think my effing hubby is in line for an #iPad 2. Can someone point him towards the line-up for wife number #2. #sxswi #sxsw,iPad,Negative emotion,Positive emotion
3261,I'm pretty sure the panelist that thinks &quot;Apple is drowning in their success&quot; is fucking insane. #SXSW,Apple,Negative emotion,Positive emotion
3288,"Diller says Google TV &quot;might be run over by the PlayStation and the Xbox, which are essentially ready today.&quot; #sxsw #diller",Other Google product or service,Negative emotion,Positive emotion


In [275]:
# Hand-picked presentation examples, selected by row position in pred_samples:
# two the model classified correctly (1 and 1143) and one it got wrong (3251).
# Selecting with a one-element list keeps each pick as a one-row frame with its column
# dtypes intact, instead of transposing a row Series (which turns every column into object).
correct1, correct2, wrong1 = (pred_samples.iloc[[position]] for position in (1, 1143, 3251))
export_samples = pd.concat([correct1, correct2, wrong1], axis = 0)
export_samples

Unnamed: 0,tweets,product,true emotions,LR predicted emotions
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,Positive emotion
1143,Oh no utter @mention #sxsw fail. Can't see the letter of the ballroom in the iPhone app as it is too long and doesn't wrap /cc @mention,iPad or iPhone App,Negative emotion,Negative emotion
3251,I think my effing hubby is in line for an #iPad 2. Can someone point him towards the line-up for wife number #2. #sxswi #sxsw,iPad,Negative emotion,Positive emotion


In [276]:
# Save the presentation examples. The index is written as well, so each row keeps the
# position it had in pred_samples.
export_samples.to_csv(path_or_buf='../../data/presentation_sample_tweets.csv')