# Josh Combining Predictions

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,\
RandomForestClassifier, BaggingClassifier, ExtraTreesRegressor, VotingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# Transformers
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Modeling Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, recall_score, f1_score
from sklearn.metrics import classification_report, accuracy_score

# Pipelines
from imblearn.pipeline import Pipeline, make_pipeline

import pickle
import sys

sys.path.append("../../src")

from model_evaluation import plot_confusion_matrix

pd.set_option('display.max_colwidth', None)
np.random.seed(111)

In [3]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dropout, Activation, Bidirectional
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import load_model

In [4]:
clean_df = pd.read_csv('../../data/targeted_combined.csv')
clean_df.dropna(inplace = True)
clean_df.head()

Unnamed: 0,emotion,txt_cleaned
0,0.0,product_target hr tweet riseaustin dead need upgrade plugin station sxsw
1,2.0,know awesome ipadiphone product_target youll likely appreciate design also theyre give free t sxsw
2,2.0,wait product_target also sale sxsw
3,0.0,hope year festival isnt crashy year iphone product_target sxsw
4,2.0,great stuff fri sxsw marissa mayer product_target tim oreilly tech booksconferences amp matt mullenweg wordpress


In [5]:
# divides data into X and y, and then turns the model target labels into numerical format

X = clean_df['txt_cleaned'].values.astype('U')
y = clean_df['emotion']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.25)


# Logistic Regression

In [6]:
cv = CountVectorizer(ngram_range = (1,2))

X_train_vec = cv.fit_transform(X_train)
X_test_vec = cv.transform(X_test)

In [9]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_vec, y_train)

LogisticRegression(max_iter=1000)

In [14]:
LR_file = open("../../src/final_model_targeted.pkl", "rb") # "rb" means "read as bytes"
LR = pickle.load(LR_file)
LR_file.close()

In [16]:
LR_predictions = LR.predict(X_test_vec)
accuracy_score(y_test, LR_predictions)

0.6597664543524416

## CNN

In [17]:
X_train_list = list(X_train)
X_test_list = list(X_test)


X_train_corpus = []
for tweet in X_train_list:
    X_train_corpus.extend(tweet.split(' '))
    
X_train_unique = len(set(X_train_corpus))

sequence_length = 100

full_tokenizer = text.Tokenizer(num_words=X_train_unique)

full_tokenizer.fit_on_texts(X_train_list)

X_train_tokenized= full_tokenizer.texts_to_sequences(X_train_list)
X_test_tokenized= full_tokenizer.texts_to_sequences(X_test_list)

X_train_CNN = sequence.pad_sequences(X_train_tokenized, maxlen=sequence_length)
X_test_CNN = sequence.pad_sequences(X_test_tokenized, maxlen=sequence_length)

In [18]:
CNN = load_model('CNN_final_model.h5')

In [19]:
CNN_predictions = np.argmax(CNN.predict(X_test_CNN), axis = 1)

## RNN

In [20]:
RNN = load_model('BiLSTMsmall64.h5')

In [21]:
RNN_predictions = np.argmax(RNN.predict(X_test_CNN), axis = 1)

## Concatenting df and predictions

In [23]:
tweets = pd.DataFrame(X_test, columns = ['Tweets'])
true_labels = y_test.reset_index(drop=True)
LR_predictions = pd.DataFrame(LR_predictions, columns = ['LR Predictions'])
CNN_predictions = pd.DataFrame(CNN_predictions, columns = ['CNN Predictions'])
RNN_predictions = pd.DataFrame(RNN_predictions, columns = ['RNN Predictions'])

compare_predictions = pd.concat([tweets, true_labels, LR_predictions, CNN_predictions, RNN_predictions], axis = 1)


In [29]:
y_pred = compare_predictions[['LR Predictions','CNN Predictions','RNN Predictions']].astype(int)

In [30]:
y_pred

Unnamed: 0,LR Predictions,CNN Predictions,RNN Predictions
0,0,2,2
1,2,2,1
2,2,2,0
3,0,1,1
4,1,1,2
...,...,...,...
5647,0,0,2
5648,1,1,2
5649,0,0,2
5650,0,0,2


In [32]:
vote_y_pred = y_pred.mode(axis=1)[0]

0       2.0
1       2.0
2       2.0
3       1.0
4       1.0
       ... 
5647    0.0
5648    1.0
5649    0.0
5650    0.0
5651    2.0
Name: 0, Length: 5652, dtype: float64

In [36]:
accuracy_score(compare_predictions['emotion'],vote_y_pred)

0.5189313517338995

In [None]:
compare_predictions.head(10)

In [None]:
compare_predictions.to_csv('../../data/compare_top_model_predictions.csv', index = False)

## Creating sample tweets for presentation

In [None]:
sample_tweets = pd.read_csv('../../data/judge-1377884607_tweet_product_company.csv',encoding = 'latin-1')
sample_tweets.dropna(inplace = True)
sample_tweets.head()

In [None]:
sample_X = sample_tweets['tweet_text']
sample_vec = cv.transform(sample_X)
sample_y = LR.predict(sample_vec)
pred_samples = pd.concat([sample_tweets.reset_index(drop = True), 
                          pd.DataFrame(sample_y).reset_index(drop = True)], axis = 1)
pred_samples.columns = ['tweets','product','true emotions','LR predicted emotions']
pred_samples.loc[pred_samples['LR predicted emotions'] == 2.0, 'LR predicted emotions'] = 'Positive emotion'
pred_samples.loc[pred_samples['LR predicted emotions'] == 1.0, 'LR predicted emotions'] = 'Neutral emotion'
pred_samples.loc[pred_samples['LR predicted emotions'] == 0.0, 'LR predicted emotions'] = 'Negative emotion'
pred_samples.head()

In [None]:
pred_samples[(pred_samples['LR predicted emotions'] == 'Positive emotion') & 
             (pred_samples['true emotions'] == 
              'Positive emotion')].head()

In [None]:
pred_samples[(pred_samples['LR predicted emotions'] == 'Negative emotion') & 
             (pred_samples['true emotions'] == 
              'Negative emotion')].head()

In [None]:
pred_samples[(pred_samples['LR predicted emotions'] == 'Positive emotion') & 
             (pred_samples['true emotions'] == 
              'Negative emotion')].tail()

In [None]:
correct1 = pd.DataFrame(pred_samples.iloc[1,:]).T
correct2 = pd.DataFrame(pred_samples.iloc[1143,:]).T
wrong1 = pd.DataFrame(pred_samples.iloc[3251,:]).T
export_samples = pd.concat([correct1,correct2,wrong1], axis = 0)
export_samples

In [None]:
export_samples.to_csv('../../data/presentation_sample_tweets.csv')