In [92]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

import re
from nltk import pos_tag
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer 

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(context = 'notebook', style = 'whitegrid')
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows',50)

## Omg, I love the coding style here, so clean, so readable.

## Let's split this notebook into EDA and Feature engineering, for clarity sake.

### Feature Engineering choices:
1. combine all apple products into one (Is this legit for business question?)
2. drop "I can't tell" (only lose a few rows, apparently)

In [93]:
df = pd.read_csv('../../data/judge-1377884607_tweet_product_company.csv', encoding = 'latin1')
df.columns = ['text', 'target', 'emotion']

df = df[df['emotion'] != 'I can\'t tell'] #engineering choice
df['target'].replace(['iPad', 'iPad or iPhone App', 'iPhone', 'Other Apple product or service'], 
                     'Apple', inplace = True) #engineering choice
df['target'].replace(['Other Google product or service'], 'Google', inplace = True)
df['target'].replace(['Android App'], 'Android', inplace = True)

df['target'].fillna('No Target', inplace = True)
df.dropna(inplace = True)


We are getting a lot of purely numeric, combination letter and numeric, and words starting with funky "ui" and "uo" beginnings, how can we filter those as well?

In [94]:
sw = stopwords.words('english')
sw.extend(['link', 'rt', 'sxsw'])
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~“!'
twitter_re = re.compile('[#@][a-zA-Z]*')
num_re = re.compile('^\d{1}$')

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def txt_clean(txt, stop_words=sw):
    t = txt.split(' ')
    t = [w.lower() for w in t]
    t = [w.translate(w.maketrans('','', punctuation)) for w in t]
    t = [w for w in t if not twitter_re.match(w)]
    t = [w for w in t if not num_re.match(w)]
    t = [w for w in t if w not in stop_words]
    t = [w for w in t if w]
    t = pos_tag(t)
    t = [(w[0], get_wordnet_pos(w[1])) for w in t]
    lem = WordNetLemmatizer()
    t = [lem.lemmatize(w[0], w[1]) for w in t]
    return ' '.join(t)

df['txt_cleaned'] = df['text'].map(txt_clean)

# Data Preprocessing 

In [113]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from sklearn.preprocessing import LabelBinarizer

In [122]:
X = df['txt_cleaned']
y = df['emotion']

binarizer = LabelBinarizer()
y = pd.DataFrame(binarizer.fit_transform(y), columns = binarizer.classes_)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.25)

In [123]:
sequence_len = 128

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(X_train))
list_tokenized_headlines = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(list_tokenized_headlines, maxlen=100)

tokenized_X_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(tokenized_X_test, maxlen=128)

X_train

array([[   0,    0,    0, ...,   31,    1,   23],
       [   0,    0,    0, ...,  184,  261,  697],
       [   0,    0,    0, ...,  617, 1063,  413],
       ...,
       [   0,    0,    0, ...,  117,  458, 3605],
       [   0,    0,    0, ...,   99,  145,   40],
       [   0,    0,    0, ...,    8, 2497,    3]])

In [124]:
X_test

array([[   0,    0,    0, ...,  858, 1622, 4601],
       [   0,    0,    0, ...,   86,  487, 3630],
       [   0,    0,    0, ...,   18, 1964,   14],
       ...,
       [   0,    0,    0, ...,   28,    1,   15],
       [   0,    0,    0, ...,    2,    9,    7],
       [   0,    0,    0, ...,   10,  618, 7072]])

In [133]:
y_train

Unnamed: 0,Negative emotion,No emotion toward brand or product,Positive emotion
3299,0,0,1
3145,0,1,0
4379,0,0,1
2273,0,0,1
6192,0,0,1
...,...,...,...
5734,0,1,0
5191,0,0,1
5390,0,1,0
860,0,1,0


In [146]:
embedding_size = 128
model = Sequential()
model.add(Embedding(len(tokenizer.word_index.keys())+1, sequence_len))
model.add(LSTM(25, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='sigmoid'))

model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['acc'])

In [147]:
model.fit(X_train, y_train, epochs = 1, batch_size = 50)



<tensorflow.python.keras.callbacks.History at 0x247c0270490>