In [6]:
import json
import io
import numpy as np
import pandas as pd


from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Input, GlobalMaxPooling1D, Dropout, concatenate, LSTM
from keras.models import Model
from keras import optimizers
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [9]:
MAX_SEQ_LENGTH = 20
EMBEDDING_DIM = 300

In [2]:
def load_vec(emb_path, nmax=1000000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id


In [3]:
def process_data(df):
    dff = df[:300000]
    X = [text_to_word_sequence(rev) for rev in dff['text'].tolist()]
    y = np.array(dff['tone'])
    return X, y

In [4]:
def load_reviews(pathes):
    reviews = []
    for path in pathes:
        with open(path) as f:
            rev = ""
            for line in f.readlines():
                rev += line
               
                if line.strip().endswith("}"):

                    rev = rev.replace("'", "")
                    reviews.append(json.loads(rev))

                    rev = ""
    df = pd.DataFrame(reviews)
    return df

In [7]:
embd, id2w, w2id = load_vec('data/wiki.multi.en.vec')

#load eng data
eng_df = load_reviews(['data/xag.json'])
mapper = {3: 1, 1: -1, 2:-1, 4:1, 5:1}
eng_df["tone"] = eng_df["stars"].apply(lambda x: mapper[x])
eng_df.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,tone
0,0SCHQdFVdfnF8AN_xjsozA,0,2016-10-08,0,PqxpFCnDGxCvueU-MtJ0dw,5,I absolutely love this place! I used to come h...,0,jp1xSiy-DD1BX2btSI8d1A,1
1,eJKnymd0BywNPrJw1IuXVw,0,2015-02-23,0,bGV8N7DUW5yPL2UURkYXxQ,5,Open late. GREAT bar food and excellent mixed ...,0,62AIXTmXC8IOSoedJCSj4Q,1
2,-BmqghX1sv7sgsxOIS2yAg,0,2015-02-23,0,HurvJRTuZ_Xh32oSpCcdOA,5,"I am a big fan of Ethiopian food, if its done ...",4,62AIXTmXC8IOSoedJCSj4Q,1
3,U5U3c6fUEfrPt-kLqiwbKQ,0,2015-03-06,0,Y6YE0Deui30HQgvG19BgmA,5,"I needed a last minute cut before an event, an...",0,62AIXTmXC8IOSoedJCSj4Q,1
4,ID5wWJ9C7G0hfbQilvlfxA,0,2015-02-23,0,55RFjcugTHIeWKrLUBwwwA,5,This place is a gem downtown. The decor is nic...,0,62AIXTmXC8IOSoedJCSj4Q,1


In [10]:
processed_rev, y = process_data(eng_df)
print(len(processed_rev))
X = []
for rev in processed_rev:
    emdb_rev = []
    for w in rev:
        try:
            emdb_rev.append(w2id[w])
        except KeyError:
            continue
    X.append(emdb_rev)
X = pad_sequences(X, maxlen=MAX_SEQ_LENGTH)

261687


In [8]:
y.shape

(261687,)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
X_train.shape

(196265, 20)

In [10]:
sequence_input = Input(shape=(MAX_SEQ_LENGTH, ), dtype='int32')
embedding_layer = Embedding(len(embd), EMBEDDING_DIM, weights=[embd],
                                               input_length=MAX_SEQ_LENGTH, trainable=False)

x = embedding_layer(sequence_input)
x = Dropout(0.3)(x)
x = Conv1D(200, 5, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = LSTM(100)(x)
x = Dropout(0.3)(x)
# x = Dense(200, activation='relu')(x)
prob = Dense(1, activation='sigmoid')(x)

model_1 = Model(sequence_input, prob)
optimizer = optimizers.Adam(lr=0.0004)

In [11]:
model_1.compile(loss='binary_crossentropy',optimizer=optimizer, metrics=['accuracy'])

print(model_1.summary()) 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 20)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 20, 300)           60000000  
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 300)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 16, 200)           300200    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 8, 200)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
__________

In [12]:
model_1.fit(X_train, y_train, epochs=25, batch_size=128) #train only on english data

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f7c20fc3390>

In [13]:
model_1.save('lstm_cnn_1.h5')

In [12]:
from keras.models import load_model
model1 = load_model('lstm_cnn.h5')
model1.evaluate(X_test, y_test)



[-1.6273707234858739, 0.6778606584977289]

In [18]:
from sklearn.metrics import classification_report, accuracy_score
pred = model1.predict(X_test)
pred_classes = []
for p in pred:
    if p > 0.5:
        pred_classes.append(1)
    else:
        pred_classes.append(-1)
                                  
        
classification_report(pred_classes, y_test)

'             precision    recall  f1-score   support\n\n         -1       0.84      0.65      0.73     18766\n          1       0.87      0.95      0.91     46656\n\navg / total       0.86      0.86      0.86     65422\n'

In [17]:
accuracy_score(y_test, pred_classes)

0.8632111522117942