In [2]:
import pandas as pd
import numpy as np


#### Defining a function to read the pre-trained word embeddings

In [10]:
def read_glove(file_path):
    with open(file_path,'r',encoding="utf8") as f:
        vocab=set()
        word_to_vec={}
        word_to_index={}
        index_to_word={}
        for line in f:
            line=line.strip().split()
            vocab.add(line[0])
            word_to_vec[line[0]]= np.array(line[1:],dtype=np.float64)
        index=0
        for w in sorted(vocab):
            word_to_index[w]=index
            index_to_word[index]=w
            index+=1
        return (word_to_vec,word_to_index,index_to_word)
            

In [11]:
embeddings_path="D:/Stanford_sentiment_tree/glove.6B.50d.txt"
word_to_vec,word_to_index,index_to_word=read_glove(embeddings_path)

#### Importing our training data

In [104]:
data_path="D:/Stanford_sentiment_tree/dictionary.txt"
data=pd.read_table(data_path,sep="|")
data.rename(columns={'!':"phrases",'0':"id"},inplace=True)


        

Unnamed: 0,phrases,id
0,! ',22935
1,! '',18235
2,! Alas,179257
3,! Brilliant,22936
4,! Brilliant !,40532


In [96]:
labels_path=data_path="D:/Stanford_sentiment_tree/sentiment_labels.txt"
labels=pd.read_table(labels_path,sep="|")
labels.head()
labels.set_index("phrase ids")

Unnamed: 0_level_0,sentiment values
phrase ids,Unnamed: 1_level_1
0,0.50000
1,0.50000
2,0.44444
3,0.50000
4,0.42708
...,...
239227,0.36111
239228,0.38889
239229,0.33333
239230,0.88889


In [105]:
data_labeled=pd.merge(left=data,right=labels,left_on="id",right_on="phrase ids")

In [109]:
data_labeled.head()

Unnamed: 0,phrases,id,phrase ids,sentiment values
0,! ',22935,22935,0.52778
1,! '',18235,18235,0.5
2,! Alas,179257,179257,0.44444
3,! Brilliant,22936,22936,0.86111
4,! Brilliant !,40532,40532,0.93056


In [113]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data_labeled["phrases"].values, data_labeled["sentiment values"].values, test_size=0.2, shuffle=True)

In [148]:
def sentences_to_indices(sentences,word_to_index,maxlen):
    indices=np.zeros(shape=(sentences.shape[0],maxlen),dtype=np.int64)
    vocab=word_to_index.keys()
    for i in range(len(sentences)):
        words= sentences[i].lower().split()
        j=0
        for w in words: 
            if (w in vocab):
                indices[i,j]=word_to_index[w]
                #print(word_to_index[w])
                j+=1
    return(indices)

In [149]:
sentences_to_indices(X_train[0:5],word_to_index,40)

array([[390138, 264549, 222483,  71089, 336113, 345296,  60664, 360914,
        160417,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0],
       [151348, 193918, 132927,    451,  45130,  97034,    451,  54717,
        151348, 357265, 111388, 357964, 357211, 204678, 280943, 163744,
        188480, 358159, 111388, 222137,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0],
       [   157,  51873,    323,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0

#### Method that returns a pre-trained embedding layer

In [150]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [156]:
def pretrained_embedding(word_to_vec,word_to_index):
    vocab_len=len(word_to_index)+1
    vec_size=word_to_vec["i"].shape[0]
    emb_matrix=np.zeros(shape=(vocab_len,vec_size))
    for w,idx in word_to_index.items():
        emb_matrix[idx,:]=word_to_vec[w]
    embedding_layer = Embedding(input_dim=vocab_len,output_dim=vec_size,trainable=False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    return (embedding_layer)

#### Let's build the model

In [238]:
def sentiment_analyzer(word_to_vec,word_to_index,maxlen):
    sentence_indices=Input(shape=(maxlen,),dtype=np.int64)
    embedding_layer=pretrained_embedding(word_to_vec,word_to_index)
    X=embedding_layer(sentence_indices)
    X=LSTM(units=128,return_sequences=True)(X)
    X=Dropout(0.5)(X)
    X=LSTM(units=128,return_sequences=False)(X)
    X=Dropout(0.5)(X)
    X=Dense(1)(X)
    X=Activation("sigmoid")(X)
    model=Model(inputs=sentence_indices,outputs=X)
    return (model)

In [242]:
model=sentiment_analyzer(word_to_vec,word_to_index,60)

In [255]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 60)                0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 60, 50)            20000050  
_________________________________________________________________
lstm_7 (LSTM)                (None, 60, 128)           91648     
_________________________________________________________________
dropout_8 (Dropout)          (None, 60, 128)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 129       
__________

In [188]:
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint

In [256]:
checkpoint_path="D:/Stanford_sentiment_tree/best_model"
callback_checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)

callback_early_stopping = EarlyStopping(monitor='val_loss',
                                        patience=2, verbose=1)
callbacks = [callback_early_stopping,
             callback_checkpoint]


In [257]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

In [254]:
X_train_indices=sentences_to_indices(X_train,word_to_index,60)
X_test_indices=sentences_to_indices(X_test,word_to_index,60)

In [281]:
model.fit(x=X_train_indices,y=Y_train,batch_size=32,epochs=100,callbacks=callbacks,verbose=1,
          validation_data=(X_test_indices, Y_test))
                

Train on 191384 samples, validate on 47847 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.03064, saving model to D:/Stanford_sentiment_tree/best_model
Epoch 2/100

Epoch 00002: val_loss improved from 0.03064 to 0.03064, saving model to D:/Stanford_sentiment_tree/best_model
Epoch 3/100

Epoch 00003: val_loss improved from 0.03064 to 0.03064, saving model to D:/Stanford_sentiment_tree/best_model
Epoch 4/100

Epoch 00004: val_loss improved from 0.03064 to 0.01799, saving model to D:/Stanford_sentiment_tree/best_model
Epoch 5/100

Epoch 00005: val_loss improved from 0.01799 to 0.01508, saving model to D:/Stanford_sentiment_tree/best_model
Epoch 6/100

Epoch 00006: val_loss improved from 0.01508 to 0.01408, saving model to D:/Stanford_sentiment_tree/best_model
Epoch 7/100

Epoch 00007: val_loss improved from 0.01408 to 0.01374, saving model to D:/Stanford_sentiment_tree/best_model
Epoch 8/100

Epoch 00008: val_loss improved from 0.01374 to 0.01253, saving model to D:/Stan

<keras.callbacks.History at 0x1909cfc0cc0>

In [310]:
def predict_sentence(sentence,word_to_index,maxlen,model):
    sentence_to_array=np.array(sentence)
    sentence_to_array=sentence_to_array.reshape(1)
    sentence_to_array=sentences_to_indices(sentence_to_array,word_to_index,maxlen)
    res=model.predict(sentence_to_array)
    if (0<=res and res<=0.2):
        sentiment="very negative"
    elif((0.2<res and res<=0.4)):
        sentiment="negative"
    elif((0.4<res and res<=0.6)):
        sentiment="neutral"
    elif((0.6<res and res<=0.8)):
        sentiment="positive"
    else :
        sentiment="very positive"
    print(sentiment)
                
    

In [316]:
predict_sentence("i love being here",word_to_index,60,model)

positive
