In [1]:
import numpy as np
import pandas as pd

In [2]:
import keras

Using TensorFlow backend.


In [3]:
# Importing the data

In [4]:
# Tab-separated file. quoting=3 is csv.QUOTE_NONE, so quote characters inside the
# essays are kept as literal text instead of being interpreted as field quoting.
df = pd.read_csv('training_set_rel3.tsv', delimiter='\t', quoting=3, encoding='ISO-8859-1')

In [5]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"""Dear local newspaper, I think effects compute...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"""Dear @CAPS1 @CAPS2, I believe that using comp...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"""Dear, @CAPS1 @CAPS2 @CAPS3 More and more peop...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"""Dear Local Newspaper, @CAPS1 I have found tha...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"""Dear @LOCATION1, I know having computers has ...",4,4,,8,,,,...,,,,,,,,,,


In [6]:
# Drop every column that contains at least one missing value (axis=1 acts on columns).
# NOTE(review): this rebinds `df`, so the raw frame loaded above is no longer
# available after this cell has run.
df = df.dropna(axis=1)
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score
0,1,1,"""Dear local newspaper, I think effects compute...",4,4,8
1,2,1,"""Dear @CAPS1 @CAPS2, I believe that using comp...",5,4,9
2,3,1,"""Dear, @CAPS1 @CAPS2 @CAPS3 More and more peop...",4,3,7
3,4,1,"""Dear Local Newspaper, @CAPS1 I have found tha...",5,5,10
4,5,1,"""Dear @LOCATION1, I know having computers has ...",4,4,8


In [7]:
# Model input: the raw essay text column.
X = df['essay']
X.head()

0    "Dear local newspaper, I think effects compute...
1    "Dear @CAPS1 @CAPS2, I believe that using comp...
2    "Dear, @CAPS1 @CAPS2 @CAPS3 More and more peop...
3    "Dear Local Newspaper, @CAPS1 I have found tha...
4    "Dear @LOCATION1, I know having computers has ...
Name: essay, dtype: object

In [8]:
type(X)

pandas.core.series.Series

In [9]:
# Prediction target: the domain1_score column.
y = df['domain1_score']
y.head()

0     8
1     9
2     7
3    10
4     8
Name: domain1_score, dtype: int64

In [10]:
# Data Preprocessing

In [11]:
import re
import nltk

In [12]:
# NLTK resources: the stop-word list is used by preprocess() below;
# 'punkt' backs nltk.sent_tokenize / nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amans\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

In [14]:
# WordNet data is needed by WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amans\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
def preprocess(X):
    """Turn raw essays into lists of cleaned, lemmatized tokens.

    Each essay has every non-letter character replaced by a space, is
    lower-cased, split on whitespace, filtered against NLTK's English
    stop-word list and lemmatized with WordNet. The cleaned text is then run
    through ``nltk.sent_tokenize`` and ``nltk.word_tokenize``.

    Parameters
    ----------
    X : iterable of str
        Raw essay texts (for example a pandas Series or a list).

    Returns
    -------
    list of list of str
        One token list per essay, in input order. An essay with nothing left
        after cleaning yields an empty list.
    """
    # Built once: the stop-word set used to be rebuilt for every single word
    # and the lemmatizer for every essay.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    corpus = []
    # Iterate over the values rather than X[i], so a Series whose index is not
    # 0..n-1 (e.g. after filtering rows) is handled as well.
    for raw_essay in X:
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_essay)  # keep letters only
        tokens = letters_only.lower().split()
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        corpus.append(' '.join(tokens))

    word = []
    for text in corpus:
        # Punctuation has already been removed, so only the first sentence
        # returned by the tokenizer is used (as before).
        sentences = nltk.sent_tokenize(text)
        # Guard the [0] lookup: it raised IndexError when the tokenizer
        # returned no sentence (essay empty after cleaning).
        word.append(nltk.word_tokenize(sentences[0]) if sentences else [])

    return word

In [16]:
word = preprocess(X)

In [17]:
word[0]

['dear',
 'local',
 'newspaper',
 'think',
 'effect',
 'computer',
 'people',
 'great',
 'learning',
 'skill',
 'affect',
 'give',
 'u',
 'time',
 'chat',
 'friend',
 'new',
 'people',
 'help',
 'u',
 'learn',
 'globe',
 'astronomy',
 'keep',
 'u',
 'troble',
 'thing',
 'dont',
 'think',
 'would',
 'feel',
 'teenager',
 'always',
 'phone',
 'friend',
 'ever',
 'time',
 'chat',
 'friend',
 'buisness',
 'partner',
 'thing',
 'well',
 'new',
 'way',
 'chat',
 'computer',
 'plenty',
 'site',
 'internet',
 'organization',
 'organization',
 'cap',
 'facebook',
 'myspace',
 'ect',
 'think',
 'setting',
 'meeting',
 'bos',
 'computer',
 'teenager',
 'fun',
 'phone',
 'rushing',
 'get',
 'cause',
 'want',
 'use',
 'learn',
 'country',
 'state',
 'outside',
 'well',
 'computer',
 'internet',
 'new',
 'way',
 'learn',
 'going',
 'time',
 'might',
 'think',
 'child',
 'spends',
 'lot',
 'time',
 'computer',
 'ask',
 'question',
 'economy',
 'sea',
 'floor',
 'spreading',
 'even',
 'date',
 'surpri

In [18]:
# Training the word2vec model

In [20]:
from gensim.models import Word2Vec

In [21]:
# Dimensionality of the word2vec embeddings, and therefore of each averaged essay vector.
num_features = 150

In [22]:
# Train word2vec on the tokenized essays; min_count=5 ignores words seen fewer than 5 times.
# NOTE(review): `size` is the gensim 3.x name of the embedding-dimension argument
# (gensim 4 calls it `vector_size`) — confirm the pinned gensim version.
# NOTE(review): the embeddings are trained on all essays, before the train/test
# split further down, so they also see the essays later used for evaluation.
vec = Word2Vec(word, min_count=5, size=num_features)

In [23]:
vec.wv.vocab

{'dear': <gensim.models.keyedvectors.Vocab at 0x1bd02ab5688>,
 'local': <gensim.models.keyedvectors.Vocab at 0x1bd044d8948>,
 'newspaper': <gensim.models.keyedvectors.Vocab at 0x1bd044d8788>,
 'think': <gensim.models.keyedvectors.Vocab at 0x1bd044aad48>,
 'effect': <gensim.models.keyedvectors.Vocab at 0x1bd044b4a88>,
 'computer': <gensim.models.keyedvectors.Vocab at 0x1bd044b4488>,
 'people': <gensim.models.keyedvectors.Vocab at 0x1bd044b4e48>,
 'great': <gensim.models.keyedvectors.Vocab at 0x1bd044b4c88>,
 'learning': <gensim.models.keyedvectors.Vocab at 0x1bd044aae88>,
 'skill': <gensim.models.keyedvectors.Vocab at 0x1bd044b43c8>,
 'affect': <gensim.models.keyedvectors.Vocab at 0x1bd044b4648>,
 'give': <gensim.models.keyedvectors.Vocab at 0x1bd044b44c8>,
 'u': <gensim.models.keyedvectors.Vocab at 0x1bd044b4048>,
 'time': <gensim.models.keyedvectors.Vocab at 0x1bd044b4fc8>,
 'chat': <gensim.models.keyedvectors.Vocab at 0x1bd044b4708>,
 'friend': <gensim.models.keyedvectors.Vocab at 0x

In [24]:
# Look the vector up through the KeyedVectors object; indexing the Word2Vec
# model directly is deprecated and emits a DeprecationWarning.
vec.wv['dear']

  """Entry point for launching an IPython kernel.


array([ 0.05507124, -0.57404244, -2.1236284 ,  2.1481616 ,  1.1486664 ,
       -0.38436928, -1.1930691 ,  0.7938984 , -1.0582592 , -0.76002663,
        0.82205945, -0.7639695 , -0.91305566, -2.1619873 ,  2.0387795 ,
        1.3983226 , -0.32995537, -1.2885841 ,  1.6247355 , -3.1107125 ,
        1.2497758 ,  1.0077168 , -1.4004478 ,  0.5205935 ,  0.9477795 ,
       -1.2932314 , -0.5309005 , -2.4648468 ,  0.3570563 ,  2.3643563 ,
       -1.5367496 ,  0.2539195 , -1.3557298 , -0.14220917, -0.649685  ,
       -0.19502404, -1.3714997 , -1.2523214 ,  2.736435  , -1.3496289 ,
       -0.47780353,  0.8633374 , -0.2804938 ,  0.57324386, -0.5073209 ,
       -0.69269645, -0.377241  ,  1.1540724 ,  0.3934464 ,  0.333351  ,
        0.3155654 , -0.65767205, -0.52224433, -0.16823965, -0.41854784,
        0.97287184,  0.19560811,  0.10124855,  1.5187714 ,  0.9385061 ,
       -0.29709023,  0.48454416,  0.27507886,  0.4039023 ,  0.5495643 ,
        0.22860512,  0.57777137, -0.9883676 ,  0.6480119 ,  0.70

In [25]:
def makeFeatureVec(words, model, num_features):
    """Make Feature Vector from the words list of an Essay.

    Averages the word2vec vectors of every word in ``words`` that is present
    in the model's vocabulary.

    Parameters
    ----------
    words : iterable of str
        Tokens of one essay.
    model : object exposing ``wv``
        Trained word2vec model; ``model.wv`` must support ``word in model.wv``
        and ``model.wv[word]``.
    num_features : int
        Length of the word vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_features,)`` holding the mean vector. If none of
        the words is in the vocabulary, an all-zero vector is returned
        (previously this divided by zero and produced NaNs).
    """
    # Go through model.wv: indexing the model itself is deprecated, and the
    # membership test avoids rebuilding a set of the whole vocabulary per call.
    keyed_vectors = model.wv
    featureVec = np.zeros((num_features,))
    num_words = 0
    for word in words:
        if word in keyed_vectors:
            num_words += 1
            featureVec = np.add(featureVec, keyed_vectors[word])
    if num_words == 0:
        # Nothing to average; keep the zero vector instead of returning NaNs.
        return featureVec
    return np.divide(featureVec, num_words)

def getAvgFeatureVecs(essays, model, num_features):
    """Build the essay-by-feature matrix of averaged word vectors.

    Row ``i`` of the result is ``makeFeatureVec(essays[i], model, num_features)``;
    the returned array has shape ``(len(essays), num_features)``.
    """
    feature_matrix = np.zeros((len(essays), num_features))
    for row, tokens in enumerate(essays):
        feature_matrix[row] = makeFeatureVec(tokens, model, num_features)
    return feature_matrix

In [26]:
final_X = getAvgFeatureVecs(word, vec, num_features)

  if __name__ == '__main__':


In [27]:
final_X

array([[ 0.18242029,  0.33923656,  0.26098029, ..., -0.02968219,
        -0.07837768,  0.4356697 ],
       [ 0.05108483,  0.36870425,  0.07579173, ..., -0.21773472,
        -0.00979356,  0.41006652],
       [ 0.02194581,  0.18249852, -0.00491926, ..., -0.00802658,
        -0.06918276,  0.33210004],
       ...,
       [ 0.24565808,  0.31499432,  0.16860829, ...,  0.00145448,
        -0.02487484, -0.03660882],
       [ 0.14494165,  0.27025834,  0.25117121, ...,  0.03491832,
         0.01277465, -0.10691901],
       [ 0.12533526,  0.29602973,  0.3825591 , ...,  0.15060224,
        -0.15690097,  0.04885805]])

In [29]:
len(final_X), len(final_X[0])

(12978, 150)

In [30]:
final_X.shape

(12978, 150)

In [31]:
# Splitting into train and test set

In [32]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 20% of the essays. The fixed random_state makes the split, and every
# score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(final_X, y, test_size=0.2, random_state=42)

In [37]:
len(X_train), len(y_train)

(10382, 10382)

In [38]:
len(X_test), len(y_test)

(2596, 2596)

In [39]:
# Reshaping X

In [40]:
# Insert a length-1 "timesteps" axis: (samples, features) -> (samples, 1, features),
# the 3-D layout matching the LSTM's input_shape below.
# -1 lets NumPy infer the feature width, so re-running this cell on arrays that
# are already 3-D is a no-op instead of a reshape error.
X_train = X_train.reshape(X_train.shape[0], 1, -1)
X_test = X_test.reshape(X_test.shape[0], 1, -1)

In [41]:
X_train.shape, X_test.shape

((10382, 1, 150), (2596, 1, 150))

In [42]:
# Building the LSTM model

In [47]:
# Two stacked LSTM layers followed by dropout and a single-unit output for the score.
model = keras.models.Sequential([
    # Input is one timestep of num_features values (the averaged essay vector);
    # using num_features keeps the input width tied to the word2vec dimension.
    keras.layers.LSTM(150, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, num_features], return_sequences=True),
    keras.layers.LSTM(64, recurrent_dropout=0.4),
    keras.layers.Dropout(0.5),
    # ReLU keeps the predicted score non-negative.
    keras.layers.Dense(1,activation='relu')
])

In [48]:
# Score prediction is treated as regression: mean squared error is the loss,
# and mean absolute error is reported as an additional metric.
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])

In [49]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 1, 150)            180600    
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                55040     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 235,705
Trainable params: 235,705
Non-trainable params: 0
_________________________________________________________________


In [50]:
# NOTE(review): X_test / y_test serve as validation data here and are also the
# data the kappa score is computed on further down, so that score is not
# measured on data kept entirely out of the training loop.
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=15,batch_size=64)

Train on 10382 samples, validate on 2596 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x1bd11a8d948>

In [51]:
y_pred = model.predict(X_test)

In [52]:
y_pred

array([[2.006921 ],
       [2.5693684],
       [2.011354 ],
       ...,
       [1.8839948],
       [2.0064695],
       [8.25703  ]], dtype=float32)

In [53]:
y_test.shape, y_pred.shape

((2596,), (2596, 1))

In [54]:
# Checking kappa score

In [56]:
from sklearn.metrics import cohen_kappa_score

In [59]:
# Round the regression outputs to the nearest integer score and flatten
# (n, 1) -> (n,), so both arguments are 1-D integer label arrays.
y_pred_labels = np.rint(y_pred).astype(int).ravel()
result = cohen_kappa_score(y_test, y_pred_labels, weights='quadratic')
result

0.9595757968460387

In [60]:
# Saving word2vec model

In [15]:
from gensim.models import Word2Vec

In [61]:
# Write the word vectors in the binary word2vec format.
# NOTE(review): this presumably stores only the vectors, not the full trainable
# Word2Vec model — confirm that is enough for the consumer of this file.
vec.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

In [62]:
# Saving lstm model

In [63]:
# Persist the trained Keras model to a single .h5 file.
model.save('lstmmodel.h5')