# Answer Evaluator using CNN

In [1]:
#import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from inltk.inltk import tokenize,get_embedding_vectors
import tensorflow as tf

In [2]:
#import csv file
dataset = pd.read_csv('q1.csv')
dataset= dataset[dataset['ans'].notna()]#ignore NAN values
dataset.reset_index(drop= True, inplace= True)  # reset dataset index after ignoring NAN values
print(dataset.head())

  img_id                                                ans  marks
0  IMG01  ଇଂଗରେଜକଂ ସାମରାଜ୍ୟ କୁ ସୁଦୃଢ କରିବା ପାଇଁ ୱେଲସେଲି ...    5.0
1  IMG02  ଇଂଗରେଜକଂ ପ୍ରଭାବିତ ରାଜ୍ୟ ବିସ୍ତାର ପାଇଁ ରାଜ୍ୟାଭିଳ...    5.0
2  IMG04  ସାମନ୍ତ ସନ୍ଧି ପ୍ରଥା ୱେଲସେଲି ପ୍ରଣୟନ କରିଥିଲେ|ଏହାର...    4.0
3  IMG05  ସାମନ୍ତ ସନ୍ଧି ଲର୍ଡ଼ ମାକଏଲେକଂ ଦ୍ୱାରା ପ୍ରଣୀତ ହୋଇଥ...    1.0
4  IMG06  ଭାରତରେ ଇଂଗ୍ରେଜ଼ ମାନଂକ ସାମ୍ରାଜ୍ୟ ବିସ୍ତାର ପାଇଁ ୱ...    5.0


In [3]:
dataset = dataset.astype({'marks' : int})
print(dataset.marks.unique())

[5 4 1 3 2 0]


## Vectorization and Padding

In [4]:
%%time

# creating a new series
vectors = []
    
for i in dataset['ans']:
    vectors.append(get_embedding_vectors(i, 'or'))     # getting embedding vectors



CPU times: user 15.9 s, sys: 2.88 s, total: 18.8 s
Wall time: 11.3 s


In [5]:
print(len(vectors))

67


In [6]:
print(len(vectors[0]))
print(len(vectors[1]))

178
211


In [7]:
#Padding
print("Length before padding",len(vectors[0])," ",len(vectors[1]))
vectors = tf.keras.preprocessing.sequence.pad_sequences(vectors, padding="post", maxlen=211, dtype='float32')
print("Length after padding",len(vectors[0])," ",len(vectors[1]))


Length before padding 178   211
Length after padding 211   211


## Training and Testing 

In [8]:
#classify training and test set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(vectors,dataset['marks'],test_size=0.15,random_state=42)

In [9]:
print(y_test)

36    2
16    3
4     5
9     2
45    5
40    4
61    4
5     5
64    2
12    5
25    0
Name: marks, dtype: int64


In [10]:
y_test = np.array(y_test)
y_train = np.array(y_train)

In [11]:
y_train

array([4, 4, 5, 4, ..., 1, 3, 3, 5])

## CNN 

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv1D, MaxPooling1D,LSTM

In [13]:
#Add Sequential
model = Sequential()
# !st convolutional layer
model.add(Dropout(0.2))
model.add(Conv1D(64, 4, activation='relu'))
# model.add(Conv1D(input_shape=(211,400), filters = 211, kernel_size = 40))
model.add(MaxPooling1D(pool_size=2))
# model.add(LSTM(180))
model.add(Flatten())
model.add(Dense(60,activation='relu'))
model.add(Dense(6,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10,batch_size=10)

Train on 56 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4a30165e10>

In [14]:
model.summary()
rel = model.evaluate(X_test,y_test)
print(rel[1]*100)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            multiple                  0         
_________________________________________________________________
conv1d (Conv1D)              multiple                  102464    
_________________________________________________________________
max_pooling1d (MaxPooling1D) multiple                  0         
_________________________________________________________________
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  399420    
_________________________________________________________________
dense_1 (Dense)              multiple                  366       
Total params: 502,250
Trainable params: 502,250
Non-trainable params: 0
__________________________________________________

In [15]:
result = model.predict(X_test)
# len(result[1])
print(result)

[[1.806335e-01 1.131919e-01 2.824092e-02 8.977705e-03 6.593186e-01 9.637419e-03]
 [1.850683e-03 1.447694e-02 1.141162e-03 4.597169e-01 3.722636e-01 1.505508e-01]
 [2.576575e-05 3.375008e-04 1.807620e-04 2.996749e-02 8.705543e-01 9.893417e-02]
 [2.698992e-01 3.320460e-01 2.102666e-01 8.336794e-02 3.093567e-02 7.348456e-02]
 ...
 [1.296175e-09 8.284001e-08 5.250368e-08 1.847554e-08 7.655748e-09 9.999999e-01]
 [7.656575e-10 1.090129e-07 1.493662e-07 3.301665e-08 4.053788e-08 9.999996e-01]
 [1.424053e-05 1.157832e-04 4.207866e-05 1.343860e-04 1.828126e-02 9.814122e-01]
 [4.667041e-01 4.216103e-01 3.580407e-02 2.453931e-02 2.585656e-02 2.548546e-02]]


In [16]:
final_res = []
for val in result:
    final_res.append(np.where(val == val.max()))
print(final_res)
print(y_test)

[(array([4]),), (array([3]),), (array([4]),), (array([1]),), (array([5]),), (array([4]),), (array([4]),), (array([5]),), (array([5]),), (array([5]),), (array([0]),)]
[2 3 5 2 5 4 4 5 2 5 0]
