## Importing necessary libraries

In [1]:
# importing inltk and its dependencies
import torch
from inltk.inltk import tokenize
from inltk.inltk import get_embedding_vectors

# importing for data exploration and analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# classical model - random forest
from sklearn.ensemble import RandomForestClassifier

# deep learning model - cnn, lstm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv1D, MaxPooling1D,LSTM

# importing for training and test split
from sklearn.model_selection import train_test_split

# Tensorboard for visualisation
from tensorflow.keras.callbacks import TensorBoard
from time import time

## Reading Data

In [2]:
# Load the raw dataset from the working directory; the cells below read its
# 'marks' (label) and 'ans' (answer text) columns.
data = pd.read_csv("q1.csv")

## Preprocessing Data

In [3]:
%%time

# Clean the frame in one pass:
#   1. cast 'marks' to int, since the classifiers need integer class labels
#   2. discard rows whose 'ans' text is missing
#   3. renumber the index from 0 so it is contiguous again after the drop
data = (
    data
    .astype({'marks': int})
    .dropna(subset=['ans'])
    .reset_index(drop=True)
)

CPU times: user 6.79 ms, sys: 0 ns, total: 6.79 ms
Wall time: 6.68 ms


In [4]:
%%time

# Feature extraction - No. of words
# Count the tokens of every answer ('or' is the language code handed to
# tokenize) and keep the counts aligned with the rows of `data`.
no_of_words = pd.Series(
    [len(tokenize(answer, 'or')) for answer in data['ans']],
    index=data.index,
    dtype=int,
)

# DataFrame.insert refuses to add a column that already exists, so drop a
# stale copy first; this keeps the cell safe to re-run on the same kernel.
if "no_of_words" in data.columns:
    data = data.drop(columns="no_of_words")

# adding the created series into the dataframe at position 2
data.insert(2, "no_of_words", no_of_words)

pad_length = 124  # median no of words (hard-coded, not recomputed here)

CPU times: user 6.56 s, sys: 65 ms, total: 6.62 s
Wall time: 6.62 s


In [5]:
%%time

# Vectorising Answer Text
# One entry per row of `data`, stored under the same index label.
vectors = pd.Series([], dtype = float)

for row_index, answer in data['ans'].items():
    # embedding vectors for this answer ('or' is the language code)
    vectors[row_index] = get_embedding_vectors(answer, 'or')

# Bring every answer to exactly `pad_length` steps: shorter ones are padded
# at the end ("post"), longer ones are trimmed.
pad_vectors = keras.preprocessing.sequence.pad_sequences(
    vectors,
    maxlen=pad_length,
    dtype='float32',
    padding="post",
)



CPU times: user 13.4 s, sys: 3.24 s, total: 16.6 s
Wall time: 16.9 s


In [6]:
# checking if padding done correctly
# Print the position of every padded answer whose length is not pad_length.
# enumerate supplies the position directly: pad_sequences returns a NumPy
# array, and arrays have no list-style .index() method to look it up with.
for position, padded in enumerate(pad_vectors):
    if len(padded) != pad_length:
        print(position)

## Training and Testing

### Function for classical model - Random Forest Classifier

In [7]:
def train_rfc(x_train, x_test, y_train, y_test, n):
    """Fit a random forest on flattened inputs and return test accuracy in percent.

    Args:
        x_train, x_test: 3-D arrays; the last two axes are merged into one
            feature axis because the classifier accepts at most 2-D input.
        y_train, y_test: class labels for the corresponding inputs.
        n: not used by this function; kept so the signature matches the
            other train_* helpers.

    Returns:
        Mean accuracy on the test split, multiplied by 100.
    """
    # Collapse (samples, steps, features) into (samples, steps * features).
    train_count, train_steps, train_dims = x_train.shape
    flat_train = x_train.reshape((train_count, train_steps * train_dims))

    test_count, test_steps, test_dims = x_test.shape
    flat_test = x_test.reshape((test_count, test_steps * test_dims))

    forest = RandomForestClassifier(n_estimators=26, random_state=200)
    forest.fit(flat_train, y_train)
    return forest.score(flat_test, y_test) * 100

### Function for DL model - CNN

In [8]:
def train_cnn(x_train, x_test, y_train, y_test, n):
    """Train the 1-D convolutional classifier and return test accuracy in percent.

    Args:
        x_train, y_train: inputs and integer labels used for fitting.
        x_test, y_test: inputs and integer labels used for evaluation.
        n: only used to name the TensorBoard log directory for this run.

    Returns:
        The accuracy reported by ``evaluate`` on the test split, times 100.
    """
    # Input dropout, one convolution + pooling stage, then a dense head
    # ending in a 6-way softmax.
    network = keras.Sequential([
        Dropout(0.2),
        Conv1D(64, 4, activation='relu'),
        MaxPooling1D(pool_size=8),
        Flatten(),
        Dense(60, activation='relu'),
        Dense(6, activation='softmax'),
    ])

    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Each run logs to its own timestamped directory under logs/<n>/cnn/.
    board = TensorBoard(log_dir='logs/{}/cnn/{}'.format(n, time()))

    network.fit(x_train, y_train, epochs=10, verbose=1, callbacks=[board])

    # evaluate returns [loss, accuracy]; index 1 is the accuracy metric.
    scores = network.evaluate(x_test, y_test)
    return scores[1] * 100

### Function for DL model - LSTM

In [9]:
def train_lstm(x_train, x_test, y_train, y_test, n):
    """Train the Conv1D + LSTM classifier and return test accuracy in percent.

    Args:
        x_train: 3-D training input; axes 1 and 2 (steps, features) define
            the model's input shape.
        x_test: test input with the same per-sample shape as ``x_train``.
        y_train, y_test: integer class labels (the output layer has 6 units).
        n: only used to name the TensorBoard log directory for this run.

    Returns:
        The accuracy reported by ``evaluate`` on the test split, times 100.
    """
    # Take the per-sample shape from the data instead of a notebook global
    # and a hard-coded feature width, so the model always matches its input.
    seq_len, feature_dim = int(x_train.shape[1]), int(x_train.shape[2])

    model_lstm = keras.Sequential()

    # The filter count equals the sequence length, as in the original design.
    model_lstm.add(Conv1D(input_shape=(seq_len, feature_dim), filters=seq_len, kernel_size=40))
    model_lstm.add(LSTM(96, dropout=0.2))
    model_lstm.add(Dense(24, activation='sigmoid'))
    model_lstm.add(Dense(6, activation='softmax'))

    model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Create a TensorBoard instance with the path to the logs directory
    tensorboard = TensorBoard(log_dir='logs/{}/lstm/{}'.format(n, time()))

    model_lstm.fit(x_train, y_train, epochs=10, verbose=1, callbacks=[tensorboard])

    # evaluate returns [loss, accuracy]; index 1 is the accuracy metric.
    results = model_lstm.evaluate(x_test, y_test)
    return results[1] * 100

### Training

In [10]:
%%time

# Per-model accuracy (in percent), one entry per training fraction in loop order.
acc_rfc_list = []
acc_cnn_list = []
acc_lstm_list = []
# Maps training fraction n -> [rfc_acc, cnn_acc, lstm_acc].
acc = {}

# n is the training fraction: 0.2, 0.3, ..., 0.8 (rounded to avoid float noise in the keys).
for n in [round(_ * 0.1, 1) for _ in range(2, 9)]:
    print("Training on:",n,"\n")
    # splitting data into training and testing set; the test share is 1 - n
    # and the fixed random_state keeps the split repeatable
    x_train, x_test, y_train, y_test = train_test_split(pad_vectors, data['marks'], test_size=round(1-n,1), random_state=22)
    
    # calling rfc for training (works on the NumPy arrays directly)
    rfc_acc = train_rfc(x_train, x_test, y_train, y_test, n)
    acc_rfc_list.append(rfc_acc)
    acc[n] = [rfc_acc]
    
    # converting input to tensors for input into DL models
    # (inputs and labels are both converted as float32)
    x_train_dl = tf.convert_to_tensor(x_train, np.float32)
    y_train_dl = tf.convert_to_tensor(y_train, np.float32)

    x_test_dl = tf.convert_to_tensor(x_test, np.float32)
    y_test_dl = tf.convert_to_tensor(y_test, np.float32)
    
    print("CNN Training--------------------------------------------",n,"\n")    
    # calling cnn for training
    cnn_acc = train_cnn(x_train_dl, x_test_dl, y_train_dl, y_test_dl, n)
    acc_cnn_list.append(cnn_acc)
    acc[n].append(cnn_acc)
    
    print("\nLSTM Training--------------------------------------------",n,"\n")
    # calling lstm for training
    lstm_acc = train_lstm(x_train_dl, x_test_dl, y_train_dl, y_test_dl, n)
    acc_lstm_list.append(lstm_acc)
    acc[n].append(lstm_acc)
    
    # visual separator between training fractions in the cell output
    print("\n-------------------------------------------------------------------------------------------")
    print("-------------------------------------------------------------------------------------------")
    print("-------------------------------------------------------------------------------------------\n\n\n\n")

Training on: 0.2 

CNN Training-------------------------------------------- 0.2 

Train on 13 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

LSTM Training-------------------------------------------- 0.2 

Train on 13 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------




Training on: 0.3 

CNN Training-------------------------------------------- 0.3 

Train on 20 samples
Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

LSTM Training-------------------------------------------- 0.3 

Train on 20 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------




Training on: 0.4 

CNN Training-------------------------------------------- 0.4 

Train on 26 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



LSTM Training-------------------------------------------- 0.4 

Train on 26 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------




Training on: 0.5 

CNN Training-------------------------------------------- 0.5 

Train on 33 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

LSTM Training-------------------------------------------- 0.5 

Train on 33 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


Epoch 8/10
Epoch 9/10
Epoch 10/10

-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------




Training on: 0.6 

CNN Training-------------------------------------------- 0.6 

Train on 40 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

LSTM Training-------------------------------------------- 0.6 

Train on 40 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

LSTM Training-------------------------------------------- 0.7 

Train on 46 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------




Training on: 0.8 

CNN Training-------------------------------------------- 0.8 

Train on 53 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

LSTM Training-------------------------------------------- 0.8 

Train on 53 samples
Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------




CPU times: user 3min 38s, sys: 3.83 s, total: 3min 42s
Wall time: 1min 53s


In [11]:
# Test accuracy (%) of each model, one value per training fraction 0.2 ... 0.8
# in the order the training loop visited them.
print("RFC:",acc_rfc_list)
print("CNN:",acc_cnn_list)
print("LSTM:",acc_lstm_list)

RFC: [35.18518518518518, 44.680851063829785, 39.02439024390244, 52.94117647058824, 51.85185185185185, 57.14285714285714, 42.857142857142854]
CNN: [38.88888955116272, 48.93617033958435, 48.78048896789551, 50.0, 48.148149251937866, 57.14285969734192, 64.28571343421936]
LSTM: [44.44444477558136, 57.446807622909546, 51.21951103210449, 32.35294222831726, 48.148149251937866, 61.90476417541504, 71.42857313156128]


In [12]:
# training fraction -> [RFC, CNN, LSTM] test accuracy in percent
acc

{0.2: [35.18518518518518, 38.88888955116272, 44.44444477558136],
 0.3: [44.680851063829785, 48.93617033958435, 57.446807622909546],
 0.4: [39.02439024390244, 48.78048896789551, 51.21951103210449],
 0.5: [52.94117647058824, 50.0, 32.35294222831726],
 0.6: [51.85185185185185, 48.148149251937866, 48.148149251937866],
 0.7: [57.14285714285714, 57.14285969734192, 61.90476417541504],
 0.8: [42.857142857142854, 64.28571343421936, 71.42857313156128]}