<a href="https://colab.research.google.com/github/Aayush360/Natural_langauge_processing/blob/main/Question_Classifier_using_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# notebook imports

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [9]:
# Open the raw labelled question files. They are only ever read (a later cell
# calls .readlines() on them), so plain 'r' is used: 'r+' would also demand
# write permission and fail on read-only storage.
train_data = open('training_data.txt', 'r', encoding='utf-8')
test_data = open('test_label.txt', 'r', encoding='utf-8')

In [14]:
# Same as the other data files: read-only access is sufficient, so avoid 'r+',
# which needlessly requires the file to be writable.
train_data_4 = open('train_4000.label.txt', 'r', encoding='utf-8')

In [10]:
# One DataFrame row per raw line of the file; every row still contains the
# "<COARSE>:<fine>" label prefix and the trailing newline at this point.
train = pd.DataFrame(train_data.readlines(), columns=['Questions'])
test = pd.DataFrame(test_data.readlines(), columns=['Questions'])

# The handles are fully consumed now, so release them instead of leaking them
# for the lifetime of the kernel. Re-running this cell without re-opening the
# files now fails loudly rather than silently building empty frames.
train_data.close()
test_data.close()

In [15]:
# One row per raw line of the 4000-example training file.
train_4 = pd.DataFrame(train_data_4.readlines(), columns=['Questions'])
# The file has been read to the end; close the handle rather than leak it.
train_data_4.close()

In [11]:
train.head()

Unnamed: 0,Questions
0,DESC:manner How did serfdom develop in and the...
1,ENTY:cremat What films featured the character ...
2,DESC:manner How can I find a list of celebriti...
3,ENTY:animal What fowl grabs the spotlight afte...
4,ABBR:exp What is the full form of .com ?\n


In [16]:
train_4.head()

Unnamed: 0,Questions
0,DESC:manner How did serfdom develop in and the...
1,ENTY:cremat What films featured the character ...
2,DESC:manner How can I find a list of celebriti...
3,ENTY:animal What fowl grabs the spotlight afte...
4,ABBR:exp What is the full form of .com ?\n


In [17]:
# Our dataset offers a unique challenge in terms of segregating it into Questions and Question Types, 
# which are attached together. Furthermore, as the Question Type consists of both coarse and fine classes,
#  we will need to perform separation for these as well

In [41]:
# example 

# labels: ABBREVIATION ENTITY DESCRIPTION HUMAN LOCATION NUMERIC

In [19]:
sent = 'DESC:manner How did serfdom develop in and th'

In [35]:
sent.split(' ',1)[1]

'How did serfdom develop in and th'

In [30]:
# split the data points to obtain question strings and coarse and fine question categories

In [36]:
# Every raw row reads "<COARSE>:<fine> <question text>". Cut it once at the
# first space, then cut the label at the colon.
# NOTE: 'Questions' is overwritten in place, so this cell must run exactly once
# per load; a second run would strip the first word of each question.
train_label_and_text = train.Questions.map(lambda line: line.split(' ', 1))
train['Qtype'] = train_label_and_text.map(lambda pieces: pieces[0])
train['Questions'] = train_label_and_text.map(lambda pieces: pieces[1])

train_coarse_and_fine = train.Qtype.map(lambda label: label.split(':'))
train['QType-Coarse'] = train_coarse_and_fine.map(lambda pieces: pieces[0])
train['QType-Fine'] = train_coarse_and_fine.map(lambda pieces: pieces[1])

In [37]:
train.head()

Unnamed: 0,Questions,Qtype,QType-Coarse,QType-Fine
0,How did serfdom develop in and then leave Russ...,DESC:manner,DESC,manner
1,What films featured the character Popeye Doyle...,ENTY:cremat,ENTY,cremat
2,How can I find a list of celebrities ' real na...,DESC:manner,DESC,manner
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,ENTY,animal
4,What is the full form of .com ?\n,ABBR:exp,ABBR,exp


In [38]:
# performing the same for test dataset

In [39]:
# Same layout as the training rows: "<COARSE>:<fine> <question text>".
# NOTE: 'Questions' is overwritten in place, so run this cell only once per
# load; a second run would strip the first word of each question.
test_label_and_text = test.Questions.map(lambda line: line.split(' ', 1))
test['Qtype'] = test_label_and_text.map(lambda pieces: pieces[0])
test['Questions'] = test_label_and_text.map(lambda pieces: pieces[1])

test_coarse_and_fine = test.Qtype.map(lambda label: label.split(':'))
test['QType-Coarse'] = test_coarse_and_fine.map(lambda pieces: pieces[0])
test['QType-Fine'] = test_coarse_and_fine.map(lambda pieces: pieces[1])

In [40]:
test.head()

Unnamed: 0,Questions,Qtype,QType-Coarse,QType-Fine
0,How far is it from Denver to Aspen ?\n,NUM:dist,NUM,dist
1,"What county is Modesto , California in ?\n",LOC:city,LOC,city
2,Who was Galileo ?\n,HUM:desc,HUM,desc
3,What is an atom ?\n,DESC:def,DESC,def
4,When did Hawaii become a state ?\n,NUM:date,NUM,date


In [42]:
# as we are interested in finding coarse class, we will pop out the QType and Qtype-Fine

In [44]:
# DataFrame.pop removes the column in place and returns it, so this cell is
# not re-runnable: a second run raises KeyError because the columns are gone.
train.pop('Qtype')
train.pop('QType-Fine')

test.pop('Qtype')
# Last expression of the cell: the popped fine-grained test labels are what
# the notebook displays as this cell's output.
test.pop('QType-Fine')

0           dist
1           city
2           desc
3            def
4           date
         ...    
495          ind
496     currency
497        count
498    substance
499          def
Name: QType-Fine, Length: 500, dtype: object

In [45]:
# let us look at the classes of data in our dataset

In [49]:
train.columns

Index(['Questions', 'QType-Coarse'], dtype='object')

In [53]:
classes = train['QType-Coarse'].unique()
classes

array(['DESC', 'ENTY', 'ABBR', 'HUM', 'NUM', 'LOC'], dtype=object)

In [54]:
# now we will use a label encoder to convert our string class names into integer labels

In [60]:
# Fit a single encoder on the labels of both splits so that train and test
# share one string -> integer mapping, then replace the label column of each
# frame with its integer codes.
le = LabelEncoder()
combined_labels = train['QType-Coarse'].tolist() + test['QType-Coarse'].tolist()
le.fit(pd.Series(combined_labels).values)
for frame in (train, test):
    frame['QType-Coarse'] = le.transform(frame['QType-Coarse'].values)

In [58]:
pd.Series(train['QType-Coarse'].tolist()+ test['QType-Coarse'].tolist()).values

array(['DESC', 'ENTY', 'DESC', ..., 'NUM', 'ENTY', 'DESC'], dtype=object)

In [61]:
train.head()

Unnamed: 0,Questions,QType-Coarse
0,How did serfdom develop in and then leave Russ...,1
1,What films featured the character Popeye Doyle...,2
2,How can I find a list of celebrities ' real na...,1
3,What fowl grabs the spotlight after the Chines...,2
4,What is the full form of .com ?\n,0


In [62]:
# preprocess the questions using the preprocessing pipeline

In [64]:
def text_clean(corpus):
    '''
    Purpose : Normalise every string of a corpus by replacing each character that is not an
              ASCII letter or digit with a blank and lower-casing what remains

    Input : 'corpus' - iterable of strings, one entry per text row

    Output : Returns a list of cleaned strings in input order. Tokens are re-joined with single
             spaces, so a token that held punctuation leaves extra blanks in the result

    '''
    def scrub(token):
        # each character outside [a-zA-Z0-9] becomes exactly one blank
        return re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=token).lower()

    return [' '.join(scrub(token) for token in row.split()) for row in corpus]

In [65]:
def stopwords_removal(corpus):
    '''
    Purpose : Tokenise each text on whitespace and drop English stop words, while keeping the
              wh-words (who, what, when, ...) that carry the question type

    Input : 'corpus' - iterable of strings

    Output : Returns a list of token lists, one per input string

    '''
    wh_words = {'who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom'}
    # Set difference instead of set.remove(): a wh-word that is absent from the
    # installed stop-word list must not abort preprocessing with a KeyError.
    stop = set(stopwords.words('english')) - wh_words
    return [[token for token in row.split() if token not in stop] for row in corpus]

In [66]:
def lemmatize(corpus):
    '''
    Purpose : Lemmatize every token of a tokenised corpus, treating each token as a verb (pos='v')

    Input : 'corpus' - iterable of token lists

    Output : Returns a list of lists holding the lemmatized tokens, in input order

    '''
    lemmatizer = WordNetLemmatizer()
    lemmatized_rows = []
    for tokens in corpus:
        lemmatized_rows.append([lemmatizer.lemmatize(token, pos = 'v') for token in tokens])
    return lemmatized_rows

In [67]:
def stem(corpus, stem_type = None):
    '''
    Purpose : Stem every token of a tokenised corpus

    Input :
    'corpus' - iterable of token lists
    'stem_type' - 'snowball' selects the English Snowball stemmer; any other value
                  (including the default None) selects the Porter stemmer

    Output : Returns a list of lists holding the stemmed tokens, in input order

    '''
    use_snowball = stem_type == 'snowball'
    stemmer = SnowballStemmer(language = 'english') if use_snowball else PorterStemmer()
    return [list(map(stemmer.stem, tokens)) for tokens in corpus]

In [68]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Iterable of strings on which the pre-processing tasks will be performed
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Flags indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together.
           If both flags are set, lemmatization runs first and stemming is applied to its output
    
    Output : Returns the processed corpus as a list of strings (tokens re-joined with single spaces)
    
    '''
    
    if cleaning:
        corpus = text_clean(corpus)
    
    # From here on every row is a list of tokens, whichever branch is taken
    if remove_stopwords:
        corpus = stopwords_removal(corpus)
    else:
        corpus = [row.split() for row in corpus]
    
    if lemmatization:
        corpus = lemmatize(corpus)
    
    if stemming:
        corpus = stem(corpus, stem_type)
    
    # Back to one string per row so the result can be fed to a vectorizer
    return [' '.join(tokens) for tokens in corpus]

In [72]:
pd.Series(train.Questions.tolist()+test.Questions.tolist())

0       How did serfdom develop in and then leave Russ...
1       What films featured the character Popeye Doyle...
2       How can I find a list of celebrities ' real na...
3       What fowl grabs the spotlight after the Chines...
4                       What is the full form of .com ?\n
                              ...                        
5947             Who was the 22nd President of the US ?\n
5948             What is the money they use in Zambia ?\n
5949                          How many feet in a mile ?\n
5950                What is the birthstone of October ?\n
5951                                   What is e-coli ?\n
Length: 5952, dtype: object

In [73]:
# Pool the train and test questions (train first) so that both splits go
# through exactly the same preprocessing in a single pass.
combined_questions = train.Questions.tolist() + test.Questions.tolist()
all_corpus = preprocess(pd.Series(combined_questions).astype('str'), remove_stopwords=True)

In [75]:
all_corpus[0]

'how serfdom develop leave russia'

In [76]:
# now we will split the data back into training and test corpora, since we merged them in the cell above

In [77]:
# The pooled corpus lists all training questions first, so the number of
# training rows is the cut point between the two splits.
n_train_rows = train.shape[0]
train_corpus, test_corpus = all_corpus[:n_train_rows], all_corpus[n_train_rows:]

In [78]:
print('size of training dataset',len(train_corpus))
print('size of test dataset is: ',len(test_corpus))

size of training dataset 5452
size of test dataset is:  500


In [79]:
# since we require our features to be in numerical form, we will vectorize them using the TF-IDF vectorizer

In [105]:
# Fit the TF-IDF vectorizer on the training corpus only, then reuse the fitted
# vectorizer to transform the test corpus; both results are sparse matrices
# (the notebook densifies them later before passing them to Keras).
vectorizer = TfidfVectorizer()
tf_idf_matrix_train = vectorizer.fit_transform(train_corpus)
tf_idf_matrix_test = vectorizer.transform(test_corpus)

In [81]:
# our TF-IDF feature vectors are now ready; we will be using Keras to train our ANN model

In [82]:
import keras
from keras.models import Sequential,Model
from keras import layers
from keras.layers import Dense,Dropout,Input

from keras.utils import np_utils

In [83]:
# as our labels are not ordinal, they need to be one-hot encoded before being fed into the model

In [85]:
# Use one class count for both splits, taken from the fitted label encoder.
# Deriving it per split (e.g. from test['QType-Coarse'].nunique()) yields a
# narrower matrix, or an out-of-range error, whenever a split happens to be
# missing one of the classes.
num_classes = len(le.classes_)
y_train = np_utils.to_categorical(train['QType-Coarse'], num_classes)
y_test = np_utils.to_categorical(test['QType-Coarse'], num_classes)

In [97]:
tf_idf_matrix_train.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [87]:
y_train # since 6 unique labels, 6 columns -- 1 if true 0 otherwise

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

In [88]:
# defining our network architecture

In [89]:
# Single-hidden-layer feed-forward classifier over the TF-IDF features.
# Layer widths that depend on the data are read from the data instead of being
# hard-coded: the input width is the TF-IDF vocabulary size and the output
# width is the number of one-hot label columns.
model = Sequential()
model.add(Dense(128,activation='relu',input_dim=tf_idf_matrix_train.shape[1]))
model.add(Dropout(0.3))
model.add(Dense(y_train.shape[1],activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1061760   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 774       
Total params: 1,062,534
Trainable params: 1,062,534
Non-trainable params: 0
_________________________________________________________________


In [90]:
tf_idf_matrix_train.shape[1]

8294

In [91]:
tf_idf_matrix_train.shape

(5452, 8294)

In [92]:
train.shape

(5452, 2)

In [95]:
y_train.shape

(5452, 6)

In [93]:
# Since we have provided a rate of 0.3, Dropout randomly zeroes 30% of the hidden layer's activations at each training step (a fresh random mask per batch, not per epoch) so as to reduce overfitting; it is inactive during evaluation.


In [98]:
# Densify with .toarray() (plain ndarray) rather than .todense(), which returns
# the discouraged np.matrix type that Keras data adapters may not accept.
training_history = model.fit(tf_idf_matrix_train.toarray(),y_train,epochs=10, batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [99]:
# model evaluation

In [108]:
# Densify with .toarray() (plain ndarray) rather than .todense(), which returns
# the discouraged np.matrix type that Keras data adapters may not accept.
loss,accuracy = model.evaluate(tf_idf_matrix_test.toarray(),y_test,verbose=False)

In [109]:
# '{:4f}' set a minimum field width of 4, not a precision; '.4f' is the spec
# that prints four decimal places.
print('Testing Accuracy: {:.4f} '.format(accuracy))

Testing Accuracy: 0.854000 


In [110]:
# our model achieves an accuracy of 85.4% on the test data

In [106]:
tf_idf_matrix_test.shape

(500, 8294)

In [107]:
y_test.shape

(500, 6)

In [104]:
tf_idf_matrix_test.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [111]:
# let us save the model architecture and weights 

In [112]:
import h5py

In [113]:
# Persist the model in two parts: the architecture as a JSON string and the
# trained weights as an HDF5 file. Both files are written to the current
# working directory.
model_structure = model.to_json()

with open('question_classification_model.json','w') as json_file:
  json_file.write(model_structure)

# Only the weights are stored here; the JSON file above holds the architecture.
model.save_weights('question_classification_model_weights.h5')