In [11]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
from sklearn.manifold import TSNE

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/Question - Sheet1.csv')
df.head()

Unnamed: 0,Question,Class
0,In which decade was the American Institute of ...,Sc
1,What is part of a database that holds only one...,Sc
2,OS' computer abbreviation usually means ?,Sc
3,In which decade with the first transatlantic r...,Sc
4,.MOV' extension refers usually to what kind of...,Sc


In [3]:
df.describe()

Unnamed: 0,Question,Class
count,131,131
unique,130,2
top,Which of the above statements is/are correct?,Ht
freq,2,91


In [4]:
def ratio(x):
  if x=='Sc':
    return 1
  else:
    return 0;

In [15]:
df['label'] = df.Class.apply(lambda x: ratio(x))

Labelling

In [16]:
df

Unnamed: 0,Question,Class,processed_tweets,label
0,In which decade was the American Institute of ...,Sc,decade american institute electrical engineer ...,1
1,What is part of a database that holds only one...,Sc,part database hold one type information,1
2,OS' computer abbreviation usually means ?,Sc,o computer abbreviation usually mean,1
3,In which decade with the first transatlantic r...,Sc,decade first transatlantic radio broadcast occur,1
4,.MOV' extension refers usually to what kind of...,Sc,mov extension refer usually kind file,1
...,...,...,...,...
126,Which of the following ancient Tamil Kingdoms ...,Ht,follow ancient tamil kingdom come know sangam ...,0
127,Which of the following book is the sequel of a...,Ht,follow book sequel epic silappadikarma,0
128,Which of the following is the equivalent term ...,Ht,follow equivalent term use raja early vedic era,0
129,Who among the following was contemporary of Pu...,Ht,among follow contemporary pushyamitra sunga,0


Processing of the Text

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [7]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
stop_words = set(stopwords.words("english"))
stop_words.add('rt')
stop_words.remove('not')
lemmatizer = WordNetLemmatizer()
giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|' '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
mention_regex = '@[\w\-]+'

def clean_text(text):
    text = re.sub('"', "", text)
    text = re.sub(mention_regex, ' ',text) #removing all user names
    text = re.sub(giant_url_regex, ' ', text)  #remocing the urls
    text = text.lower()
    text = re.sub("hm+", "", text) #removing variants of hmmm
    text = re.sub("[^a-z]+", " ", text) #removing all numbers, special chars like @,#,? etc
    text = text.split()
    text = [word for word in text if not word in stop_words]
    #text = [d[word] if word in d else word for word in text]  #replacing some slangs
    text = [lemmatizer.lemmatize(token) for token in text]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = " ".join(text)
    return text

In [10]:
df['processed_tweets'] = df.Question.apply(lambda x: clean_text(x))   # df.review.map(clean_text) Also can be used
df.head()

Unnamed: 0,Question,Class,processed_tweets
0,In which decade was the American Institute of ...,Sc,decade american institute electrical engineer ...
1,What is part of a database that holds only one...,Sc,part database hold one type information
2,OS' computer abbreviation usually means ?,Sc,o computer abbreviation usually mean
3,In which decade with the first transatlantic r...,Sc,decade first transatlantic radio broadcast occur
4,.MOV' extension refers usually to what kind of...,Sc,mov extension refer usually kind file


In [12]:
from sklearn.model_selection import train_test_split as tts

In [13]:
df['processed_tweets'] = df['processed_tweets'].astype(str)

In [17]:
x = df['processed_tweets']
y = df['label']

In [18]:
num_words = 8000
embed_dim = 32
tokenizer = Tokenizer(num_words=num_words,oov_token = "<oov>" )
tokenizer.fit_on_texts(x)
word_index=tokenizer.word_index
sequences = tokenizer.texts_to_sequences(x)
length=[]
for i in sequences:
    length.append(len(i))
print(len(length))
print("Mean is: ",np.mean(length))
print("Max is: ",np.max(length))
print("Min is: ",np.min(length))

131
Mean is:  7.0534351145038165
Max is:  17
Min is:  2


In [40]:
pad_length = 24
sequences = pad_sequences(sequences, maxlen = pad_length, truncating = 'pre', padding = 'post')
sequences.shape

(131, 24)

In [41]:
x_train,x_test,y_train,y_test = tts(sequences,y,test_size = 0.2)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(104, 24)
(27, 24)
(104,)
(27,)


Using LSTM Binnary Classification

In [42]:
model_lstm = Sequential()
model_lstm.add(Embedding(20000, 100, input_length=50))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [44]:
model_lstm.fit(x_train, y_train, validation_split=0.2, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fc72e3eec90>

In [45]:
model_lstm.evaluate(x_test, y_test)



[0.6093665361404419, 0.7037037014961243]

CNN+LSTM

In [48]:
vocabulary_size = 20000

In [49]:
def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(vocabulary_size, 100, input_length=50))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(1, activation='sigmoid'))
    model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_conv

In [50]:
model_conv = create_conv_model()
model_conv.fit(x_train, y_train, validation_split=0.2, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fc72e3548d0>

In [51]:
model_conv.evaluate(x_test, y_test)



[0.6166264414787292, 0.7037037014961243]

RNN

In [56]:
from keras.layers import Dense, Embedding, Dropout , Activation, Flatten, SimpleRNN
import tensorflow as tf
from keras.layers import GlobalMaxPool1D

In [62]:
recall = tf.keras.metrics.Recall()
precision = tf.keras.metrics.Precision()

model = Sequential([Embedding(num_words, embed_dim, input_length = pad_length),
                   SimpleRNN(8, return_sequences = True),
                   GlobalMaxPool1D(),
                   Dense(20,activation = 'relu'),
                   Dropout(0.25),
                   Dense(1,activation = 'sigmoid')])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 24, 32)            256000    
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 24, 8)             328       
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 8)                 0         
_________________________________________________________________
dense_12 (Dense)             (None, 20)                180       
_________________________________________________________________
dropout_4 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 21        
Total params: 256,529
Trainable params: 256,529
Non-trainable params: 0
_______________________________________________

In [63]:
history = model.fit(x = x_train, y = y_train, epochs = 10,validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [65]:
predictions = model.predict(x_test)
predict = []
for i in predictions:
    predict.append(np.argmax(i))

In [66]:
from sklearn import metrics

print(metrics.classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.70      1.00      0.83        19
           1       0.00      0.00      0.00         8

    accuracy                           0.70        27
   macro avg       0.35      0.50      0.41        27
weighted avg       0.50      0.70      0.58        27




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Applying Machine Learning Algoritham

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [74]:
vectorizer = TfidfVectorizer(max_features = 1000 )
# tokenize and build vocab

vectorizer.fit(x)
# summarize

print(len(vectorizer.vocabulary_))
print(vectorizer.idf_.shape)

516
(516,)


In [75]:
x_tfidf = vectorizer.transform(x).toarray()
print(x_tfidf.shape)

(131, 516)


Linear SVM

In [76]:
svm_model = LinearSVC(class_weight='balanced',multi_class='crammer_singer',max_iter = -1).fit(x_train, y_train)
svm_model_predict = svm_model.predict(x_test)
svm_report = classification_report(y_test, svm_model_predict )
print(svm_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.30      1.00      0.46         8

    accuracy                           0.30        27
   macro avg       0.15      0.50      0.23        27
weighted avg       0.09      0.30      0.14        27




Liblinear failed to converge, increase the number of iterations.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Logistic Regression

In [77]:
logistic_reg_model = LogisticRegression(n_jobs = -1, penalty='l2', multi_class='multinomial',class_weight = 'balanced',verbose=1).fit(x_train,y_train)
lr_model_predict = logistic_reg_model.predict(x_test)
lr_model_report = classification_report(y_test, lr_model_predict)
print(lr_model_report)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


              precision    recall  f1-score   support

           0       0.70      1.00      0.83        19
           1       0.00      0.00      0.00         8

    accuracy                           0.70        27
   macro avg       0.35      0.50      0.41        27
weighted avg       0.50      0.70      0.58        27



[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.0s finished

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Decision Tree

In [78]:
from sklearn.tree import DecisionTreeClassifier
dct = DecisionTreeClassifier(criterion='entropy', random_state=1)
decision_tree_model = dct.fit(x_train,y_train)
decision_tree_model_predict = decision_tree_model.predict(x_test)
decision_tree_report = classification_report(y_test,decision_tree_model_predict)
print(decision_tree_report)

              precision    recall  f1-score   support

           0       0.70      1.00      0.83        19
           1       0.00      0.00      0.00         8

    accuracy                           0.70        27
   macro avg       0.35      0.50      0.41        27
weighted avg       0.50      0.70      0.58        27




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Random Forest

In [79]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=20)
random_forest_model = clf.fit(x_train,y_train)
random_forest_model_predict = random_forest_model.predict(x_test)
random_forest_report = classification_report(y_test,random_forest_model_predict)
print(random_forest_report)

              precision    recall  f1-score   support

           0       0.70      1.00      0.83        19
           1       0.00      0.00      0.00         8

    accuracy                           0.70        27
   macro avg       0.35      0.50      0.41        27
weighted avg       0.50      0.70      0.58        27




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Multi Class Naive Bayes

In [80]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
naive_bayes_model = model.fit(x_train,y_train)
naive_bayes_model_predict = naive_bayes_model.predict(x_test)
naive_bayes_report = classification_report(y_test,naive_bayes_model_predict)
print(naive_bayes_report)

              precision    recall  f1-score   support

           0       0.70      1.00      0.83        19
           1       0.00      0.00      0.00         8

    accuracy                           0.70        27
   macro avg       0.35      0.50      0.41        27
weighted avg       0.50      0.70      0.58        27




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Adaboost

In [81]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier() 
clf = AdaBoostClassifier(n_estimators=100, base_estimator=dt,learning_rate=1)
# training the model
clf.fit(x_train,y_train)
adaboost_model_predict = clf.predict(x_test)
adaboost_model_report = classification_report(y_test, adaboost_model_predict)
print(adaboost_model_report)

              precision    recall  f1-score   support

           0       0.70      1.00      0.83        19
           1       0.00      0.00      0.00         8

    accuracy                           0.70        27
   macro avg       0.35      0.50      0.41        27
weighted avg       0.50      0.70      0.58        27




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



XGBoost

In [82]:
from xgboost import XGBClassifier
model = XGBClassifier()
xgboost_model = model.fit(x_train, y_train)
xgboost_model_predict = xgboost_model.predict(x_test)
xgboost_model_report = classification_report(y_test,xgboost_model_predict)
print(xgboost_model_report)

              precision    recall  f1-score   support

           0       0.70      1.00      0.83        19
           1       0.00      0.00      0.00         8

    accuracy                           0.70        27
   macro avg       0.35      0.50      0.41        27
weighted avg       0.50      0.70      0.58        27




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

