### <font color='green'>Quora Question Pairs - Random Forest Model</font>

#### Import required libraries

In [1]:
import pandas as pd
import itertools as itertools
import sklearn as skl
import numpy as np
import matplotlib.pyplot as plt
import nltk as nk

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, make_scorer, confusion_matrix

from keras.models import Sequential, Model
from keras.layers import Input, Embedding, LSTM, Merge, Dense, Dropout, concatenate
from keras.optimizers import Adadelta
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K

from nltk.corpus import stopwords

import re
import Levenshtein as leven
from gensim.models import KeyedVectors
from math import sqrt

ModuleNotFoundError: No module named 'matplotlib'

#### Import datasets and clean data, for practice.
The training dataset provided will be split into train-test to validate model's accuracy

In [2]:
train = pd.read_csv(r'C:\Users\lim_j\Google Drive\Technical Skills\Kaggle\Quora Question Pairs\train.csv')
test = pd.read_csv(r'C:\Users\lim_j\Google Drive\Technical Skills\Kaggle\Quora Question Pairs\test.csv')
embedding_file = r'C:\Users\lim_j\Google Drive\Technical Skills\Kaggle\Quora Question Pairs\GoogleNews-vectors-negative300.bin.gz'
model_dir = r'C:\Users\lim_j\Google Drive\Technical Skills\Kaggle\Quora Question Pairs\Model'

## Exploratory Data Analysis (EDA)

In [3]:
train.describe()

Unnamed: 0,id,qid1,qid2,is_duplicate
count,404290.0,404290.0,404290.0,404290.0
mean,202144.5,217243.942418,220955.655337,0.369198
std,116708.614502,157751.700002,159903.182629,0.482588
min,0.0,1.0,2.0,0.0
25%,101072.25,74437.5,74727.0,0.0
50%,202144.5,192182.0,197052.0,0.0
75%,303216.75,346573.5,354692.5,1.0
max,404289.0,537932.0,537933.0,1.0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


There are 2 null values under column question2, hence we will have to remove these 2 pairs of questions with null values.

In [5]:
type(train.question2[0])

str

In [6]:
train[train.isnull().any(axis=1)]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [7]:
train = train.dropna(axis=0, how='any')

In [8]:
# Verify that rows with null values have been removed
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404288 entries, 0 to 404289
Data columns (total 6 columns):
id              404288 non-null int64
qid1            404288 non-null int64
qid2            404288 non-null int64
question1       404288 non-null object
question2       404288 non-null object
is_duplicate    404288 non-null int64
dtypes: int64(4), object(2)
memory usage: 21.6+ MB


### Feature Engineering
Creation of classical features

#### Feature engineering functions

In [9]:
# Count number of words in a question
def words(question):
    return len(question.split())

# Average length of a word in a question
def avg_word_length(question):
    total_words_len = 0
    no_of_words = 0
    question = question.split()
    for word in question:
        total_words_len += len(word)
        no_of_words += 1
    return total_words_len/no_of_words

# Number of characters in a question
def char_count(question):
    return len(question)

# Caps count of question, only takes into account whether the first character of each word is in uppercase
def caps_count(question):
    question = question.split()
    count = 0
    for word in question:
        if word[0].isupper():
            count += 1
    
    return count

# Jaccard Similiarity Coefficient
# Obtain the Jaccard Similiarity Coeefficient between 2 questions
# (X intersect Y) / (X union Y)
def jaccard_coeff(dataframe):
    question1 = dataframe['question1']
    question2 = dataframe['question2']
    question1 = question1.split(' ')
    question2 = question2.split(' ')
    shared_words = 0
    total_words = len(question1) + len(question2)
    
    for word1 in question1:
        for word2 in question2:
            if word1 == word2:
                shared_words += 1
    
    if (total_words-shared_words) == 0:
        return 1
    else:
        return shared_words/(total_words-shared_words)
    
# Levenshtein distance
# Obtain the Levenshtein distance between 2 questions
def levenshtein(dataframe):
    return leven.distance(dataframe['question1'], dataframe['question2'])

In [10]:
# Slicing imported dataframe into question1 series, question2 series and questions dataframe
q1 = train.iloc[:,3]
q2 = train.iloc[:,4]
q = train.iloc[:,3:5]
dup = train.iloc[:,5]

# Creating new features using feature engineering functions
word_len_diff = abs(q1.apply(words) - q2.apply(words))
avg_word_len_diff = abs(q1.apply(avg_word_length) - q2.apply(avg_word_length))
char_diff = abs(q1.apply(char_count) - q2.apply(char_count))
caps_diff = abs(q1.apply(caps_count) - q2.apply(caps_count))
jaccard = q.apply(jaccard_coeff, axis=1)
leven_dist = q.apply(levenshtein, axis=1)

# Creating a new dataframe with values of new features
classic_feat = pd.DataFrame({'word_len_diff': word_len_diff, 'avg_word_len_diff': avg_word_len_diff, 
                             'char_diff': char_diff, 'caps_diff': caps_diff, 'jaccard': jaccard, 
                             'leven_dist': leven_dist, 'duplicate': dup})
classic_feat = classic_feat[['word_len_diff', 'avg_word_len_diff', 'char_diff', 'caps_diff', 'jaccard', 'leven_dist', 'duplicate']]

# Create train = true/false boolean column for train-test split
classic_feat['is_train'] = np.random.uniform(0, 1, len(classic_feat)) <= .75

# Train-test dataframes split
train, test = classic_feat[classic_feat['is_train']==True], classic_feat[classic_feat['is_train']==False]

# Number of examplples for training and test dataframes
print('# of examples in the training data:', len(train))
print('# of examples in the test data:',len(test))

# of examples in the training data: 303372
# of examples in the test data: 100916


Index(['word_len_diff', 'avg_word_len_diff', 'char_diff', 'caps_diff',
       'jaccard', 'leven_dist'],
      dtype='object')

In [12]:
# Obtaining y from the training data
y = train['duplicate']


0         0
1         0
2         0
4         0
5         1
6         0
8         0
9         0
11        1
12        1
13        1
14        0
15        1
16        1
18        1
19        0
21        0
23        0
24        0
25        0
26        0
27        0
28        0
29        1
30        0
32        1
33        0
34        0
35        0
36        0
         ..
404249    1
404250    0
404251    0
404253    0
404254    0
404257    1
404258    1
404260    0
404261    1
404262    0
404263    0
404264    0
404265    1
404266    0
404267    1
404269    0
404270    0
404273    1
404274    1
404275    0
404277    0
404279    0
404280    1
404281    1
404282    1
404283    0
404284    1
404285    0
404286    1
404287    0
Name: duplicate, Length: 303372, dtype: int64

In [13]:
classic_feat.head()

Unnamed: 0,word_len_diff,avg_word_len_diff,char_diff,caps_diff,jaccard,leven_dist,duplicate,is_train
0,2,0.047619,9,0,1.166667,9,0,True
1,5,0.346154,37,0,0.3125,43,0,True
2,4,0.714286,14,0,0.142857,40,0,True
3,2,2.69697,15,3,0.0,54,0,False
4,6,0.208791,37,0,0.111111,53,0,True


RandomForestClassifier(bootstrap=True, class_weight={0: 100, 1: 1},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [15]:

yhat

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [16]:
# The predicted probabilities that questions will not be duplicates (=0) or duplicates (=1)
clf.predict_proba(test[features])

array([[ 1.        ,  0.        ],
       [ 0.745     ,  0.255     ],
       [ 1.        ,  0.        ],
       ..., 
       [ 0.45      ,  0.55      ],
       [ 1.        ,  0.        ],
       [ 0.11965707,  0.88034293]])

In [17]:
# Confusion matrix
pd.crosstab(test['duplicate'], yhat, rownames=['Actual Duplicate Qns'], colnames=['Predicted Duplicate Qns'])

Predicted Duplicate Qns,0,1
Actual Duplicate Qns,Unnamed: 1_level_1,Unnamed: 2_level_1
0,49741,13897
1,18754,18524


In [18]:
# Obtain the relative importance of the features
# list(zip(train[features], clf.feature_importances_))

In [19]:
# Make a scorer from a performance metric function
dup_scorer = make_scorer(mean_squared_error, greater_is_better=False, needs_proba=True)

# Random Forest Classifier Accuracy
accuracy_score(test['duplicate'], yhat)

# Manual tabulation of accuracy
tn, fp, fn, tp = confusion_matrix(test['duplicate'], yhat).ravel()
accuracy = (tp+tn)/(tp+tn+fp+fn)

In [20]:
print(accuracy)

0.676453684252


In [21]:
rmse = sqrt(mean_squared_error(test['duplicate'], yhat))
rmse

0.5688113182310551