In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import re
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
import distance 
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle

In [2]:
# Load the question-pair training set; the path is relative to the notebook's working directory.
data = pd.read_csv('train.csv')

In [3]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# (rows, columns) of the full dataset.
data.shape

(404290, 6)

In [5]:
# Work on a 50,000-row random subset of the data; random_state pins the draw
# so a Restart & Run All reproduces the same rows, features and metrics.
df = data.sample(50000, random_state=2)

In [6]:
# Number of missing values in each column of the sample.
df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [7]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [8]:
# Discard any sampled row that has a missing value (rebinds df instead of mutating in place).
df = df.dropna()

In [9]:
# Column dtypes, non-null counts and memory footprint of the sample.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 392373 to 148298
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            50000 non-null  int64 
 1   qid1          50000 non-null  int64 
 2   qid2          50000 non-null  int64 
 3   question1     50000 non-null  object
 4   question2     50000 non-null  object
 5   is_duplicate  50000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 2.7+ MB


In [10]:
# Preview the first rows of the sampled frame.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
392373,392373,525057,525058,Did Napoleon introduce the family name? And if...,How do I put an individual behind me when the ...,0
397002,397002,2068,355613,What hotel in Shimla Hill-station would be saf...,What hotel in Tamiya Hill-station would be saf...,0
232485,232485,342516,342517,What are the best and easy recipes for hostell...,What are some great easy to cook recipes for h...,1
195872,195872,296439,238726,How do I calculate excess return related to di...,Dividend investing: How much capital is needed...,0
379269,379269,510774,510775,Can I hire a detective in Delhi to help me for...,I fell in love with a guy. He has gone abroad ...,0


In [11]:
# Contraction -> expansion lookup, built once instead of on every preprocess() call.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(question):
    """Normalise one raw question into a lower-cased, stemmed token string.

    Steps, in order: lower-case; spell out %, $, ₹, € and @; drop the literal
    '[math]' tag; abbreviate round thousands / millions / billions to k / m / b;
    expand English contractions; strip HTML markup; delete ASCII punctuation;
    Porter-stem every whitespace-separated token.

    Parameters
    ----------
    question : str
        Raw question text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces. The result has no leading,
        trailing or repeated whitespace, so token counts computed downstream
        are not inflated by empty tokens. An input with no tokens yields ''.
    """
    question = question.lower()   #lowercase the characters

    # Replace certain special characters with their string equivalents.
    # Every replacement is padded on both sides so it cannot fuse with a neighbour.
    question = question.replace('%', ' percent ')
    question = question.replace('$', ' dollar ')
    question = question.replace('₹', ' rupee ')
    question = question.replace('€', ' euro ')
    question = question.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    question = question.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    question = question.replace(',000,000,000 ', 'b ')
    question = question.replace(',000,000 ', 'm ')
    question = question.replace(',000 ', 'k ')
    question = re.sub(r'([0-9]+)000000000', r'\1b', question)
    question = re.sub(r'([0-9]+)000000', r'\1m', question)
    question = re.sub(r'([0-9]+)000', r'\1k', question)

    # Decontracting words: exact whole-token matches first ...
    q = ' '.join(_CONTRACTIONS.get(word, word) for word in question.split())
    # ... then generic suffix rules for anything the table missed
    # (e.g. a contraction with punctuation attached to it).
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removal of HTML tags. The parser is named explicitly so the output does
    # not depend on which optional parser libraries happen to be installed.
    q = BeautifulSoup(q, 'html.parser').text

    # Removal of punctuation.
    q = q.translate(_PUNCTUATION_TABLE)

    # Stemming. split() with no argument collapses whitespace runs, so the
    # padding added above never produces empty tokens.
    return " ".join(stemmer.stem(word) for word in q.split())

In [12]:
# Clean and stem every first question; the raw text stays in 'question1'.
df['preprocessed_question1'] = df['question1'].apply(preprocess)

In [13]:
# Same cleaning for the second question of each pair.
df['preprocessed_question2'] = df['question2'].apply(preprocess)

In [14]:
def common_words(q1, q2):
    """Return how many distinct lower-cased tokens q1 and q2 share.

    Tokens are obtained by splitting on a single space character, so
    consecutive spaces (or an empty string) yield an empty-string token.
    """
    first_vocab = {token for token in q1.lower().split(" ")}
    second_vocab = {token for token in q2.lower().split(" ")}
    return len(first_vocab.intersection(second_vocab))

In [15]:
def total_words(q1, q2):
    """Return the sum of the distinct-token counts of q1 and q2.

    Each question is lower-cased and split on a single space character;
    a token present in both questions is counted once per question.
    """
    return sum(len(set(text.lower().split(" "))) for text in (q1, q2))

In [16]:
def common_features(q1, q2):
    """Build the seven basic length / overlap features for a question pair.

    Parameters
    ----------
    q1, q2 : str
        The two question strings to compare.

    Returns
    -------
    list
        [characters in q1, characters in q2,
         tokens in q1, tokens in q2 (split on a single space),
         common_words(q1, q2), total_words(q1, q2),
         common / total rounded to 2 decimals (0.0 when total is 0)].
    """
    # Compute each overlap statistic once and reuse it for the ratio.
    shared = common_words(q1, q2)
    total = total_words(q1, q2)

    # Guard the ratio so a zero denominator cannot abort a whole DataFrame.apply.
    share = round(shared / total, 2) if total else 0.0

    return [
        len(q1),
        len(q2),
        len(q1.split(" ")),
        len(q2.split(" ")),
        shared,
        total,
        share,
    ]

In [17]:
# Row-wise: one 7-element feature list per question pair (a Series of lists).
common_features_list = df.apply(lambda x: common_features(x['preprocessed_question1'],x['preprocessed_question2']), axis =1)

In [18]:
# Unpack the per-row feature lists into one column per feature;
# the position in this list is the position inside each feature list.
common_feature_columns = ['q1_len', 'q2_len', 'q1_words', 'q2_words',
                          'common_words', 'total_words', 'words_share']
for position, column in enumerate(common_feature_columns):
    df[column] = [features[position] for features in common_features_list]

In [19]:
# Check the newly added basic feature columns.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,preprocessed_question1,preprocessed_question2,q1_len,q2_len,q1_words,q2_words,common_words,total_words,words_share
392373,392373,525057,525058,Did Napoleon introduce the family name? And if...,How do I put an individual behind me when the ...,0,did napoleon introduc the famili name and if s...,how do i put an individu behind me when the in...,110,132,22,28,6,42,0.14
397002,397002,2068,355613,What hotel in Shimla Hill-station would be saf...,What hotel in Tamiya Hill-station would be saf...,0,what hotel in shimla hillstat would be safe fo...,what hotel in tamiya hillstat would be safe fo...,117,117,21,21,18,38,0.47
232485,232485,342516,342517,What are the best and easy recipes for hostell...,What are some great easy to cook recipes for h...,1,what are the best and easi recip for hostel,what are some great easi to cook recip for hostel,43,49,9,10,6,19,0.32
195872,195872,296439,238726,How do I calculate excess return related to di...,Dividend investing: How much capital is needed...,0,how do i calcul excess return relat to dividen...,dividend invest how much capit is need to earn...,54,111,10,20,4,29,0.14
379269,379269,510774,510775,Can I hire a detective in Delhi to help me for...,I fell in love with a guy. He has gone abroad ...,0,can i hire a detect in delhi to help me for ge...,i fell in love with a guy he ha gone abroad an...,61,113,16,27,10,40,0.25


In [20]:
def token_features(q1,q2):
    """Compute eight token-overlap features for a question pair.

    Parameters
    ----------
    q1, q2 : str
        The two question strings to compare.

    Returns
    -------
    list
        [cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,
         last_word_equal, first_word_equal], where
        cwc = common distinct non-stop words / (min|max distinct non-stop word count),
        csc = common distinct stop words / (min|max distinct stop word count),
        ctc = common distinct tokens / (min|max token count, duplicates included).
        All eight values are 0.0 when either question has no tokens.

    NOTE(review): this is called on preprocess() output, i.e. stemmed and
    punctuation-free text, while stop_words holds the unmodified NLTK forms;
    stemmed stop words such as 'ha' (from 'has') are therefore counted as
    non-stop words. Confirm whether stop_words should be normalised the same way.
    """
    token_list = [0.0] * 8

    # Converting to tokens. split() with no argument drops empty tokens and
    # returns [] for blank input, which is what makes the guard below reachable
    # (split(" ") always returns at least one element).
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return token_list

    # Set membership is O(1); stop_words itself is a list.
    stop_set = set(stop_words)

    # Distinct stop words in every question
    q1_stops = {word for word in q1_tokens if word in stop_set}
    q2_stops = {word for word in q2_tokens if word in stop_set}

    # Distinct non stop-words in every question
    q1_words = {word for word in q1_tokens if word not in stop_set}
    q2_words = {word for word in q2_tokens if word not in stop_set}

    # Sizes of the three overlaps
    common_stop_words = len(q1_stops & q2_stops)
    common_non_stop_words = len(q1_words & q2_words)
    common_tokens = len(set(q1_tokens) & set(q2_tokens))

    # Small constant added to every denominator so an empty set cannot divide by zero.
    SAFE_DIV = 0.0001

    token_list[0] = common_non_stop_words/(min(len(q1_words),len(q2_words))+ SAFE_DIV)
    token_list[1] = common_non_stop_words/(max(len(q1_words),len(q2_words))+ SAFE_DIV)
    token_list[2] = common_stop_words/(min(len(q1_stops),len(q2_stops))+ SAFE_DIV)
    token_list[3] = common_stop_words/(max(len(q1_stops),len(q2_stops))+ SAFE_DIV)
    token_list[4] = common_tokens/(min(len(q1_tokens),len(q2_tokens))+ SAFE_DIV)
    token_list[5] = common_tokens/(max(len(q1_tokens),len(q2_tokens))+ SAFE_DIV)

    # Last word of both questions are same or not
    token_list[6] = int(q1_tokens[-1] == q2_tokens[-1])

    # First word of both questions are same or not
    token_list[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_list

In [21]:
# Row-wise: one 8-element token-feature list per question pair.
token_features_list = df.apply(lambda x: token_features(x['preprocessed_question1'],x['preprocessed_question2']), axis = 1)

In [22]:
# Unpack the per-row token-feature lists into one column per feature;
# the position in this list is the position inside each feature list.
token_feature_columns = ['cwc_min', 'cwc_max', 'csc_min', 'csc_max',
                         'ctc_min', 'ctc_max', 'last_word_equal', 'first_word_equal']
for position, column in enumerate(token_feature_columns):
    df[column] = [features[position] for features in token_features_list]

In [23]:
# Check the newly added token feature columns.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,preprocessed_question1,preprocessed_question2,q1_len,q2_len,...,total_words,words_share,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_equal,first_word_equal
392373,392373,525057,525058,Did Napoleon introduce the family name? And if...,How do I put an individual behind me when the ...,0,did napoleon introduc the famili name and if s...,how do i put an individu behind me when the in...,110,132,...,42,0.14,0.249997,0.22222,0.333331,0.30769,0.272726,0.214285,0,0
397002,397002,2068,355613,What hotel in Shimla Hill-station would be saf...,What hotel in Tamiya Hill-station would be saf...,0,what hotel in shimla hillstat would be safe fo...,what hotel in tamiya hillstat would be safe fo...,117,117,...,38,0.47,0.916659,0.916659,0.999986,0.999986,0.857139,0.857139,1,1
232485,232485,342516,342517,What are the best and easy recipes for hostell...,What are some great easy to cook recipes for h...,1,what are the best and easi recip for hostel,what are some great easi to cook recip for hostel,43,49,...,19,0.32,0.749981,0.599988,0.599988,0.599988,0.666659,0.599994,1,1
195872,195872,296439,238726,How do I calculate excess return related to di...,Dividend investing: How much capital is needed...,0,how do i calcul excess return relat to dividen...,dividend invest how much capit is need to earn...,54,111,...,29,0.14,0.333328,0.181817,0.499988,0.249997,0.399996,0.199999,0,0
379269,379269,510774,510775,Can I hire a detective in Delhi to help me for...,I fell in love with a guy. He has gone abroad ...,0,can i hire a detect in delhi to help me for ge...,i fell in love with a guy he ha gone abroad an...,61,113,...,40,0.25,0.333328,0.166665,0.999988,0.571424,0.624996,0.370369,0,0


In [24]:
def length_features(q1, q2):
    """Compute three length-based features for a question pair.

    Parameters
    ----------
    q1, q2 : str
        The two question strings to compare.

    Returns
    -------
    list
        [absolute difference in token count,
         mean token count of the two questions,
         length of a longest common substring / (shorter question's character length + 1)].
        All three values are 0.0 when either question has no tokens.
    """
    length_list = [0.0] * 3

    # Converting to tokens. split() with no argument returns [] for blank
    # input, which makes the guard below reachable (split(" ") never does).
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return length_list

    # Absolute difference in number of tokens
    length_list[0] = abs(len(q1_tokens) - len(q2_tokens))

    # Mean number of tokens across both questions
    length_list[1] = (len(q1_tokens) + len(q2_tokens))/2

    # Longest common substring ratio (character level); stays 0.0 when
    # lcsubstrings finds nothing in common.
    strs = list(distance.lcsubstrings(q1, q2))
    if strs:
        length_list[2] = len(strs[0])/(min(len(q1), len(q2)) + 1)

    return length_list

In [25]:
# Row-wise: one 3-element length-feature list per question pair.
length_features_list = df.apply(lambda x: length_features(x['preprocessed_question1'], x['preprocessed_question2']), axis=1)

In [26]:
# Unpack the per-row length-feature lists into one column per feature;
# the position in this list is the position inside each feature list.
length_feature_columns = ['abs_len_diff', 'mean_len', 'long_substring_ratio']
for position, column in enumerate(length_feature_columns):
    df[column] = [features[position] for features in length_features_list]

In [27]:
# Check the newly added length feature columns.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,preprocessed_question1,preprocessed_question2,q1_len,q2_len,...,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_equal,first_word_equal,abs_len_diff,mean_len,long_substring_ratio
392373,392373,525057,525058,Did Napoleon introduce the family name? And if...,How do I put an individual behind me when the ...,0,did napoleon introduc the famili name and if s...,how do i put an individu behind me when the in...,110,132,...,0.22222,0.333331,0.30769,0.272726,0.214285,0,0,6,25.0,0.09009
397002,397002,2068,355613,What hotel in Shimla Hill-station would be saf...,What hotel in Tamiya Hill-station would be saf...,0,what hotel in shimla hillstat would be safe fo...,what hotel in tamiya hillstat would be safe fo...,117,117,...,0.916659,0.999986,0.999986,0.857139,0.857139,1,1,0,21.0,0.830508
232485,232485,342516,342517,What are the best and easy recipes for hostell...,What are some great easy to cook recipes for h...,1,what are the best and easi recip for hostel,what are some great easi to cook recip for hostel,43,49,...,0.599988,0.599988,0.599988,0.666659,0.599994,1,1,1,9.5,0.386364
195872,195872,296439,238726,How do I calculate excess return related to di...,Dividend investing: How much capital is needed...,0,how do i calcul excess return relat to dividen...,dividend invest how much capit is need to earn...,54,111,...,0.181817,0.499988,0.249997,0.399996,0.199999,0,0,10,15.0,0.181818
379269,379269,510774,510775,Can I hire a detective in Delhi to help me for...,I fell in love with a guy. He has gone abroad ...,0,can i hire a detect in delhi to help me for ge...,i fell in love with a guy he ha gone abroad an...,61,113,...,0.166665,0.999988,0.571424,0.624996,0.370369,0,0,11,21.5,0.112903


In [28]:
def fuzzy_features(q1, q2):
    """Compute four fuzzywuzzy similarity scores for a question pair.

    Parameters
    ----------
    q1, q2 : str
        The two question strings to compare.

    Returns
    -------
    list
        [fuzz.QRatio, fuzz.partial_ratio, fuzz.token_sort_ratio,
         fuzz.token_set_ratio] of (q1, q2), or four 0.0 values when either
        question has no tokens.
    """
    fuzzy_list = [0.0] * 4

    # split() with no argument returns [] for blank input, which makes this
    # guard reachable (split(" ") always returns at least one element).
    if not q1.split() or not q2.split():
        return fuzzy_list

    # Fuzz ratio
    fuzzy_list[0] = fuzz.QRatio(q1,q2)

    # Partial ratio
    fuzzy_list[1] = fuzz.partial_ratio(q1,q2)

    # Token sort ratio
    fuzzy_list[2] = fuzz.token_sort_ratio(q1,q2)

    # Token set ratio
    fuzzy_list[3] = fuzz.token_set_ratio(q1, q2)

    return fuzzy_list

In [29]:
# Row-wise: one 4-element fuzzy-score list per question pair.
fuzz_features_list = df.apply(lambda x: fuzzy_features(x['preprocessed_question1'],x['preprocessed_question2']), axis=1)

In [30]:
# Unpack the per-row fuzzy-score lists into one column per score;
# the position in this list is the position inside each score list.
fuzzy_feature_columns = ['fuzz_ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio']
for position, column in enumerate(fuzzy_feature_columns):
    df[column] = [scores[position] for scores in fuzz_features_list]

In [31]:
# Check the newly added fuzzy score columns.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,preprocessed_question1,preprocessed_question2,q1_len,q2_len,...,ctc_max,last_word_equal,first_word_equal,abs_len_diff,mean_len,long_substring_ratio,fuzz_ratio,partial_ratio,token_sort_ratio,token_set_ratio
392373,392373,525057,525058,Did Napoleon introduce the family name? And if...,How do I put an individual behind me when the ...,0,did napoleon introduc the famili name and if s...,how do i put an individu behind me when the in...,110,132,...,0.214285,0,0,6,25.0,0.09009,45,46,48,57
397002,397002,2068,355613,What hotel in Shimla Hill-station would be saf...,What hotel in Tamiya Hill-station would be saf...,0,what hotel in shimla hillstat would be safe fo...,what hotel in tamiya hillstat would be safe fo...,117,117,...,0.857139,1,1,0,21.0,0.830508,97,97,94,97
232485,232485,342516,342517,What are the best and easy recipes for hostell...,What are some great easy to cook recipes for h...,1,what are the best and easi recip for hostel,what are some great easi to cook recip for hostel,43,49,...,0.599994,1,1,1,9.5,0.386364,76,67,72,82
195872,195872,296439,238726,How do I calculate excess return related to di...,Dividend investing: How much capital is needed...,0,how do i calcul excess return relat to dividen...,dividend invest how much capit is need to earn...,54,111,...,0.199999,0,0,10,15.0,0.181818,35,37,42,58
379269,379269,510774,510775,Can I hire a detective in Delhi to help me for...,I fell in love with a guy. He has gone abroad ...,0,can i hire a detect in delhi to help me for ge...,i fell in love with a guy he ha gone abroad an...,61,113,...,0.370369,0,0,11,21.5,0.112903,33,39,52,75


In [32]:
# Bag-of-words vectorizer whose vocabulary is capped at 3000 terms.
cv = CountVectorizer(max_features=3000)

In [33]:
# Copy so that dropping columns below leaves df (and its text columns) intact.
new_df = df.copy()

In [34]:
# Keep only the label and the engineered numeric features; ids and text columns are removed.
new_df = new_df.drop(columns=['id','qid1','qid2','question1','question2', 'preprocessed_question1','preprocessed_question2'])

In [35]:
q1_df = pd.DataFrame(cv.fit_transform(df['preprocessed_question1']).toarray(), index=df.index)

In [36]:
q2_df = pd.DataFrame(cv.fit_transform(df['preprocessed_question2']).toarray(), index=df.index)

In [37]:
# Place the q1 and q2 bag-of-words columns side by side (rows aligned on df's index).
questions_df = pd.concat([q1_df, q2_df], axis=1)

In [38]:
# Label + engineered features followed by the bag-of-words columns.
final_df = pd.concat([new_df, questions_df], axis=1)

In [39]:
# Preview the assembled modelling table.
final_df.head()

Unnamed: 0,is_duplicate,q1_len,q2_len,q1_words,q2_words,common_words,total_words,words_share,cwc_min,cwc_max,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
392373,0,110,132,22,28,6,42,0.14,0.249997,0.22222,...,0,0,0,0,0,0,0,0,0,0
397002,0,117,117,21,21,18,38,0.47,0.916659,0.916659,...,0,0,0,0,0,0,0,0,0,0
232485,1,43,49,9,10,6,19,0.32,0.749981,0.599988,...,0,0,0,0,0,0,0,0,0,0
195872,0,54,111,10,20,4,29,0.14,0.333328,0.181817,...,0,0,0,0,0,0,0,0,0,0
379269,0,61,113,16,27,10,40,0.25,0.333328,0.166665,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Feature matrix: every column except the label, as a NumPy array.
X = final_df.drop(columns=['is_duplicate']).values

In [41]:
# Target vector: the is_duplicate label of each pair.
y = final_df['is_duplicate'].values

In [42]:
# (pairs, features) of the feature matrix.
X.shape

(50000, 6022)

In [43]:
# Number of labels; must equal the number of rows in X.
y.shape

(50000,)

In [44]:
# Hold out 25% of the pairs for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=2, test_size=0.25)

In [45]:
# Random forest with library-default hyper-parameters; seeded so the fitted
# model (which is pickled at the end) and its metrics are reproducible.
rdf = RandomForestClassifier(random_state=2)

In [46]:
# Train the random forest on the training split.
rdf.fit(X_train,y_train)

In [47]:
# Random-forest predictions for the held-out pairs.
y_pred = rdf.predict(X_test)

In [48]:
# Fraction of held-out pairs the random forest labelled correctly.
accuracy_score(y_test, y_pred)

0.794

In [49]:
# Random-forest confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_test,y_pred)

array([[6850, 1036],
       [1539, 3075]], dtype=int64)

In [50]:
# Gaussian naive Bayes with default settings, for comparison.
gb = GaussianNB()

In [51]:
# Train naive Bayes on the same training split.
gb.fit(X_train, y_train)

In [52]:
# Naive-Bayes predictions; overwrites the random-forest y_pred from above.
y_pred = gb.predict(X_test)

In [53]:
# Naive-Bayes accuracy on the held-out pairs.
accuracy_score(y_test, y_pred)

0.6092

In [54]:
# Naive-Bayes confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

array([[3954, 3932],
       [ 953, 3661]], dtype=int64)

In [55]:
# Gradient-boosted trees (XGBoost) with default settings, for comparison.
xgb = XGBClassifier()

In [56]:
# Train XGBoost on the same training split.
xgb.fit(X_train, y_train)

In [57]:
# XGBoost predictions; overwrites the naive-Bayes y_pred from above.
y_pred = xgb.predict(X_test)

In [58]:
# XGBoost accuracy on the held-out pairs.
accuracy_score(y_test, y_pred)

0.79528

In [59]:
# XGBoost confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

array([[6701, 1185],
       [1374, 3240]], dtype=int64)

In [60]:
def question_check(q1, q2):
    """Turn one raw question pair into a single model-ready feature row.

    Both questions are run through preprocess(); the 22 hand-crafted features
    (7 common + 8 token + 3 length + 4 fuzzy, in that order) are then followed
    by the bag-of-words vector of q1 and that of q2, both produced by the
    module-level vectorizer ``cv``.

    Parameters
    ----------
    q1, q2 : str
        Raw (un-preprocessed) question texts.

    Returns
    -------
    numpy.ndarray
        Array with a single row: 22 columns plus two bag-of-words blocks.
    """
    clean_q1 = preprocess(q1)
    clean_q2 = preprocess(q2)

    # Same feature order as the training columns.
    handcrafted = [
        *common_features(clean_q1, clean_q2),
        *token_features(clean_q1, clean_q2),
        *length_features(clean_q1, clean_q2),
        *fuzzy_features(clean_q1, clean_q2),
    ]

    bow_q1 = cv.transform([clean_q1]).toarray()
    bow_q2 = cv.transform([clean_q2]).toarray()

    # reshape(1,22) also asserts that exactly 22 hand-crafted features were produced.
    dense_row = np.array(handcrafted).reshape(1,22)
    return np.hstack((dense_row, bow_q1, bow_q2))

In [61]:
# Hand-written questions for a quick sanity check of the trained model.
q1 = 'Where is the capital of India?'
q2 = 'What is the current capital of Pakistan?'
q3 = 'Which city serves as the capital of India?'
q4 = 'What is the business capital of India?'

In [62]:
# Random-forest verdict for q1 vs q4 (1 = predicted duplicate, 0 = not).
rdf.predict(question_check(q1,q4))

array([1], dtype=int64)

In [63]:
# Random-forest verdict for q1 vs q2 (1 = predicted duplicate, 0 = not).
rdf.predict(question_check(q1,q2))

array([0], dtype=int64)

In [64]:
# Display the vectorizer object that is about to be pickled.
cv

In [65]:
# Persist the trained random forest and the fitted vectorizer.
# The `with` blocks guarantee each file is flushed and closed, even if
# pickling raises; a bare open() inside the call leaves the handle dangling.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rdf, model_file)
with open('cv.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)