In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sudee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
df = pd.read_csv('train.csv')

In [68]:
new_df = df.sample(100000,random_state=42)
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
8067,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0
368101,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0
70497,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0
226567,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1
73186,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,1


# PREPROCESSING

In [69]:
def preprocess(q):
    
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    
    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q

In [70]:
preprocess("I'm Done Already already!. I wasn't <b>done</b>?")

'i am done already already   i was not done'

In [71]:
new_df['question1'] = new_df['question1'].apply(preprocess)
new_df['question2'] = new_df['question2'].apply(preprocess)
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
8067,8067,15738,15739,how do i play pokémon go in korea,how do i play pokémon go in china,0
368101,368101,12736,104117,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,0
70497,70497,121486,121487,which is more advisable and better material fo...,what is the best server setup for buddypress,0
226567,226567,254474,258192,how do i improve logical programming skills,how can i improve my logical skills for progra...,1
73186,73186,48103,3062,how close we are to see 3rd world war,how close is a world war iii,1


# ADDING NEW FEATURES

In [72]:
new_df['q1_len'] = new_df['question1'].str.len() 
new_df['q2_len'] = new_df['question2'].str.len()
new_df['q1_num_words'] = new_df['question1'].apply(lambda row: len(row.split(" ")))
new_df['q2_num_words'] = new_df['question2'].apply(lambda row: len(row.split(" ")))
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words
8067,8067,15738,15739,how do i play pokémon go in korea,how do i play pokémon go in china,0,33,33,8,8
368101,368101,12736,104117,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,0,52,50,11,9
70497,70497,121486,121487,which is more advisable and better material fo...,what is the best server setup for buddypress,0,95,44,17,8
226567,226567,254474,258192,how do i improve logical programming skills,how can i improve my logical skills for progra...,1,43,51,7,9
73186,73186,48103,3062,how close we are to see 3rd world war,how close is a world war iii,1,37,28,9,7


In [73]:
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return len(w1 & w2)

In [74]:
new_df['word_common'] = new_df.apply(common_words, axis=1)
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common
8067,8067,15738,15739,how do i play pokémon go in korea,how do i play pokémon go in china,0,33,33,8,8,7
368101,368101,12736,104117,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,0,52,50,11,9,6
70497,70497,121486,121487,which is more advisable and better material fo...,what is the best server setup for buddypress,0,95,44,17,8,2
226567,226567,254474,258192,how do i improve logical programming skills,how can i improve my logical skills for progra...,1,43,51,7,9,6
73186,73186,48103,3062,how close we are to see 3rd world war,how close is a world war iii,1,37,28,9,7,4


In [75]:
def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return (len(w1) + len(w2))

In [76]:
new_df['word_total'] = new_df.apply(total_words, axis=1)
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total
8067,8067,15738,15739,how do i play pokémon go in korea,how do i play pokémon go in china,0,33,33,8,8,7,16
368101,368101,12736,104117,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,0,52,50,11,9,6,20
70497,70497,121486,121487,which is more advisable and better material fo...,what is the best server setup for buddypress,0,95,44,17,8,2,25
226567,226567,254474,258192,how do i improve logical programming skills,how can i improve my logical skills for progra...,1,43,51,7,9,6,16
73186,73186,48103,3062,how close we are to see 3rd world war,how close is a world war iii,1,37,28,9,7,4,16


In [77]:
new_df['Common_word_ratio'] = round(new_df['word_common']/new_df['word_total'],2)
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,Common_word_ratio
8067,8067,15738,15739,how do i play pokémon go in korea,how do i play pokémon go in china,0,33,33,8,8,7,16,0.44
368101,368101,12736,104117,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,0,52,50,11,9,6,20,0.3
70497,70497,121486,121487,which is more advisable and better material fo...,what is the best server setup for buddypress,0,95,44,17,8,2,25,0.08
226567,226567,254474,258192,how do i improve logical programming skills,how can i improve my logical skills for progra...,1,43,51,7,9,6,16,0.38
73186,73186,48103,3062,how close we are to see 3rd world war,how close is a world war iii,1,37,28,9,7,4,16,0.25


In [78]:
# Advanced Features
from nltk.corpus import stopwords

def fetch_token_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    SAFE_DIV = 0.0001 

    STOP_WORDS = stopwords.words("english")
    
    token_features = [0.0]*8
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    
    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))
    
    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    
    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    
    
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    
    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    
    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return token_features

In [81]:
token_features = new_df.apply(fetch_token_features, axis=1)

new_df["cwc_min"]       = list(map(lambda x: x[0], token_features))
new_df["cwc_max"]       = list(map(lambda x: x[1], token_features))
new_df["csc_min"]       = list(map(lambda x: x[2], token_features))
new_df["csc_max"]       = list(map(lambda x: x[3], token_features))
new_df["ctc_min"]       = list(map(lambda x: x[4], token_features))
new_df["ctc_max"]       = list(map(lambda x: x[5], token_features))
new_df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
new_df["first_word_eq"] = list(map(lambda x: x[7], token_features))

In [82]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,word_total,Common_word_ratio,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq
8067,8067,15738,15739,how do i play pokémon go in korea,how do i play pokémon go in china,0,33,33,8,8,...,16,0.44,0.749981,0.749981,0.999975,0.999975,0.874989,0.874989,0.0,1.0
368101,368101,12736,104117,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,0,52,50,11,9,...,20,0.3,0.399992,0.399992,0.999975,0.666656,0.666659,0.54545,0.0,1.0
70497,70497,121486,121487,which is more advisable and better material fo...,what is the best server setup for buddypress,0,95,44,17,8,...,25,0.08,0.0,0.0,0.499988,0.249997,0.249997,0.124999,0.0,0.0
226567,226567,254474,258192,how do i improve logical programming skills,how can i improve my logical skills for progra...,1,43,51,7,9,...,16,0.38,0.999975,0.999975,0.666644,0.399992,0.857131,0.666659,0.0,1.0
73186,73186,48103,3062,how close we are to see 3rd world war,how close is a world war iii,1,37,28,9,7,...,16,0.25,0.749981,0.599988,0.333322,0.249994,0.57142,0.44444,0.0,1.0


In [83]:
import distance

def fetch_length_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    length_features = [0.0]*3
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features
    
    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    
    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2
    
    strs = list(distance.lcsubstrings(q1, q2))
    length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1) if strs else 0
    
    return length_features

In [84]:
length_features = new_df.apply(fetch_length_features, axis=1)

new_df['abs_len_diff'] = list(map(lambda x: x[0], length_features))
new_df['mean_len'] = list(map(lambda x: x[1], length_features))
new_df['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))

In [85]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio
8067,8067,15738,15739,how do i play pokémon go in korea,how do i play pokémon go in china,0,33,33,8,8,...,0.749981,0.999975,0.999975,0.874989,0.874989,0.0,1.0,0.0,8.0,0.823529
368101,368101,12736,104117,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,0,52,50,11,9,...,0.399992,0.999975,0.666656,0.666659,0.54545,0.0,1.0,2.0,10.0,0.333333
70497,70497,121486,121487,which is more advisable and better material fo...,what is the best server setup for buddypress,0,95,44,17,8,...,0.0,0.499988,0.249997,0.249997,0.124999,0.0,0.0,8.0,12.0,0.111111
226567,226567,254474,258192,how do i improve logical programming skills,how can i improve my logical skills for progra...,1,43,51,7,9,...,0.999975,0.666644,0.399992,0.857131,0.666659,0.0,1.0,2.0,8.0,0.272727
73186,73186,48103,3062,how close we are to see 3rd world war,how close is a world war iii,1,37,28,9,7,...,0.599988,0.333322,0.249994,0.57142,0.44444,0.0,1.0,2.0,8.0,0.344828


In [86]:
ques_df = new_df[['question1','question2']]
ques_df.head()

Unnamed: 0,question1,question2
8067,how do i play pokémon go in korea,how do i play pokémon go in china
368101,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...
70497,which is more advisable and better material fo...,what is the best server setup for buddypress
226567,how do i improve logical programming skills,how can i improve my logical skills for progra...
73186,how close we are to see 3rd world war,how close is a world war iii


In [87]:
final_df = new_df.drop(columns=['id','qid1','qid2','question1','question2'])
print(final_df.shape)
final_df.head()

(100000, 19)


Unnamed: 0,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,Common_word_ratio,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio
8067,0,33,33,8,8,7,16,0.44,0.749981,0.749981,0.999975,0.999975,0.874989,0.874989,0.0,1.0,0.0,8.0,0.823529
368101,0,52,50,11,9,6,20,0.3,0.399992,0.399992,0.999975,0.666656,0.666659,0.54545,0.0,1.0,2.0,10.0,0.333333
70497,0,95,44,17,8,2,25,0.08,0.0,0.0,0.499988,0.249997,0.249997,0.124999,0.0,0.0,8.0,12.0,0.111111
226567,1,43,51,7,9,6,16,0.38,0.999975,0.999975,0.666644,0.399992,0.857131,0.666659,0.0,1.0,2.0,8.0,0.272727
73186,1,37,28,9,7,4,16,0.25,0.749981,0.599988,0.333322,0.249994,0.57142,0.44444,0.0,1.0,2.0,8.0,0.344828


# Applying Glove Word2Vec

In [None]:
import gensim.downloader as api

# Resume download
word2vec_model = api.load("word2vec-google-news-300", return_path=True)
print(f"Model downloaded at: {word2vec_model}")


Model downloaded at: C:\Users\sudee/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


In [None]:
from gensim.models import KeyedVectors
import gensim.downloader as api

# Load the Glove model properly
word2vec_model = api.load("glove-wiki-gigaword-50") 

# Save the model in Word2Vec format (text-based)
word2vec_model.save_word2vec_format("word2vec.txt", binary=False)

print("Model saved successfully!")

Model saved successfully!


In [88]:
from gensim.models import KeyedVectors
# Load the Glove model in Word2Vec format
word2vec_model = KeyedVectors.load_word2vec_format("word2vec.txt", binary=False)

# Test similarity
print(word2vec_model.most_similar("car", topn=5))

[('truck', 0.92085862159729), ('cars', 0.8870189785957336), ('vehicle', 0.8833683729171753), ('driver', 0.8464019298553467), ('driving', 0.8384189009666443)]


In [89]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Convert to lowercase and tokenize
ques_df['q1_tokens'] = ques_df['question1'].apply(lambda x: word_tokenize(x.lower()))
ques_df['q2_tokens'] = ques_df['question2'].apply(lambda x: word_tokenize(x.lower()))

ques_df.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sudee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,question1,question2,q1_tokens,q2_tokens
8067,how do i play pokémon go in korea,how do i play pokémon go in china,"[how, do, i, play, pokémon, go, in, korea]","[how, do, i, play, pokémon, go, in, china]"
368101,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,"[what, are, some, of, the, best, side, dishes,...","[what, are, some, good, side, dishes, for, buf..."
70497,which is more advisable and better material fo...,what is the best server setup for buddypress,"[which, is, more, advisable, and, better, mate...","[what, is, the, best, server, setup, for, budd..."
226567,how do i improve logical programming skills,how can i improve my logical skills for progra...,"[how, do, i, improve, logical, programming, sk...","[how, can, i, improve, my, logical, skills, fo..."
73186,how close we are to see 3rd world war,how close is a world war iii,"[how, close, we, are, to, see, 3rd, world, war]","[how, close, is, a, world, war, iii]"


In [90]:
# Function to convert a list of words to a vector
def sentence_to_vector(tokens, model):
    vectors = [model[word] for word in tokens if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)



In [91]:
# Convert both questions into vectors
ques_df['q1_vector'] = ques_df['q1_tokens'].apply(lambda x: sentence_to_vector(x, word2vec_model))
ques_df['q2_vector'] = ques_df['q2_tokens'].apply(lambda x: sentence_to_vector(x, word2vec_model))

In [92]:
# Example: Assuming q1_vector and q2_vector are already in the DataFrame
ques_df['q1_vector'] =ques_df['q1_vector'].apply(np.array)  # Ensure they are NumPy arrays
ques_df['q2_vector'] = ques_df['q2_vector'].apply(np.array)

# Compute absolute difference |q1 - q2|
ques_df['abs_diff'] = ques_df.apply(lambda row: np.abs(row['q1_vector'] - row['q2_vector']), axis=1)

# Compute element-wise product q1 * q2
ques_df['elementwise_product'] = ques_df.apply(lambda row: row['q1_vector'] * row['q2_vector'], axis=1)

In [93]:
ques_df.head()

Unnamed: 0,question1,question2,q1_tokens,q2_tokens,q1_vector,q2_vector,abs_diff,elementwise_product
8067,how do i play pokémon go in korea,how do i play pokémon go in china,"[how, do, i, play, pokémon, go, in, korea]","[how, do, i, play, pokémon, go, in, china]","[0.048978735, 0.12111999, -0.08994362, 0.21367...","[0.058528736, 0.11420874, -0.14329837, 0.30333...","[0.0095500015, 0.006911248, 0.053354755, 0.089...","[0.0028666635, 0.013832962, 0.012888774, 0.064..."
368101,what are some of the best side dishes for crab...,what are some good side dishes for buffalo chi...,"[what, are, some, of, the, best, side, dishes,...","[what, are, some, good, side, dishes, for, buf...","[0.4118309, 0.21178155, -0.43707272, 0.0196644...","[0.24862221, 0.08570189, -0.46412998, -0.15518...","[0.1632087, 0.12607965, 0.02705726, 0.17484868...","[0.10239031, 0.018150078, 0.20285855, -0.00305..."
70497,which is more advisable and better material fo...,what is the best server setup for buddypress,"[which, is, more, advisable, and, better, mate...","[what, is, the, best, server, setup, for, budd...","[0.20822841, 0.023662375, 0.016414564, -0.0337...","[0.23382142, 0.15573585, -0.07020001, 0.386168...","[0.025593013, 0.13207348, 0.08661458, 0.419886...","[0.048688263, 0.0036850802, -0.0011523026, -0...."
226567,how do i improve logical programming skills,how can i improve my logical skills for progra...,"[how, do, i, improve, logical, programming, sk...","[how, can, i, improve, my, logical, skills, fo...","[0.22090183, -0.027280003, -0.12204414, -0.079...","[0.21504362, 0.16173556, -0.07358255, -0.11427...","[0.0058582127, 0.18901557, 0.048461586, 0.0351...","[0.04750353, -0.0044121468, 0.008980319, 0.009..."
73186,how close we are to see 3rd world war,how close is a world war iii,"[how, close, we, are, to, see, 3rd, world, war]","[how, close, is, a, world, war, iii]","[0.33585003, 0.11932678, 0.13839345, -0.159580...","[0.3623257, 0.38533857, -0.081577145, 0.074711...","[0.026475668, 0.2660118, 0.21997058, 0.2342924...","[0.1216871, 0.04598121, -0.011289742, -0.01192..."


In [94]:
temp_df = ques_df[['q1_vector', 'q2_vector', 'abs_diff', 'elementwise_product']].copy()
temp_df.head()

Unnamed: 0,q1_vector,q2_vector,abs_diff,elementwise_product
8067,"[0.048978735, 0.12111999, -0.08994362, 0.21367...","[0.058528736, 0.11420874, -0.14329837, 0.30333...","[0.0095500015, 0.006911248, 0.053354755, 0.089...","[0.0028666635, 0.013832962, 0.012888774, 0.064..."
368101,"[0.4118309, 0.21178155, -0.43707272, 0.0196644...","[0.24862221, 0.08570189, -0.46412998, -0.15518...","[0.1632087, 0.12607965, 0.02705726, 0.17484868...","[0.10239031, 0.018150078, 0.20285855, -0.00305..."
70497,"[0.20822841, 0.023662375, 0.016414564, -0.0337...","[0.23382142, 0.15573585, -0.07020001, 0.386168...","[0.025593013, 0.13207348, 0.08661458, 0.419886...","[0.048688263, 0.0036850802, -0.0011523026, -0...."
226567,"[0.22090183, -0.027280003, -0.12204414, -0.079...","[0.21504362, 0.16173556, -0.07358255, -0.11427...","[0.0058582127, 0.18901557, 0.048461586, 0.0351...","[0.04750353, -0.0044121468, 0.008980319, 0.009..."
73186,"[0.33585003, 0.11932678, 0.13839345, -0.159580...","[0.3623257, 0.38533857, -0.081577145, 0.074711...","[0.026475668, 0.2660118, 0.21997058, 0.2342924...","[0.1216871, 0.04598121, -0.011289742, -0.01192..."


In [95]:
final_df = pd.concat([final_df, temp_df], axis=1)
print(final_df.shape)
final_df.head()

(100000, 23)


Unnamed: 0,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,Common_word_ratio,cwc_min,cwc_max,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,q1_vector,q2_vector,abs_diff,elementwise_product
8067,0,33,33,8,8,7,16,0.44,0.749981,0.749981,...,0.874989,0.0,1.0,0.0,8.0,0.823529,"[0.048978735, 0.12111999, -0.08994362, 0.21367...","[0.058528736, 0.11420874, -0.14329837, 0.30333...","[0.0095500015, 0.006911248, 0.053354755, 0.089...","[0.0028666635, 0.013832962, 0.012888774, 0.064..."
368101,0,52,50,11,9,6,20,0.3,0.399992,0.399992,...,0.54545,0.0,1.0,2.0,10.0,0.333333,"[0.4118309, 0.21178155, -0.43707272, 0.0196644...","[0.24862221, 0.08570189, -0.46412998, -0.15518...","[0.1632087, 0.12607965, 0.02705726, 0.17484868...","[0.10239031, 0.018150078, 0.20285855, -0.00305..."
70497,0,95,44,17,8,2,25,0.08,0.0,0.0,...,0.124999,0.0,0.0,8.0,12.0,0.111111,"[0.20822841, 0.023662375, 0.016414564, -0.0337...","[0.23382142, 0.15573585, -0.07020001, 0.386168...","[0.025593013, 0.13207348, 0.08661458, 0.419886...","[0.048688263, 0.0036850802, -0.0011523026, -0...."
226567,1,43,51,7,9,6,16,0.38,0.999975,0.999975,...,0.666659,0.0,1.0,2.0,8.0,0.272727,"[0.22090183, -0.027280003, -0.12204414, -0.079...","[0.21504362, 0.16173556, -0.07358255, -0.11427...","[0.0058582127, 0.18901557, 0.048461586, 0.0351...","[0.04750353, -0.0044121468, 0.008980319, 0.009..."
73186,1,37,28,9,7,4,16,0.25,0.749981,0.599988,...,0.44444,0.0,1.0,2.0,8.0,0.344828,"[0.33585003, 0.11932678, 0.13839345, -0.159580...","[0.3623257, 0.38533857, -0.081577145, 0.074711...","[0.026475668, 0.2660118, 0.21997058, 0.2342924...","[0.1216871, 0.04598121, -0.011289742, -0.01192..."


In [96]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


# Extract vector columns and flatten them
def flatten_vectors(df, column_name):
    return np.stack(df[column_name].values)  # Converts lists into a 2D NumPy array



In [97]:
vec1_features = flatten_vectors(final_df, "q1_vector")  # Shape: (num_samples, vec1_dim)
vec2_features = flatten_vectors(final_df, "q2_vector")  # Shape: (num_samples, vec2_dim)
vecdiff_features = flatten_vectors(final_df, "abs_diff")
vecmultiply_features = flatten_vectors(final_df, "elementwise_product")
# Extract numerical features
numerical_features = final_df.drop(columns=["is_duplicate", "q1_vector", "q2_vector", "abs_diff", "elementwise_product"]).values  # Shape: (num_samples, num_numerical_features)

# Concatenate all features into one matrix
X = np.hstack((numerical_features, vec1_features, vec2_features, vecdiff_features, vecmultiply_features))  # Final shape: (num_samples, total_features)

# Extract target variable
y = final_df["is_duplicate"].values  # Ensure y is 1D

In [98]:
X

array([[ 3.30000000e+01,  3.30000000e+01,  8.00000000e+00, ...,
        -4.48666455e-04,  2.59953104e-02,  4.79930416e-02],
       [ 5.20000000e+01,  5.00000000e+01,  1.10000000e+01, ...,
        -9.59308352e-03,  1.49802687e-02, -2.65978687e-02],
       [ 9.50000000e+01,  4.40000000e+01,  1.70000000e+01, ...,
         1.41693908e-03, -3.35978111e-04, -1.18861655e-02],
       ...,
       [ 4.20000000e+01,  4.10000000e+01,  7.00000000e+00, ...,
         8.73979181e-02,  1.17329806e-02,  9.82841570e-03],
       [ 3.40000000e+01,  4.60000000e+01,  6.00000000e+00, ...,
        -9.33687575e-03,  2.25696489e-02, -1.81639064e-02],
       [ 2.20000000e+01,  1.90000000e+01,  5.00000000e+00, ...,
         4.42053610e-03,  4.39473763e-02,  1.67480990e-01]])

In [99]:

# First, split your dataset into training and temporary sets.
# For example, 70% for training and 30% for temporary.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1)

# Now, split the temporary set equally into validation and test sets.
# This gives you 15% of the data for validation and 15% for testing.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Ensure the correct data types for all sets.
X_train = np.array(X_train, dtype=np.float64)
X_val   = np.array(X_val, dtype=np.float64)
X_test  = np.array(X_test, dtype=np.float64)

y_train = np.array(y_train, dtype=np.int64).ravel()
y_val   = np.array(y_val, dtype=np.int64).ravel()
y_test  = np.array(y_test, dtype=np.int64).ravel()

# Optional: Print shapes to confirm the splits
print("X_train shape:", X_train.shape)
print("X_val shape:  ", X_val.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:  ", y_val.shape)
print("y_test shape: ", y_test.shape)


X_train shape: (70000, 218)
X_val shape:   (15000, 218)
X_test shape:  (15000, 218)
y_train shape: (70000,)
y_val shape:   (15000,)
y_test shape:  (15000,)


# TRYING OUT NEURAL NETWORKS

In [100]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import precision_recall_curve
from tensorflow import keras

In [101]:
# training will stop and the best weights (from the lowest validation loss) will be restored.
early_stop = EarlyStopping(monitor='val_loss', patience=15, min_delta=0.0001, restore_best_weights=True)

In [102]:
# Define the input shape (218 features)
input_shape = (218,)

model = Sequential([
    Input(shape=input_shape),
    Dense(512, activation='relu'),             # First Dense layer with 512 neurons
    Dropout(0.3),                              # Dropout with rate 0.1
    Dense(207, activation='relu'),             # Second Dense layer with 207 neurons
    Dropout(0.3),                              # Dropout with rate ~0.3031
    Dense(128, activation='relu'), 
    Dropout(0.3),                              # Third Dense layer with 128 neurons                      
    Dense(1, activation='sigmoid')             # Output layer for binary classification
])

# Create the Adam optimizer with the specified learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Compile the model with binary crossentropy loss and accuracy as a metric
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

In [103]:
history = model.fit(
    X_train, y_train,
    epochs=100,                # Maximum number of epochs; training may stop earlier due to early stopping.
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stop]    # Include early stopping in the training process.
)

Epoch 1/100
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.6446 - loss: 0.6374 - val_accuracy: 0.7197 - val_loss: 0.5108
Epoch 2/100
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7218 - loss: 0.5021 - val_accuracy: 0.7419 - val_loss: 0.4910
Epoch 3/100
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7414 - loss: 0.4854 - val_accuracy: 0.7504 - val_loss: 0.4679
Epoch 4/100
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7424 - loss: 0.4830 - val_accuracy: 0.7553 - val_loss: 0.4657
Epoch 5/100
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7546 - loss: 0.4685 - val_accuracy: 0.7632 - val_loss: 0.4565
Epoch 6/100
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7569 - loss: 0.4678 - val_accuracy: 0.7653 - val_loss: 0.4553
Epoch 7/10

# MODEL EVALUATION

In [104]:
from sklearn.metrics import precision_recall_curve
y_probs = model.predict(X_test).ravel()  # or model.predict(X_test)[:, 0] if output shape is (N,1)
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [105]:
# Compute F1 score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]

print(f"Best Threshold: {best_threshold}")

Best Threshold: 0.38282012939453125


In [106]:
def predict_with_best_threshold(model, X, threshold):
    y_probs = model.predict(X).ravel()  # Get probabilities
    return (y_probs >= threshold).astype(int)  # Apply the threshold


In [107]:
y_pred_adjusted = predict_with_best_threshold(model, X_test, best_threshold)

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step


In [108]:
# Evaluate
new_accuracy = accuracy_score(y_test, y_pred_adjusted)
new_f1 = f1_score(y_test, y_pred_adjusted)

print(f"New Accuracy: {new_accuracy:.4f}")
print(f"New F1 Score: {new_f1:.4f}")

New Accuracy: 0.7697
New F1 Score: 0.7339


In [109]:
from sklearn.metrics import confusion_matrix
# for random forest model
confusion_matrix(y_test,y_pred_adjusted)

array([[6781, 2640],
       [ 815, 4764]], dtype=int64)

In [110]:
import pickle

pickle.dump(model,open('ImprovedNNmodel.pkl','wb'))

# TESTING

In [54]:
def test_common_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return len(w1 & w2)

In [55]:
def test_total_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return (len(w1) + len(w2))

In [56]:
def test_fetch_token_features(q1,q2):
    
    SAFE_DIV = 0.0001 

    STOP_WORDS = stopwords.words("english")
    
    token_features = [0.0]*8
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    
    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))
    
    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    
    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    
    
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    
    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    
    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return token_features

In [57]:
def test_fetch_length_features(q1,q2):
    
    length_features = [0.0]*3
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features
    
    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    
    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2
    
    strs = list(distance.lcsubstrings(q1, q2))
    length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
    
    return length_features

In [58]:
def test_fetch_vector_features(q1,q2, word2vec_model):
  
    # Tokenize using NLTK
   q1_tokens = word_tokenize(q1.lower())
   q2_tokens = word_tokenize(q2.lower())

   q1_vector = sentence_to_vector(q1_tokens, word2vec_model)
   q2_vector = sentence_to_vector(q2_tokens, word2vec_model)

    # Compute additional feature representations
   abs_diff = np.abs(q1_vector - q2_vector)  # |q1 - q2|
   elementwise_mult = q1_vector * q2_vector  # q1 * q2

    # Combine all features
   vector_features = np.hstack((q1_vector, q2_vector, abs_diff, elementwise_mult))

   return vector_features  # Return the feature vector only

In [59]:
def query_point_creator(q1,q2):
    
    input_query = []
    
    # preprocess
    q1 = preprocess(q1)
    q2 = preprocess(q2)
    
    # fetch basic features
    input_query.append(len(q1))
    input_query.append(len(q2))
    
    input_query.append(len(q1.split(" ")))
    input_query.append(len(q2.split(" ")))
    
    input_query.append(test_common_words(q1,q2))
    input_query.append(test_total_words(q1,q2))
    input_query.append(round(test_common_words(q1,q2)/test_total_words(q1,q2),2))
    
    # fetch token features
    token_features = test_fetch_token_features(q1,q2)
    input_query.extend(token_features)
    
    # fetch length based features
    length_features = test_fetch_length_features(q1,q2)
    input_query.extend(length_features)
   
    # fetch vector based features
    vector_features = test_fetch_vector_features(q1,q2, word2vec_model)
    input_query.extend(vector_features)   
    
    return np.hstack((np.array(input_query)))

In [None]:
q1 = 'How do I can I promote a "launching soon" Startup page?'
q2 = 'Why do people ask questions everyone Quora they could easily search via Google, Bing or Wikipedia?'
q3 = 'What is the free, simplest & fastest way to promote a know page for a new startup?'
q4 = 'Why do people ask find on Quora that could simply be googled?'

In [None]:
import pickle

pickle.dump(model,open('NNmodel.pkl','wb'))

# ML MODEL

In [39]:
# Perform Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Ensure correct data type for training
X_train = np.array(X_train, dtype=np.float64)
X_test = np.array(X_test, dtype=np.float64)
y_train = np.array(y_train, dtype=np.int64).ravel()
y_test = np.array(y_test, dtype=np.int64).ravel()



In [40]:
# Train RandomForest Model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Make Predictions
y_pred = rf.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Compute F1 Score
f1 = f1_score(y_test, y_pred)

print("F1 Score:", f1)

Model Accuracy: 0.7723
F1 Score: 0.6780715396578538


In [74]:
from skopt import BayesSearchCV
# Define search space for hyperparameters
param_space = {
    "n_estimators": (50, 500),  # Number of trees (int range)
    "max_depth": (10, 50),  # Maximum depth (int range)
    "min_samples_split": (2, 10),  # Minimum samples to split a node
    "min_samples_leaf": (1, 4)  # Minimum samples per leaf
}

In [75]:
# Initialize Bayesian Search
bayes_search = BayesSearchCV(
    RandomForestClassifier(),
    param_space,
    n_iter=20,  # Number of iterations (tries different values)
    cv=5,  # 5-fold cross-validation
    scoring="accuracy",
    n_jobs=-1,  # Use all CPU cores
    random_state=42
)

# Fit the model on training data
bayes_search.fit(X_train, y_train)

# Get the best parameters
print("Best Hyperparameters:", bayes_search.best_params_)

Best Hyperparameters: OrderedDict({'max_depth': 25, 'min_samples_leaf': 2, 'min_samples_split': 9, 'n_estimators': 500})


In [41]:
rf_model = RandomForestClassifier(
    max_depth=25, 
    min_samples_leaf=2, 
    min_samples_split=9, 
    n_estimators=500,
    random_state=42,
    n_jobs=-1  # Uses all CPU cores for faster training
)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)


# EVALUATION

In [42]:
from sklearn.metrics import precision_recall_curve
y_probs = rf_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

In [43]:
# Compute F1 score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]

print(f"Best Threshold: {best_threshold}")

Best Threshold: 0.3946886965636049


In [44]:
# Get predicted probabilities
y_probs = rf_model.predict_proba(X_test)[:, 1]

# Apply the best threshold
y_pred_adjusted = (y_probs >= best_threshold).astype(int)

# Evaluate the new predictions
from sklearn.metrics import accuracy_score, f1_score

new_accuracy = accuracy_score(y_test, y_pred_adjusted)
new_f1 = f1_score(y_test, y_pred_adjusted)

print(f"New Accuracy: {new_accuracy:.4f}")
print(f"New F1 Score: {new_f1:.4f}")

New Accuracy: 0.7516
New F1 Score: 0.7252


In [46]:
from sklearn.metrics import confusion_matrix
# for random forest model
confusion_matrix(y_test,y_pred)

array([[5293, 1028],
       [1172, 2507]], dtype=int64)

# GETTING READY TO BUILD A WEBSITE

In [47]:
def test_common_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return len(w1 & w2)

In [48]:
def test_total_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return (len(w1) + len(w2))

In [50]:
def test_fetch_token_features(q1,q2):
    
    SAFE_DIV = 0.0001 

    STOP_WORDS = stopwords.words("english")
    
    token_features = [0.0]*8
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    
    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))
    
    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    
    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    
    
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    
    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    
    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return token_features

In [51]:
def test_fetch_length_features(q1,q2):
    
    length_features = [0.0]*3
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features
    
    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    
    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2
    
    strs = list(distance.lcsubstrings(q1, q2))
    length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
    
    return length_features

In [52]:
def test_fetch_vector_features(q1,q2, word2vec_model):
  
    # Tokenize using NLTK
   q1_tokens = word_tokenize(q1.lower())
   q2_tokens = word_tokenize(q2.lower())

   q1_vector = sentence_to_vector(q1_tokens, word2vec_model)
   q2_vector = sentence_to_vector(q2_tokens, word2vec_model)

    # Compute additional feature representations
   abs_diff = np.abs(q1_vector - q2_vector)  # |q1 - q2|
   elementwise_mult = q1_vector * q2_vector  # q1 * q2

    # Combine all features
   vector_features = np.hstack((q1_vector, q2_vector, abs_diff, elementwise_mult))

   return vector_features  # Return the feature vector only

In [53]:
def query_point_creator(q1,q2):
    
    input_query = []
    
    # preprocess
    q1 = preprocess(q1)
    q2 = preprocess(q2)
    
    # fetch basic features
    input_query.append(len(q1))
    input_query.append(len(q2))
    
    input_query.append(len(q1.split(" ")))
    input_query.append(len(q2.split(" ")))
    
    input_query.append(test_common_words(q1,q2))
    input_query.append(test_total_words(q1,q2))
    input_query.append(round(test_common_words(q1,q2)/test_total_words(q1,q2),2))
    
    # fetch token features
    token_features = test_fetch_token_features(q1,q2)
    input_query.extend(token_features)
    
    # fetch length based features
    length_features = test_fetch_length_features(q1,q2)
    input_query.extend(length_features)
   
    # fetch vector based features
    vector_features = test_fetch_vector_features(q1,q2, word2vec_model)
    input_query.extend(vector_features)   
    
    return np.hstack((np.array(input_query)))

In [58]:
q1 = 'How do I can I promote a "launching soon" Startup page?'
q2 = 'Why do people ask questions everyone Quora they could easily search via Google, Bing or Wikipedia?'
q3 = 'What is the free, simplest & fastest way to promote a know page for a new startup?'
q4 = 'Why do people ask find on Quora that could simply be googled?'

In [59]:
query_features = query_point_creator(q2, q4).reshape(1, -1)
prediction = rf_model.predict(query_features)
print(prediction)  # Outputs 0 or 1

[1]


In [168]:
import pickle

pickle.dump(rf_model,open('model.pkl','wb'))