In [None]:
!pip3 install gensim
!pip3 install fuzzywuzzy
!pip3 install textblob

In [None]:
import os
from gensim.similarities import WmdSimilarity
import pandas as pd
import numpy as np
import gensim

from gensim import corpora
import gensim.downloader as api
from gensim.matutils import softcossim
from gensim.models import Word2Vec
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm_notebook
from nltk import word_tokenize
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score
import xgboost as xgb
from textblob import Word
from textblob import TextBlob
import spacy
import datetime
from collections import defaultdict
from collections import Counter
import functools

In [None]:
# One-time setup: uncomment the next line to download the large English spaCy model.
#!python -m spacy download en_core_web_lg
# Pipeline is loaded once at notebook level and reused by _getSpacySimilarity.
nlp = spacy.load('en_core_web_lg')  

In [None]:
# Fetch the NLTK stop-word corpus, then keep the English list as a set so the
# many `word in stop_words` membership checks further down are cheap.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
## Load the Drive helper and mount
#from google.colab import drive
#
## This will prompt for authorization.
#drive.mount('/content/drive')

# **Loading data**

In [None]:
# Training split, resolved relative to the working directory (../data/).
# train_path is also used by saveDf as an output location.
train_path = os.path.join('..','data','train_data_v3_processed.csv')
df_train = pd.read_csv(train_path)

In [None]:
# Test split, resolved relative to the working directory (../data/).
test_path = os.path.join('..','data','test_data_v3_processed.csv')
df_test = pd.read_csv(test_path)

In [None]:
print(len(df_train.columns))
df_train.columns

In [None]:
df_train.head()

# Constants

In [None]:
# Column names of the lemmatised question texts (read by the fuzzy features).
QUESTION1 = 'question1_lemma'
QUESTION2 = 'question2_lemma'

# Column names of the original question texts (read by the Word2Vec and spaCy features).
QUESTION1_original = 'question1'
QUESTION2_original = 'question2'

# **Fuzzy features**

In [None]:
def setFuzzyFeatures(df):
    """Add the six fuzzywuzzy similarity scores of each question pair to ``df``.

    Every score is computed row by row on the string form of the
    ``QUESTION1`` / ``QUESTION2`` columns and stored in a new column of ``df``
    (the frame is modified in place; nothing is returned).
    """
    # (output column, fuzzywuzzy scorer) in the order the columns are created.
    scorers = (
        ('fuzz_ratio', fuzz.ratio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in scorers:
        # Bind the scorer as a default argument so each lambda keeps its own.
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(str(row[QUESTION1]), str(row[QUESTION2])),
            axis=1,
        )

# **String features**

### Prepare data for string features

In [None]:
def _get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
    
# Pool the raw question pairs of both splits into one frame with a fresh 0..n-1 index.
ques = pd.concat(
    [df_train[['question1', 'question2']], df_test[['question1', 'question2']]],
    axis=0,
).reset_index(drop=True)

# Map every question text to the set of questions it was paired with (in either
# position). Iterating the two columns in lockstep avoids one label-based Series
# lookup per element, which is what made the index-driven loop slow.
q_dict = defaultdict(set)
for q1_text, q2_text in zip(ques['question1'], ques['question2']):
    q_dict[q1_text].add(q2_text)
    q_dict[q2_text].add(q1_text)


# Lower-cased whitespace tokens of the training questions only.
df_train_question1 = df_train['question1'].map(lambda x: str(x).lower().split())
df_train_question2 = df_train['question2'].map(lambda x: str(x).lower().split())

train_qs = pd.Series(df_train_question1.tolist() + df_train_question2.tolist())

# Word frequencies over the training questions, turned into per-word weights.
words = [token for question in train_qs for token in question]
counts = Counter(words)
weights = {word: _get_weight(count) for word, count in counts.items()}

In [None]:
def _word_match_share(row, stops=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2)) / (len(q1words) + len(q2words))
    return R


def _jaccard(row):
    wic = set(row['question1']).intersection(set(row['question2']))
    uw = set(row['question1']).union(row['question2'])
    if len(uw) == 0:
        uw = [1]
    return (len(wic) / len(uw))


def _common_words(row):
    return len(set(row['question1']).intersection(set(row['question2'])))


def _total_unique_words(row):
    return len(set(row['question1']).union(row['question2']))


def _total_unq_words_stop(row, stops):
    return len([x for x in set(row['question1']).union(row['question2']) if x not in stops])


def _wc_diff(row):
    return abs(len(row['question1']) - len(row['question2']))


def _wc_ratio(row):
    l1 = len(row['question1']) * 1.0
    l2 = len(row['question2'])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2


def _wc_diff_unique(row):
    return abs(len(set(row['question1'])) - len(set(row['question2'])))


def _wc_ratio_unique(row):
    l1 = len(set(row['question1'])) * 1.0
    l2 = len(set(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2


def _wc_diff_unique_stop(row, stops=None):
    return abs(len([x for x in set(row['question1']) if x not in stops]) - len(
        [x for x in set(row['question2']) if x not in stops]))


def _wc_ratio_unique_stop(row, stops=None):
    l1 = len([x for x in set(row['question1']) if x not in stops]) * 1.0
    l2 = len([x for x in set(row['question2']) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2


def _same_start_word(row):
    if not row['question1'] or not row['question2']:
        return np.nan
    return int(row['question1'][0] == row['question2'][0])


def _char_diff(row):
    return abs(len(''.join(row['question1'])) - len(''.join(row['question2'])))


def _char_ratio(row):
    l1 = len(''.join(row['question1']))
    l2 = len(''.join(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2


def _char_diff_unique_stop(row, stops=None):
    return abs(len(''.join([x for x in set(row['question1']) if x not in stops])) - len(
        ''.join([x for x in set(row['question2']) if x not in stops])))

def _tfidf_word_match_share_stops(row, stops=None, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in
                                                                                    q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    R = np.sum(shared_weights) / np.sum(total_weights)
    return R


def _tfidf_word_match_share(row, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        q1words[word] = 1
    for word in row['question2']:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in
                                                                                    q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    R = np.sum(shared_weights) / np.sum(total_weights)
    return R
def _q1_freq(row):
    """Return how many distinct questions question1 is paired with in ``q_dict``."""
    partners = q_dict[row['question1']]
    return len(partners)

def _q2_freq(row):
    """Return how many distinct questions question2 is paired with in ``q_dict``."""
    partners = q_dict[row['question2']]
    return len(partners)

def _q1_q2_intersect(row):
    """Return how many questions are paired with both question1 and question2 in ``q_dict``."""
    partners_q1 = q_dict[row['question1']]
    partners_q2 = q_dict[row['question2']]
    # q_dict values are already sets, so they can be intersected directly.
    return len(partners_q1 & partners_q2)

def _avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

def _numStopWords(sentence):
    """Return how many whitespace-separated words of ``sentence`` are stop words.

    Each word is lower-cased before the lookup: ``stop_words`` comes from the
    NLTK English list, whose entries are lower case, so a capitalised stop
    word such as "What" or "Is" was previously never counted.
    """
    return sum(1 for word in sentence.split() if word.lower() in stop_words)

def _numNumbers(sentence):
    return len([x for x in sentence.split() if x.isdigit()])

def _numUppercaseWords(sentence):
    return len([x for x in sentence.split() if x.isupper()])

def setStringFeatures(df):
    """Add length, overlap, frequency and weighted-match features to ``df`` in place.

    Three groups of columns are created, in the same order as before:

    * text statistics computed directly on the raw ``question1`` / ``question2``
      strings (lengths, stop-word / numeric / upper-case counts);
    * word-level comparisons computed on lower-cased, whitespace-split token
      lists of both questions;
    * pair-graph features looked up in ``q_dict`` by the raw question text.

    Fixes over the previous version:

    * The row helpers index their argument by ``'question1'`` / ``'question2'``.
      With ``raw=True`` pandas may hand them plain ndarrays, which cannot be
      indexed by column name, so rows are now always passed as Series.
    * The word-level helpers used to receive the raw strings and therefore
      iterated characters, not words; ``weights`` is keyed by lower-cased
      tokens, so the weighted matches almost never found an entry. They now
      receive token lists built the same way as the ones behind ``weights``.
    * ``common_words`` was computed twice and the first result discarded; only
      the surviving ``_common_words`` computation is kept, at the position of
      the first assignment so the column order is unchanged.
    """
    def tokenize(text):
        # Same tokenisation as the one used to build `weights`.
        return str(text).lower().split()

    q1_raw = df['question1']
    q2_raw = df['question2']

    # Token view of the pair, sharing df's index so results align on assignment.
    tokens = df[['question1', 'question2']].copy()
    tokens['question1'] = tokens['question1'].map(tokenize)
    tokens['question2'] = tokens['question2'].map(tokenize)

    # Raw text view, used for the q_dict lookups (its keys are raw questions).
    pairs = df[['question1', 'question2']]

    # --- statistics on the raw text -------------------------------------
    df['len_q1'] = q1_raw.apply(lambda x: len(str(x)))
    df['len_q2'] = q2_raw.apply(lambda x: len(str(x)))
    df['diff_len'] = df.len_q1 - df.len_q2
    # Number of distinct non-space characters.
    df['len_char_q1'] = q1_raw.apply(lambda x: len(set(str(x).replace(' ', ''))))
    df['len_char_q2'] = q2_raw.apply(lambda x: len(set(str(x).replace(' ', ''))))
    df['len_word_q1'] = q1_raw.apply(lambda x: len(str(x).split()))
    df['len_word_q2'] = q2_raw.apply(lambda x: len(str(x).split()))
    df['common_words'] = tokens.apply(_common_words, axis=1)
    df['avg_word_q1'] = q1_raw.apply(_avg_word)
    df['avg_word_q2'] = q2_raw.apply(_avg_word)
    df['num_stop_words_q1'] = q1_raw.apply(_numStopWords)
    df['num_stop_words_q2'] = q2_raw.apply(_numStopWords)
    df['numerics_q1'] = q1_raw.apply(_numNumbers)
    df['numerics_q2'] = q2_raw.apply(_numNumbers)
    df['uppercase_q1'] = q1_raw.apply(_numUppercaseWords)
    df['uppercase_q2'] = q2_raw.apply(_numUppercaseWords)

    # --- word-level comparisons on the token lists -----------------------
    df['char_ratio'] = tokens.apply(_char_ratio, axis=1)
    df['jaccard'] = tokens.apply(_jaccard, axis=1)
    df['wc_diff'] = tokens.apply(_wc_diff, axis=1)
    df['wc_ratio'] = tokens.apply(_wc_ratio, axis=1)
    df['wc_diff_unique'] = tokens.apply(_wc_diff_unique, axis=1)
    df['wc_ratio_unique'] = tokens.apply(_wc_ratio_unique, axis=1)
    df['same_start'] = tokens.apply(_same_start_word, axis=1)
    df['char_diff'] = tokens.apply(_char_diff, axis=1)
    df['total_unique_words'] = tokens.apply(_total_unique_words, axis=1)

    # --- pair-graph features keyed by the raw question text --------------
    df['q1_q2_intersect'] = pairs.apply(_q1_q2_intersect, axis=1)
    df['q1_freq'] = pairs.apply(_q1_freq, axis=1)
    df['q2_freq'] = pairs.apply(_q2_freq, axis=1)

    # --- comparisons that need the stop list and / or the word weights ---
    f = functools.partial(_word_match_share, stops=stop_words)
    df['word_match'] = tokens.apply(f, axis=1)

    f = functools.partial(_tfidf_word_match_share, weights=weights)
    df['tfidf_wm'] = tokens.apply(f, axis=1)

    f = functools.partial(_tfidf_word_match_share_stops, stops=stop_words, weights=weights)
    df['tfidf_wm_stops'] = tokens.apply(f, axis=1)

    f = functools.partial(_wc_diff_unique_stop, stops=stop_words)
    df['wc_diff_unq_stop'] = tokens.apply(f, axis=1)

    f = functools.partial(_wc_ratio_unique_stop, stops=stop_words)
    df['wc_ratio_unique_stop'] = tokens.apply(f, axis=1)

    f = functools.partial(_char_diff_unique_stop, stops=stop_words)
    df['char_diff_unq_stop'] = tokens.apply(f, axis=1)

    f = functools.partial(_total_unq_words_stop, stops=stop_words)
    df['total_unq_words_stop'] = tokens.apply(f, axis=1)

    
  

# **Word2Vec features**

In [None]:
# Pretrained GoogleNews vectors, expected under ../data/ in gzipped binary word2vec format.
path_google_news = os.path.join('..','data','GoogleNews-vectors-negative300.bin.gz')
# Load pretrained model (since intermediate data is not included, the model cannot be refined with additional data)
# Loaded once at notebook level; setWord2VecFeatures reads it as a global.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(path_google_news, binary=True)


In [None]:
def _getDictionary(df):
    """Build a gensim ``Dictionary`` from the whitespace tokens of both original question columns.

    All question1 documents come first, followed by all question2 documents.
    """
    tokenized = [text.split() for text in df[QUESTION1_original]]
    tokenized += [text.split() for text in df[QUESTION2_original]]
    return corpora.Dictionary(tokenized)
   
def _softCossim(row, dictionary, similarity_matrix):
    """Return the soft cosine similarity of one question pair.

    Both original questions are whitespace-split, converted to bag-of-words
    vectors with ``dictionary`` and compared by ``softcossim`` using
    ``similarity_matrix``.
    """
    bow_q1 = dictionary.doc2bow(row[QUESTION1_original].split())
    bow_q2 = dictionary.doc2bow(row[QUESTION2_original].split())
    return softcossim(bow_q1, bow_q2, similarity_matrix)
 
def setWord2VecFeatures(df):
    """Add the ``softcossim`` column (soft cosine similarity of each pair) to ``df`` in place.

    The dictionary and the term similarity matrix are built once from ``df``
    and the global ``w2v_model``, then reused for every row.
    """
    vocabulary = _getDictionary(df)
    term_similarities = w2v_model.similarity_matrix(vocabulary)

    # Extra positional arguments are forwarded to _softCossim after the row.
    df['softcossim'] = df.apply(_softCossim, axis=1, args=(vocabulary, term_similarities))
  

# **TextBlob**

In [None]:
def _numSpellingMistakes(sentence):
    """Return how many whitespace-separated words TextBlob's ``correct()`` would change."""
    return sum(1 for word in sentence.split() if TextBlob(word).correct() != word)


def setTextBlobFeatures(df):
    """Add per-question spelling-mistake counts (``mistakes_q1`` / ``mistakes_q2``) to ``df`` in place."""
    for target, source in (('mistakes_q1', 'question1'), ('mistakes_q2', 'question2')):
        df[target] = df[source].apply(_numSpellingMistakes)

# **Spacy**

In [None]:
def _getSpacySimilarity(row):
    """Return the spaCy similarity of the two original questions of ``row``.

    Each question is run through the global ``nlp`` pipeline and the resulting
    documents are compared with ``similarity``.
    """
    doc_q1 = nlp(row[QUESTION1_original])
    doc_q2 = nlp(row[QUESTION2_original])
    return doc_q1.similarity(doc_q2)

def setSpacyFeatures(df):
    """Add the ``spacy_sim`` column (spaCy similarity of each question pair) to ``df`` in place."""
    df['spacy_sim'] = df.apply(_getSpacySimilarity, axis=1)
  

# **Feature-merging functions**

In [None]:
def saveDf(df, path=None):
    """Write ``df`` to CSV, using its ``id`` column as the index.

    Args:
        df: frame to persist; it must contain an ``id`` column.
        path: destination file. When omitted, the test frame (``df_test``) is
            written to ``test_path`` and any other frame to ``train_path``.

    The destination used to be ``train_path`` unconditionally, so saving the
    test frame overwrote the training CSV. Existing ``saveDf(frame)`` calls
    keep working and now land in the matching file.
    """
    if path is None:
        # Identity check: only the notebook-level test frame goes to test_path.
        path = test_path if df is df_test else train_path
    indexed = df.set_index('id')
    indexed.to_csv(path_or_buf=path, sep=',')


def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``."""
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60.
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)

def setFeatures(df):
    """Run every feature builder on ``df`` in order and print how long each one took.

    The builders add their columns to ``df`` in place; nothing is returned.
    """
    print('start setting features')

    # (builder, message printed once it has finished), in execution order.
    stages = (
        (setFuzzyFeatures, 'finished setFuzzyFeatures'),
        (setStringFeatures, 'finished setStringFeatures'),
        (setWord2VecFeatures, 'finished setWord2VecFeatures'),
        (setTextBlobFeatures, 'finished setTextBlobFeatures'),
        (setSpacyFeatures, 'finished setSpacyFeatures'),
    )
    for build, done_message in stages:
        began = datetime.datetime.now()
        build(df)
        finished = datetime.datetime.now()
        elapsed = (finished - began).total_seconds()
        print(done_message)
        print("It took {} to execute this".format(hms_string(elapsed)))
  

# **Execution**

In [None]:
# Alias, not a copy: columns added to df below are added to df_train itself.
df = df_train

In [None]:
# Compute every feature group on the training frame (adds columns in place).
setFeatures(df)

In [None]:
df.head()

In [None]:
# Persist the feature-augmented training frame through saveDf.
saveDf(df)

In [None]:
# Compute every feature group on the test frame (adds columns in place).
setFeatures(df_test)

In [None]:
df_test.head()

In [None]:
# Persist the feature-augmented test frame through saveDf.
saveDf(df_test)