
## DSIT 2021 BIG DATA ANALYTICS KAGGLE KERNEL
### Word2Vec + XGBoost Quora question similarity

### Import the libraries used

In [1]:
import multiprocessing
import time

import gensim
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from nltk import word_tokenize
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from tqdm.notebook import tqdm

stop_words = stopwords.words('english')

### Define some helper functions to be used later

In [2]:
# Built once at definition time: reloading the NLTK stop-word corpus on every
# call is loop-invariant work repeated for each question.
_STOP_WORD_SET = frozenset(stopwords.words('english'))


def tokenize_text(text):
    """Lower-case and tokenize ``text``, dropping English stop words.

    Args:
        text: Any value; it is coerced with ``str()`` before tokenizing.

    Returns:
        list[str]: Tokens produced by ``word_tokenize`` that are not in
        NLTK's English stop-word list, in their original order.
    """
    word_tokens = word_tokenize(str(text).lower())
    return [w for w in word_tokens if w not in _STOP_WORD_SET]


def jaccard_sim(text1, text2):
    """Return the Jaccard similarity of two texts as an integer percentage.

    Both texts are tokenized with ``tokenize_text`` and compared as sets.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        int: ``100 * len(intersection) / len(union)`` truncated to an int,
        or 0 when neither text yields any token.
    """
    tk_text1 = set(tokenize_text(text1))
    tk_text2 = set(tokenize_text(text2))
    union = tk_text1 | tk_text2
    if not union:
        # Both token sets are empty: the ratio would be 0/0, which raises
        # ZeroDivisionError in Python rather than producing NaN.
        return 0
    return int(len(tk_text1 & tk_text2) / len(union) * 100)


def fuzzy_score(text1, text2):
    """Return fuzzywuzzy's ``token_set_ratio`` for the two texts."""
    return fuzz.token_set_ratio(text1, text2)

def wmd(q1, q2, model):
    """Return ``model.wv.wmdistance`` between the tokenized q1 and q2."""
    tokens_a, tokens_b = tokenize_text(q1), tokenize_text(q2)
    return model.wv.wmdistance(tokens_a, tokens_b)

def q2vec(q, model):
    """Embed a question as the mean of its in-vocabulary word vectors.

    Args:
        q: Question text; tokenized with ``tokenize_text``.
        model: Object exposing word vectors through ``model.wv[word]``.

    Returns:
        numpy.ndarray: 1-D mean vector. When no token is in the vocabulary,
        a zero vector is returned whose length is ``model.wv.vector_size``
        (300 if that attribute is absent).
    """
    vectors = []
    for w in tokenize_text(q):
        try:
            vectors.append(model.wv[w])
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is caught so
            # that unrelated failures (and Ctrl-C) are not swallowed.
            continue
    if not vectors:
        # Return a real vector instead of the scalar that np.mean would give
        # for a 1-D zero array reduced over axis 0.
        return np.zeros(getattr(model.wv, 'vector_size', 300))
    return np.mean(np.array(vectors), axis=0)




def cross_val(clf, X, y, k=5):
    """Run k-fold cross validation and print the mean and per-fold metrics.

    Args:
        clf: Estimator handed to ``cross_validate``.
        X: Feature matrix.
        y: Target labels.
        k: Value passed to ``cross_validate`` as ``cv`` (defaults to 5).

    Prints accuracy, macro-averaged precision/recall/F1 and the fit time
    (mean followed by the per-fold array); returns nothing.
    """
    # Report the fold count actually used instead of a hard-coded "5"
    print("Attempting {}-fold cross validation...".format(k))
    scoring = {
        'acc': 'accuracy',
        'prec_macro': 'precision_macro',
        'rec_macro': 'recall_macro',
        'f1_macro': 'f1_macro',
    }
    scores = cross_validate(clf, X, y, cv=k, scoring=scoring, return_train_score=False, n_jobs=2)
    report = (
        ('Accuracy:', 'test_acc'),
        ('Precision:', 'test_prec_macro'),
        ('Recall:', 'test_rec_macro'),
        ('F-Measure:', 'test_f1_macro'),
        ('Fit-Time:', 'fit_time'),
    )
    for label, key in report:
        print(label, np.mean(scores[key]), scores[key])

### We start off by reading our training data and tokenizing the questions

In [3]:
# Load the training pairs and tokenize every question; each token list is
# later fed to Word2Vec as one sentence.
df = pd.read_csv('../input/2b-training/train.csv')
print('Tokenizing Questions...')
list_of_lists = []
# Zip the two columns instead of df.iterrows(): no Series is built per row,
# and total= lets tqdm render a real progress bar instead of an open-ended one.
for q1, q2 in tqdm(zip(df['Question1'], df['Question2']), total=len(df)):
    list_of_lists.append(tokenize_text(q1))
    list_of_lists.append(tokenize_text(q2))
print(len(list_of_lists))



Tokenizing Questions...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


566008


### Instantiate the Word2Vec gensim model and build its vocabulary

In [4]:
# Skip-gram (sg=1) Word2Vec with 300-dimensional vectors, one worker per CPU core
cores = multiprocessing.cpu_count()
model = gensim.models.Word2Vec(
    size=300,
    window=5,
    min_count=5,
    sg=1,
    workers=cores,
)
# NOTE(review): this save runs before build_vocab()/train(), so the file
# written at this point holds a model with no vocabulary and no trained vectors.
model.save("word2vec.model")

# Time the vocabulary scan over the tokenized questions
t = time.time()
model.build_vocab(list_of_lists, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time.time() - t) / 60, 2)))

Time to build vocab: 0.2 mins


### Train the Word2Vec gensim model for 30 epochs

In [5]:
t = time.time()

model.train(list_of_lists, total_examples=model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time.time() - t) / 60, 2)))

# Persist the model now that it is trained, so that loading "word2vec.model"
# yields usable word vectors (done after the timing print so that the
# reported duration covers training only).
model.save("word2vec.model")
# To skip training on a later run, load the saved model instead:
# model = gensim.models.Word2Vec.load("word2vec.model")

Time to train the model: 5.68 mins


### After training our model we generate the question embeddings (vectors)
### We also generate our hand-crafted features

In [6]:
# Mean-of-word-vectors embedding for every training question (one row each)
question1_vectors = np.zeros((df.shape[0], 300))
for i, q in enumerate(tqdm(df.Question1.values)):
    question1_vectors[i, :] = q2vec(q, model)

question2_vectors = np.zeros((df.shape[0], 300))
for i, q in enumerate(tqdm(df.Question2.values)):
    question2_vectors[i, :] = q2vec(q, model)

# Keep a copy of the embeddings on disk
np.save('q1_vectors', question1_vectors)
np.save('q2_vectors', question2_vectors)

# Hand-crafted pair features. Iterating over the two columns directly avoids
# building a Series per row, which df.apply(..., axis=1) does.
df['fuzz_token_set_ratio'] = [
    fuzz.token_set_ratio(str(a), str(b))
    for a, b in zip(df['Question1'], df['Question2'])
]
print("fuzzy done")
df['jaccard_score'] = [
    jaccard_sim(str(a), str(b))
    for a, b in zip(df['Question1'], df['Question2'])
]
print("jaccard done")
df['wmd'] = [
    wmd(a, b, model)
    for a, b in zip(df['Question1'], df['Question2'])
]
print("wmd done")
# NOTE: scipy's `cosine` is the cosine *distance* (1 - similarity), despite the
# column name. A pair containing an all-zero embedding evaluates 0/0 and yields
# NaN (hence the RuntimeWarning); those NaNs are imputed in the cleaning cell.
df['cos_sim'] = [
    cosine(x, y)
    for x, y in zip(
        np.nan_to_num(question1_vectors, posinf=1, neginf=0),
        np.nan_to_num(question2_vectors, posinf=1, neginf=0),
    )
]
print("cosine done")

# Keep only the label and the numeric features (reassign rather than mutate in place)
df = df.drop(columns=['Id', 'Question1', 'Question2'])

HBox(children=(FloatProgress(value=0.0, max=283004.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=283004.0), HTML(value='')))


fuzzy done
jaccard done
wmd done


  dist = 1.0 - uv / np.sqrt(uu * vv)


cosine done


In [7]:
# Vectorize the test set and extract the same features as for the training set
df_test = pd.read_csv('../input/2b-training/test_without_labels.csv')

question1_vectors_test = np.zeros((df_test.shape[0], 300))
for i, q in enumerate(tqdm(df_test.Question1.values)):
    question1_vectors_test[i, :] = q2vec(q, model)

question2_vectors_test = np.zeros((df_test.shape[0], 300))
for i, q in enumerate(tqdm(df_test.Question2.values)):
    question2_vectors_test[i, :] = q2vec(q, model)

# Keep a copy of the embeddings on disk
np.save('q1_vectors_test', question1_vectors_test)
np.save('q2_vectors_test', question2_vectors_test)

# Hand-crafted pair features. Iterating over the two columns directly avoids
# building a Series per row, which df.apply(..., axis=1) does.
df_test['fuzz_token_set_ratio'] = [
    fuzz.token_set_ratio(str(a), str(b))
    for a, b in zip(df_test['Question1'], df_test['Question2'])
]
print("fuzzy done")
df_test['jaccard_score'] = [
    jaccard_sim(str(a), str(b))
    for a, b in zip(df_test['Question1'], df_test['Question2'])
]
print("jaccard done")
df_test['wmd'] = [
    wmd(a, b, model)
    for a, b in zip(df_test['Question1'], df_test['Question2'])
]
print("wmd done")
# NOTE: scipy's `cosine` is the cosine *distance* (1 - similarity), despite the
# column name; an all-zero embedding in a pair evaluates 0/0 and yields NaN.
df_test['cos_sim'] = [
    cosine(x, y)
    for x, y in zip(
        np.nan_to_num(question1_vectors_test, posinf=1, neginf=0),
        np.nan_to_num(question2_vectors_test, posinf=1, neginf=0),
    )
]
print("cosine done")

# Keep only the numeric features (reassign rather than mutate in place)
df_test = df_test.drop(columns=['Id', 'Question1', 'Question2'])

HBox(children=(FloatProgress(value=0.0, max=121287.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=121287.0), HTML(value='')))


fuzzy done
jaccard done
wmd done
cosine done


In [8]:
# clean training df
#df = pd.read_csv('out.csv')
# Turn +/-inf into NaN so every non-finite value is imputed the same way
df = df.replace([np.inf, -np.inf], np.nan)
# print() so the missing-value counts are actually shown: a bare expression
# in the middle of a cell displays nothing
print(df.isnull().sum())
train_means = df.mean()
df = df.fillna(train_means)

# clean test df
df_test = df_test.replace([np.inf, -np.inf], np.nan)
print(df_test.isnull().sum())
# Impute with the *training* means so both sets are filled with the same
# values and no test-set statistics enter the preprocessing
df_test = df_test.fillna(train_means)

### Create the training set, test set as well as our labels

In [9]:
# Separate the label column from the hand-crafted feature columns
X = df.loc[:, df.columns != 'IsDuplicate']
y = df.loc[:, df.columns == 'IsDuplicate']
X_test = df_test.loc[:, df_test.columns != 'IsDuplicate']

# Normalizer rescales each sample (row) to unit norm. Fit on the training
# features only and reuse the fitted transformer on the test features
# instead of re-fitting it there.
normalizer = preprocessing.Normalizer()
X = normalizer.fit_transform(X)
X_test = normalizer.transform(X_test)

### Generate our finalized training and test set by stacking our word vectors and custom features

In [10]:


# Alias the embedding matrices under train/test names
q1_train, q2_train = question1_vectors, question2_vectors
q1_test, q2_test = question1_vectors_test, question2_vectors_test

# One shape per line, in the order: q1 train, q2 train, q1 test, q2 test
print(*(m.shape for m in (q1_train, q2_train, q1_test, q2_test)), sep='\n')

# Column layout: [q1 embedding | q2 embedding | normalized hand-crafted features]
full_train = np.hstack([q1_train, q2_train, X])
full_test = np.hstack([q1_test, q2_test, X_test])

print(*(m.shape for m in (full_train, full_test)), sep='\n')


(283004, 300)
(283004, 300)
(121287, 300)
(121287, 300)
(283004, 604)
(121287, 604)


### Finally, use a model to classify question pairs as duplicates or not

In [14]:
from sklearn.model_selection import cross_validate
import xgboost as xgb

# Encode the target column as integer class labels
le = preprocessing.LabelEncoder()
y = le.fit_transform(df["IsDuplicate"])

# XGBoost hyper-parameters (every constant below, except EARLY_STOP, is
# passed to the classifier)
MAX_TREE_DEPTH = 8
TREE_METHOD = 'gpu_hist'
# Number of boosting rounds; 200 is the n_estimators value the scores
# reported below this cell were produced with
ITERATIONS = 200
SUBSAMPLE = 1
REGULARIZATION = 0.0
GAMMA = 0
POS_WEIGHT = 1
# Deliberately not passed to the classifier: early stopping needs a validation
# set (eval_set) at fit time, and cross_validate() does not supply one
EARLY_STOP = 10

clf = xgb.XGBClassifier(
    tree_method=TREE_METHOD,
    n_estimators=ITERATIONS,
    random_state=0,
    max_depth=MAX_TREE_DEPTH,
    reg_alpha=REGULARIZATION,  # documented sklearn-wrapper name for L1 `alpha`
    gamma=GAMMA,
    subsample=SUBSAMPLE,
    scale_pos_weight=POS_WEIGHT,
    learning_rate=0.01,
    verbosity=0,  # replaces the deprecated `silent` flag
    objective='binary:logistic',
    colsample_bytree=0.9,
)

# perform cross validation
cross_val(clf, full_train, y)

# generate file for Kaggle
# clf.fit(full_train, y)
# pred = clf.predict(full_test)
# test_set_df=pd.read_csv('../input/2b-training/test_without_labels.csv')
# pred_df = pd.DataFrame(data={"Predicted": pred}, index=test_set_df['Id'])
# pred_df.to_csv('testSet_categories.csv')



Attempting 5-fold cross validation...
Accuracy: 0.7463216140158595 [0.74733662 0.74648858 0.74668292 0.74272539 0.74837456]
Precision: 0.7439042461095585 [0.74454819 0.7443002  0.7446263  0.74033704 0.74570949]
Recall: 0.7610651825073024 [0.76172476 0.7615075  0.76186636 0.75724943 0.76297786]
F-Measure: 0.7413735645553698 [0.74229339 0.74161431 0.7418519  0.73771761 0.74339061]
Fit-Time: 123.09309844970703 [124.4544189  125.14486599 124.84476137 117.46773767 123.55370831]
