## Featurizing text data with tfidf weighted word-vectors

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import sys
import os
from tqdm import tqdm

#!pip install spacy
import spacy


# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools

In [2]:
df = pd.read_csv('train.csv')
df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0


In [3]:
# convert both questions into string only
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))

In [4]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
questions = list(df['question1'])+list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)

<808580x109679 sparse matrix of type '<class 'numpy.float64'>'
	with 8146555 stored elements in Compressed Sparse Row format>

In [6]:
word2tfidf = dict(zip(tfidf.get_feature_names(),tfidf.idf_))

__After we find TFIDF scores, we convert each questions to weighted average of word2vec vectors by these scores__<br>
__here we are goining to use GLOVE model, which comes free with "SPACY"__<br>
https://spacy.io/usage/vectors-similarity<br>
__It is trained on wikipedia therefore, it is stronger in terms of word semantics__

In [9]:
!python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_md
!python -m spacy download en_core_web_sm

Collecting en-core-web-lg==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.0/en_core_web_lg-3.4.0-py3-none-any.whl (587.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en_core_web_lg')
vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progrss bar
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)])
    for word1 in doc1:
        # word2vec
        vec1 = word1.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1 += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)
df['q1_feats_m'] = list(vecs1)

100%|████████████████████████████████████████████████████████████████████████| 404290/404290 [1:09:06<00:00, 97.51it/s]


300

In [27]:
x=nlp('man')
print(len(x.vector))
print(x.vector)

300
[ -1.2867    -0.7992    -2.092     -0.77679   -2.5057     2.7123
   0.59127    3.2927    -1.5826     6.4515     1.3452    -1.9711
   0.93059    2.8943     4.2116     1.6        2.6821    -8.4476
   2.3301     6.0751    -0.39937    7.3433    -2.2546    -5.9357
   3.6748    -4.9191    -3.1941    -4.2882     3.4951    -3.1585
   0.69749    0.48132   -0.6059     0.22147   -2.9045     0.27525
  -6.0088     5.0995    -3.367      2.6089    -5.6207    -2.6762
   6.0931     3.1168     3.2641    -4.0576    -4.435      1.4214
   0.59049    8.941      2.0718     5.3188     2.8866     0.0945
  -0.25755    0.93984    7.9412    -2.2701    -0.65029    1.4952
  -2.5503    -3.7978    -5.853     -1.7847     1.4484    -3.9781
  -1.3968   -10.793     -4.5546    -0.12542    4.4986     1.7492
   0.50073   -1.1922     2.0405    -2.1606    -1.5879    10.005
   1.5086    -2.7168    -1.2617    -2.1364     1.2624    -4.1934
   0.87337    2.2741    -1.8725     4.7847    -0.19699    0.49063
   1.676     -7.2461

In [12]:
vecs2 = []

for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2)
    mean_vec2 = np.zeros([len(doc2), len(doc2[0].vector)])
    
    for word2 in doc2:
        # word2vec
        vec2 = word2.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word2)]
        except:
            #print word
            idf=0
            
        # compute final vec
        mean_vec2+= vec2*idf
    mean_vec2 = mean_vec2.mean(axis=0)
    vecs2.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

100%|████████████████████████████████████████████████████████████████████████| 404290/404290 [1:22:06<00:00, 82.07it/s]


### Merging all the three featurized trained dataset

In [13]:
if os.path.isfile('nlp_features_train.csv'):
    dfnlp = pd.read_csv('nlp_features_train.csv', encoding='latin-1')
else:
    print("download nlp_features_train.csv from drive or run previous notebook")
    
if os.path.isfile('df_fe_without_preprocessing_train.csv'):
    dfppro = pd.read_csv('df_fe_without_preprocessing_train.csv', encoding='latin-1')
else:
    print("download df_fe_without_preprocessing_train.csv from drive or run previous notbook")
    

In [15]:
dfnlp.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,0.99998,0.833319,0.999983,0.999983,...,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,0.799984,0.399996,0.749981,0.599988,...,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154


In [16]:
dfppro.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,4,1,51,88,8,13,4.0,20.0,0.2,5,3


In [18]:
df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_feats_m,q2_feats_m
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[-17.302040576934814, 65.07790648937225, -262....","[-5.95164680480957, 77.03271555900574, -248.92..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[-21.27127695083618, 42.34149789810181, 84.177...","[-111.66787052154541, 108.6460747718811, -29.2..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[-48.436742186546326, 112.14766091108322, -157...","[-29.703657031059265, 19.86686795949936, -63.5..."


In [17]:
df1 = dfnlp.drop(['qid1','qid2', 'question1', 'question2'], axis=1)
df2 = dfppro.drop(['qid1','qid2', 'question1', 'question2', 'is_duplicate'], axis=1)
df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)

df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index=df3.index)
df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index=df3.index)

In [19]:
df1.head(1)

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759


In [21]:
df2.head(2)

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,4,1,51,88,8,13,4.0,20.0,0.2,5,3


In [22]:
df3_q1.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-17.302041,65.077906,-262.939456,-20.686562,168.056481,37.079087,-62.802002,163.003715,-256.633855,-5.613477,...,120.913699,-132.868682,-111.14414,76.800932,-17.031813,18.101131,-26.370254,-169.293018,-136.936277,95.165242
1,-21.271277,42.341498,84.177521,-106.393414,88.151337,-43.99884,45.112466,109.73658,21.342126,-31.136926,...,51.731451,21.67201,50.035608,-10.191149,-110.361197,60.418664,36.341111,-174.373243,-63.843372,79.621566
2,-48.436742,112.147661,-157.016985,66.946747,200.748916,-25.216032,68.918125,361.157204,-185.235991,79.779303,...,-38.794809,-44.498598,87.357947,27.83921,-68.311364,156.044141,117.030545,-217.245489,-194.690445,63.290935


In [23]:
df3_q2.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-5.951647,77.032716,-248.923202,-25.578063,109.20947,59.628984,-78.496038,139.783484,-279.317965,9.203245,...,89.509871,-110.533625,-100.479424,90.664803,19.507948,-15.140525,-19.345136,-129.408695,-118.284831,89.844395
1,-111.667871,108.646075,-29.244612,-92.022102,76.001256,50.877741,28.651991,189.795767,68.172862,6.925361,...,81.915579,52.01094,91.056819,-0.052428,-165.972075,103.07754,-83.073252,-68.480301,-117.666431,173.873648
2,-29.703657,19.866868,-63.588384,82.38214,110.056585,37.244608,-10.899151,268.314468,-127.871219,-0.33702,...,88.559925,19.308219,128.521377,126.306732,-44.066754,148.308988,152.486339,-244.803431,2.997449,250.107358


In [28]:
# finding the number of features
print("Number of features in nlp datafram ", df1.shape[1])
print("Number of features in preprocess datafram ", df2.shape[1])
print("Number of features in q1 word2vec datafram ", df3_q1.shape[1])
print("Number of features in q2 word2vec datafram ", df3_q2.shape[1])
print("Number of features in final dataframe ",df1.shape[1]+df2.shape[1]+df3_q1.shape[1]+df3_q2.shape[1] )

Number of features in nlp datafram  17
Number of features in preprocess datafram  12
Number of features in q1 word2vec datafram  300
Number of features in q2 word2vec datafram  300
Number of features in final dataframe  629


In [None]:
# storing the final features to csv file
if not os.path.isfile('final_features.csv'):
    df3_q1['id']=df1['id']
    df3_q2['id']=df1['id']
    df1  = df1.merge(df2, on='id',how='left')
    df2  = df3_q1.merge(df3_q2, on='id',how='left')
    result  = df1.merge(df2, on='id',how='left')
    result.to_csv('final_features.csv')