## 3.6 Featurizing text data with TF-IDF weighted word vectors

In [9]:
import pandas as pd 
import numpy as np
import seaborn as sn
import spacy
from tqdm import tqdm
import os


In [10]:
# Only the first 150,000 question pairs are featurized. `nrows` makes the
# parser stop there instead of reading the whole file and slicing afterwards.
df = pd.read_csv('train.csv', nrows=150000)

# Coerce both question columns to plain Python strings; a missing question
# (NaN) becomes the text 'nan', so every row holds a str for the text steps
# that follow.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(str)

In [11]:
# Peek at the first two rows to check which columns were loaded.
df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Fit on the questions of both columns so each word is weighted by how rare
# it is across the whole corpus.
questions = list(df['question1']) + list(df['question2'])

# lowercase=False keeps the original casing, so vocabulary keys are
# case-sensitive. Only the fitted IDF weights are needed, hence fit() rather
# than fit_transform(), whose document-term matrix was never used.
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit(questions)

# Map every vocabulary term to its inverse-document-frequency weight
# (despite the name, the values are IDF weights, not TF-IDF products).
word2tfidf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

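Once the TF-IDF model is fitted, each question is converted to a single vector by combining the word vectors of its tokens, each weighted by that word's IDF score (words missing from the TF-IDF vocabulary get weight 0).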

Here we use a pre-trained GloVe model which comes free with spaCy: https://spacy.io/usage/vectors-similarity

It is trained on Wikipedia and is therefore stronger in terms of word semantics.

In [17]:
import spacy.cli

# Load the large English spaCy pipeline; its per-token vectors are what the
# following cells combine into question-level features.
# NOTE(review): presumably the `en_core_web_lg` package must be installed
# beforehand — confirm the setup steps for this notebook.
nlp = spacy.load("en_core_web_lg")


In [18]:

# Build one feature vector per `question1` entry from the word vectors of the
# `nlp` pipeline loaded above, weighting each token by its IDF score.

# Width of the pipeline's word-vector table (300 columns in the output shown
# below); also used for questions that produce no tokens at all.
vec_dim = nlp.vocab.vectors.shape[1]

vecs1 = []
# tqdm wraps the iterable and prints a progress bar
# https://github.com/noamraph/tqdm
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1)
    # Single accumulator of the embedding width. The earlier version
    # allocated a (tokens x dim) matrix, added every weighted vector to all
    # rows and averaged the identical rows: the same IDF-weighted sum (up to
    # floating-point rounding), but it raised IndexError on a question with
    # no tokens. Note the result is a weighted sum; it is not divided by the
    # token count or by the total weight.
    weighted_sum1 = np.zeros(vec_dim)
    for word1 in doc1:
        # Tokens absent from the TF-IDF vocabulary get weight 0 and so
        # contribute nothing.
        idf = word2tfidf.get(str(word1), 0)
        weighted_sum1 += word1.vector * idf
    vecs1.append(weighted_sum1)
df['q1_feats_m'] = list(vecs1)


100%|██████████████████████████████████████████████████████████████████████████| 150000/150000 [36:22<00:00, 68.73it/s]


In [19]:
# Same featurization as for question1, applied to the `question2` column:
# one IDF-weighted sum of token vectors per question.

# Width of the pipeline's word-vector table; also used for questions that
# produce no tokens at all.
vec_dim = nlp.vocab.vectors.shape[1]

vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2)
    # Single accumulator of the embedding width. The earlier version built a
    # (tokens x dim) matrix of identical rows and averaged it, giving this
    # same sum (up to floating-point rounding) while failing with IndexError
    # on a question with no tokens.
    weighted_sum2 = np.zeros(vec_dim)
    for word2 in doc2:
        # Tokens absent from the TF-IDF vocabulary get weight 0 and so
        # contribute nothing.
        idf = word2tfidf.get(str(word2), 0)
        weighted_sum2 += word2.vector * idf
    vecs2.append(weighted_sum2)
df['q2_feats_m'] = list(vecs2)

100%|████████████████████████████████████████████████████████████████████████| 150000/150000 [9:09:19<00:00,  4.55it/s]


In [20]:
# Load the feature tables that are expected to exist already:
#   nlp_features_train.csv                 - NLP features
#   df_fe_without_preprocessing_train.csv  - simple features
# Fail right away when a file is missing. Merely printing a hint would leave
# the variable undefined (or stale from an earlier kernel session) and the
# error would surface later, far from its cause.
if not os.path.isfile('nlp_features_train.csv'):
    raise FileNotFoundError("download nlp_features_train.csv from drive or run previous notebook")
dfnlp = pd.read_csv("nlp_features_train.csv",encoding='latin-1')

if not os.path.isfile('df_fe_without_preprocessing_train.csv'):
    raise FileNotFoundError("download df_fe_without_preprocessing_train.csv from drive or run previous notebook")
dfppro = pd.read_csv("df_fe_without_preprocessing_train.csv",encoding='latin-1')
    

In [21]:
# Strip the question identifiers and raw text from the NLP feature table; all other columns are kept.
df1 = dfnlp.drop(columns=['qid1', 'qid2', 'question1', 'question2'])



In [22]:
# Strip identifiers, raw text and the label from the simple feature table.
df2 = dfppro.drop(columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate'])


In [23]:
# Copy of the working frame without identifiers, raw text and label; the cells below use its index.
df3 = df.drop(columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate'])


In [24]:
# Spread the per-question vectors stored in `q1_feats_m` into one column per vector dimension.
df3_q1 = pd.DataFrame(data=df['q1_feats_m'].to_numpy().tolist(), index=df3.index)


In [25]:
# Spread the per-question vectors stored in `q2_feats_m` into one column per vector dimension.
df3_q2 = pd.DataFrame(data=df['q2_feats_m'].to_numpy().tolist(), index=df3.index)

In [26]:
# Preview the NLP feature table after dropping the id and text columns.
df1.head()

Unnamed: 0,id,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,token_fuzz_ratio,token_fuzz_partial_ratio,longest_substr_ratio
0,0,0,1,1,66,57,14,12,10.0,23,...,0.785658,1,0,2,13.0,100,93,93,100,0.982759
1,1,0,1,1,51,88,8,13,4.0,20,...,0.466636,1,0,5,12.5,86,63,66,75,0.596154
2,2,0,1,1,73,59,14,10,4.0,24,...,0.285694,1,0,4,12.0,63,63,43,47,0.166667
3,3,0,1,1,50,65,11,9,0.0,19,...,0.0,0,0,2,12.0,28,24,9,14,0.039216
4,4,0,2,1,76,39,13,7,2.0,20,...,0.307669,1,0,6,10.0,67,47,35,56,0.175


In [27]:
# Preview the simple feature table after dropping the id, text and label columns.
df2.head()

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23,0.434783,2,0
1,1,1,1,51,88,8,13,4.0,20,0.2,2,0
2,2,1,1,73,59,14,10,4.0,24,0.166667,2,0
3,3,1,1,50,65,11,9,0.0,19,0.0,2,0
4,4,2,1,76,39,13,7,2.0,20,0.1,3,1


In [28]:
# Preview the expanded question1 vectors (one column per vector dimension).
df3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-16.740091,65.415633,-263.035662,-20.775145,168.297636,37.407273,-63.481824,162.875513,-257.538417,-4.984174,...,120.907209,-133.164315,-110.969923,76.773664,-16.087856,18.627704,-26.155186,-169.588857,-136.848997,94.505627
1,-21.428642,41.31395,82.529678,-105.168384,88.669795,-44.051722,45.340442,110.331544,20.676676,-30.509416,...,51.710714,21.390209,49.634552,-9.09615,-108.569195,58.924667,36.114717,-172.130103,-63.253044,77.733645
2,-48.627473,111.529623,-157.279443,66.929383,200.594982,-25.503589,68.78389,361.208761,-185.260147,80.122401,...,-38.689391,-44.108703,87.16902,27.205132,-68.557598,155.471076,116.562505,-217.461836,-194.959052,62.689888
3,137.20321,79.582893,-110.997889,-171.54902,-100.675922,73.345609,4.323312,176.082701,-144.093351,71.702363,...,134.350396,-150.995105,67.354106,-109.594718,-40.559336,-105.716028,70.618425,85.314299,-157.557805,223.714243
4,8.487725,-110.066285,-212.328131,36.520966,147.79082,-231.396325,-4.917155,339.814584,174.314793,-95.04144,...,66.689967,-110.334746,304.647943,-83.811498,-270.399169,146.452287,364.100157,13.453155,51.538501,-67.508291


In [29]:
# Preview the expanded question2 vectors (one column per vector dimension).
df3_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-5.373565,77.3043,-249.022318,-25.692771,109.562332,59.804325,-79.022943,139.597848,-280.332339,9.790331,...,89.572089,-110.879569,-100.340596,90.604166,20.295473,-14.674574,-19.159758,-129.747007,-118.292253,89.304251
1,-113.239521,106.850477,-30.952364,-91.269426,77.295447,50.405411,29.52747,191.105875,68.722847,7.683953,...,80.583011,52.868427,91.395757,0.8324,-164.61746,102.49418,-85.872281,-65.242727,-116.834744,171.741777
2,-28.355394,20.861005,-60.91124,82.527247,113.185756,39.479117,-11.909298,270.097598,-129.368116,-0.164373,...,88.734341,22.559361,131.338697,128.613064,-43.06886,150.399292,154.15886,-246.701067,4.507912,253.489992
3,-131.672224,-40.301993,-93.485832,-28.065855,178.787958,-39.117698,-4.554926,48.221774,-39.912166,-66.313456,...,46.291135,-82.400769,24.867242,32.917215,-40.122687,21.548114,-81.215482,-164.16198,33.646341,9.997636
4,-5.696685,17.855528,-120.765247,21.64389,78.833768,-74.909262,-37.833239,179.518333,-39.428438,39.485871,...,113.318797,-65.12968,122.98288,-68.149418,-153.339061,-66.136773,8.174143,-33.789423,-25.280121,3.592565


In [30]:
# Column counts of the four feature tables; the last line reports their plain
# sum (columns shared between tables are counted once per table).
n_nlp = df1.shape[1]
n_ppro = df2.shape[1]
n_q1 = df3_q1.shape[1]
n_q2 = df3_q2.shape[1]
print('No of features in nlp dataframe : ', n_nlp)
print('No of features in preprocessed dataframe : ', n_ppro)
print('No of features in question1 glove dataframe : ', n_q1)
print('No of features in question2 glove dataframe : ', n_q2)
print('No of feature in final dataframe : ', n_nlp + n_ppro + n_q1 + n_q2)

No of features in nlp dataframe :  28
No of features in preprocessed dataframe :  12
No of features in question1 glove dataframe :  300
No of features in question2 glove dataframe :  300
No of feature in final dataframe :  640


In [31]:
# Storing the final features to a CSV file

# Nothing is recomputed or written when the output file already exists.
if not os.path.isfile('final_features.csv'):
    # Attach the pair id to both vector tables so they can be merged on it;
    # the column assignment aligns on the row index.
    # NOTE(review): assumes df1 (from nlp_features_train.csv) has the same row
    # order/index as the 150,000-row `df` the vectors were built from — confirm.
    df3_q1['id']=df1['id']
    df3_q2['id']=df1['id']
    # df1 and df2 are rebound here: df1 becomes NLP + simple features,
    # df2 becomes question1 + question2 vectors.
    df1 = df1.merge(df2, on='id' , how='left')
    df2 = df3_q1.merge(df3_q2, on='id', how='left')
    result = df1.merge(df2, on='id', how='left')
    # to_csv runs with its defaults, so the DataFrame index is written as an
    # extra leading column of the file.
    result.to_csv('final_features.csv')