**<font color=blue><h3>Featurizing Text Data with TFIDF-Weighted Word-Vectors</h3></font>**

In [1]:
import pandas as pd
#avoid_decoding_problems

# Load the Quora question-pairs CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where that file exists.
data = pd.read_csv("D:/Applied_Ai/Case Studies/Quora/My Work/New/quora.csv")

# Convert every entry of both question columns to str, element by element,
# so that later text processing always receives strings.
# Ref: https://stackoverflow.com/a/6812069
for question_col in ('question1', 'question2'):
    data[question_col] = data[question_col].apply(str)

In [2]:
# Preview the first two rows of the loaded frame.
data.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Pool both question columns so a single vocabulary (and a single IDF weight per
# term) is learned across question1 and question2.
questions = list(data['question1']) + list(data['question2'])

# lowercase=False keeps the original casing of each term in the vocabulary.
tfidf = TfidfVectorizer(lowercase=False)
# Only the fitted vocabulary and idf_ are needed below, so fit() is sufficient;
# the transformed document-term matrix is never used.
tfidf.fit(questions)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new method and fall back on old releases.
if hasattr(tfidf, "get_feature_names_out"):
    vocab_terms = tfidf.get_feature_names_out()
else:
    vocab_terms = tfidf.get_feature_names()

# Lookup table: vocabulary term -> IDF weight.
# Note that the values come from tfidf.idf_, i.e. they are corpus-level IDF
# weights, not per-document TF-IDF scores, despite the variable name.
word2tfidf = dict(zip(vocab_terms, tfidf.idf_))

- After computing the TF-IDF scores, we convert each question into a weighted average of its word2vec vectors, using these scores as the weights.
- Here we use a pre-trained GloVe model which comes free with spaCy. Ref: https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and is therefore stronger in terms of word semantics.

In [5]:
#exctract_word2vec_vectors
#https://github.com/explosion/spaCy/issues/1721
#http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [14]:
import numpy as np
from tqdm import tqdm
import warnings
# Ignore all warnings from here on: no category or module filter is given, so
# every warning raised by later cells is silenced.
warnings.filterwarnings("ignore")

In [16]:
import en_core_web_sm
# NOTE(review): en_core_web_sm is spaCy's small English pipeline. The question-vector
# tables shown further down have 96 columns, so token.vector is 96-dimensional here;
# the small package is not expected to ship a large pretrained word-vector table
# (confirm against the spaCy model documentation). A medium model is available at:
#https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz

nlp = en_core_web_sm.load()

# Probe the token-vector width once, so that a question producing no tokens still
# gets a zero vector of the right length instead of failing on doc[0].
vec_dim = len(nlp("question")[0].vector)

vecs1 = []

# tqdm prints a progress bar while the questions are processed.
for ques1 in tqdm(list(data['question1'])):
    doc1 = nlp(ques1)
    # IDF-weighted SUM of the token vectors of this question. It is deliberately
    # not divided by the token count or by the total weight: the earlier version
    # filled a (n_tokens, dim) matrix with identical rows and averaged them,
    # which produces exactly this sum, so the feature values stay the same.
    weighted_vec1 = np.zeros(vec_dim)
    for word1 in doc1:
        # Tokens that are not in the TF-IDF vocabulary get weight 0 and so
        # contribute nothing; a plain dict lookup replaces the bare except.
        weighted_vec1 += word1.vector * word2tfidf.get(str(word1), 0)
    vecs1.append(weighted_vec1)

# One vector (numpy array) per row, stored as an object column.
data['q1_feats_m'] = list(vecs1)

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [05:20<00:00, 155.95it/s]


In [18]:
# Token-vector width, probed from the loaded pipeline. The accumulator below is
# sized from this value rather than from `doc1`, the leftover loop variable of the
# question1 cell that the earlier version of this cell referenced by mistake.
vec_dim = len(nlp("question")[0].vector)

vecs2 = []

# tqdm prints a progress bar while the questions are processed.
for ques2 in tqdm(list(data['question2'])):
    doc2 = nlp(ques2)
    # IDF-weighted SUM of the token vectors of this question (not normalised by
    # token count or total weight), computed the same way as the question1 features.
    weighted_vec2 = np.zeros(vec_dim)
    for word2 in doc2:
        # Tokens that are not in the TF-IDF vocabulary get weight 0 and so
        # contribute nothing; a plain dict lookup replaces the bare except.
        weighted_vec2 += word2.vector * word2tfidf.get(str(word2), 0)
    vecs2.append(weighted_vec2)

# One vector (numpy array) per row, stored as an object column.
data['q2_feats_m'] = list(vecs2)

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [05:18<00:00, 157.13it/s]


Right now we have two CSV files:
1. data_without_prepro.csv (Simple Preprocessing Features)
2. nlp_features.csv (NLP Features)

_Load both the csv files._

In [20]:
# Read the two previously saved feature files, both decoded as latin-1:
#   p_pro_data <- data_without_prepro.csv (simple preprocessing features)
#   nlp_data   <- nlp_features.csv        (NLP features)
p_pro_data, nlp_data = (
    pd.read_csv(csv_name, encoding='latin-1')
    for csv_name in ("data_without_prepro.csv", "nlp_features.csv")
)

In [22]:
# Columns holding question ids and raw question text; they are removed from every frame.
pair_cols = ['qid1', 'qid2', 'question1', 'question2']

# data1: NLP features. is_duplicate is kept here, so the label survives in exactly one frame.
data1 = nlp_data.drop(columns=pair_cols)

# data2: simple preprocessing features, without the label.
data2 = p_pro_data.drop(columns=pair_cols + ['is_duplicate'])

# data3: what is left of the question frame, including the two vector columns.
data3 = data.drop(columns=pair_cols + ['is_duplicate'])

# Expand each per-row vector into one numeric column per component,
# keeping data3's index so the rows stay aligned.
data3_q1 = pd.DataFrame(data3['q1_feats_m'].values.tolist(), index=data3.index)
data3_q2 = pd.DataFrame(data3['q2_feats_m'].values.tolist(), index=data3.index)

In [23]:
# First rows of data1: the columns kept from nlp_features.csv after dropping the id/text columns.
data1.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,63,63,43,47,0.166667
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,28,24,9,14,0.039216
4,4,0,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,6.0,10.0,67,47,35,56,0.175


In [24]:
# First rows of data2: the columns kept from data_without_prepro.csv after dropping the id/text/label columns.
data2.head()

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,1,1,51,88,8,13,4.0,20.0,0.2,2,0
2,2,1,1,73,59,14,10,4.0,24.0,0.166667,2,0
3,3,1,1,50,65,11,9,0.0,19.0,0.0,2,0
4,4,1,1,76,39,13,7,2.0,20.0,0.1,2,0


In [25]:
# First rows of data3_q1: the question1 vectors, one column per vector component.
data3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,18.89271,41.648096,33.155665,4.310987,42.732992,-9.864838,-10.634421,-11.840693,5.617005,1.020409,...,20.647821,-21.506388,-24.173498,14.336091,-16.272346,25.371137,25.488638,39.202739,3.772958,9.92776
1,-23.343609,12.173132,-0.257286,-17.855058,7.816336,-6.990792,-16.24595,-12.547119,3.286317,-2.855047,...,49.590098,11.299516,-22.147376,-50.868811,-0.627945,-2.702794,-11.93556,17.299254,10.530151,17.804554
2,-0.031534,18.236641,17.321772,23.341717,25.507479,-6.083631,-20.739405,-15.223138,-2.893024,16.497008,...,-2.429696,-2.656703,-10.358425,16.450623,-22.032162,22.730402,2.092286,8.496325,27.627024,-23.033078
3,-22.874645,3.698369,-20.610707,6.31351,11.934285,11.623967,-29.311596,2.801635,-17.637064,-12.357982,...,-25.270801,-17.413254,-30.728475,1.437318,8.165583,21.963248,-6.926002,17.847203,50.869994,6.258697
4,46.234998,74.337402,-16.575459,26.780416,4.814146,-13.08808,-42.409124,30.424017,4.664057,-15.892533,...,22.066719,-34.15731,-19.464928,-52.149147,-60.026301,20.13879,44.375847,43.633056,-26.840464,17.367882


In [26]:
# First rows of data3_q2: the question2 vectors, one column per vector component.
data3_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,18.696167,37.228157,45.303721,5.520344,33.628753,-3.229875,-4.98383,-13.457677,3.85763,4.640148,...,23.76825,-20.440072,-25.349166,12.486474,-19.63778,16.586846,30.426209,43.345757,5.500876,2.262348
1,-12.684361,36.306758,30.951682,-8.703764,16.983735,-17.617952,-16.031544,2.547901,-4.66387,6.987566,...,63.486983,0.217166,-24.077537,-5.160563,-23.868522,4.316975,2.170839,41.508875,10.780357,21.472071
2,1.850462,41.6567,11.439506,-2.610053,31.287295,-0.030191,-33.33981,9.737094,-8.313165,-18.748285,...,-10.40962,-35.576205,-20.263144,39.945732,-22.44319,12.674661,0.095201,21.343025,24.578321,-7.44697
3,5.718443,15.163344,8.306139,33.179072,17.315456,2.685051,-16.80744,0.021019,-10.302241,-14.506871,...,6.46835,-21.470736,0.546877,40.373335,-9.026011,18.770623,-1.556399,1.348674,18.060066,17.892235
4,9.795965,29.631456,29.330271,0.375853,15.920613,7.280994,-18.368975,6.83582,11.165791,7.97955,...,17.493541,-11.318888,-11.37167,14.175069,-32.553833,6.133714,20.308695,17.383724,12.348764,-12.703654


In [27]:
# Column count of each feature frame, paired with the label printed for it.
feature_counts = [
    ("NUMBER OF FEATURES IN NLP DATAFRAME:", data1.shape[1]),
    ("NUMBER OF FEATURES IN PREPROCESSED DATAFRAME:", data2.shape[1]),
    ("NUMBER OF FEATURES IN QUESTION1 W2V  DATAFRAME:", data3_q1.shape[1]),
    ("NUMBER OF FEATURES IN QUESTION2 W2V  DATAFRAME:", data3_q2.shape[1]),
]
for label, n_cols in feature_counts:
    print(label, n_cols)

# The total is the plain sum of the four column counts above.
print("NUMBER OF FEATURES IN FINAL DATAFRAME:", sum(n_cols for _, n_cols in feature_counts))

NUMBER OF FEATURES IN NLP DATAFRAME: 17
NUMBER OF FEATURES IN PREPROCESSED DATAFRAME: 12
NUMBER OF FEATURES IN QUESTION1 W2V  DATAFRAME: 96
NUMBER OF FEATURES IN QUESTION2 W2V  DATAFRAME: 96
NUMBER OF FEATURES IN FINAL DATAFRAME: 221


In [28]:
# Assemble every feature frame into one table keyed on 'id' and write it to disk.

# Give both vector frames the 'id' column of data1 so they can be joined on it
# (assigned by index alignment; this adds the column to the frames in place).
for w2v_frame in (data3_q1, data3_q2):
    w2v_frame['id'] = data1['id']

# data1 <- NLP features left-joined with the simple preprocessing features.
data1 = pd.merge(data1, data2, on='id', how='left')

# data2 <- question1 vectors left-joined with question2 vectors.
data2 = pd.merge(data3_q1, data3_q2, on='id', how='left')

# final <- all hand-crafted features left-joined with all vector features.
final = pd.merge(data1, data2, on='id', how='left')

# Persist the combined feature table.
final.to_csv('final_quora.csv')