In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
df = pd.read_csv("/datasets/quora/train.csv")

In [2]:
df[df.isnull().any(axis=1)]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [3]:
df.drop(df[df.isnull().any(axis=1)].index,inplace=True)

In [4]:
import re
def text_to_word_list(text , split):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    #split to words for word2vec
    if split :
        text = text.split()

    return text

In [16]:
question1 = []
question2 = []
question1TF = []
question2TF = []
for x in df["question1"]:
    question1.append(text_to_word_list(x, True))
    question1TF.append(text_to_word_list(x, False))
for x in df["question2"]:
    question2.append(text_to_word_list(x, True))
    question2TF.append(text_to_word_list(x, False))

In [17]:
from gensim.models import word2vec

questions = question1 + question2

# train model
model = word2vec.Word2Vec(questions, size=10, min_count=1)

In [18]:
model.save('modelw2v.bin')

In [19]:
from gensim.models import word2vec
model = word2vec.Word2Vec.load('modelw2v.bin')

In [20]:
# exctract word2vec vectors
vecs1 = []
for qu in question1:
    word_vec = []
    for word in qu:
        # word2vec
        word_vec.append(model.wv[word])
    vecs1.append(word_vec)
df['q1_feats'] = list(vecs1)

In [21]:
vecs2 = []
for qu2 in question2:
    word_vec2 = []
    for word in qu2:
        # word2vec
        word_vec2.append(model.wv[word])
    vecs2.append(word_vec2)
df['q2_feats'] = list(vecs2)

In [22]:
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_feats,q2_feats
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[[1.7473717, -2.6262953, -0.10161054, -0.87554...","[[1.7473717, -2.6262953, -0.10161054, -0.87554..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[[1.7473717, -2.6262953, -0.10161054, -0.87554...","[[1.7473717, -2.6262953, -0.10161054, -0.87554..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[[-2.3355067, 2.0692139, -0.8257057, 0.3185072...","[[-2.3355067, 2.0692139, -0.8257057, 0.3185072..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[[-0.16869572, 3.5913603, 0.35479644, 0.276689...","[[6.995065, 3.4047115, -2.1689236, 2.414899, -..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[[2.1625907, -2.6822379, 2.4315636, -0.7801787...","[[2.1625907, -2.6822379, 2.4315636, -0.7801787..."
...,...,...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0,"[[-2.3355067, 2.0692139, -0.8257057, 0.3185072...","[[-2.3355067, 2.0692139, -0.8257057, 0.3185072..."
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1,"[[-2.1759653, 1.8239492, 0.89487946, 1.6334667...","[[-0.86929, -1.8171616, 0.43591565, -0.6005701..."
404287,404287,537928,537929,What is one coin?,What's this coin?,0,"[[1.7473717, -2.6262953, -0.10161054, -0.87554...","[[1.7473717, -2.6262953, -0.10161054, -0.87554..."
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0,"[[1.7473717, -2.6262953, -0.10161054, -0.87554...","[[-3.9164126, 4.0500646, 0.73007447, 1.7626686..."


In [23]:
print(df['q2_feats'][0])

[array([ 1.7473717 , -2.6262953 , -0.10161054, -0.8755403 , -1.1663448 ,
        0.8059146 ,  2.0686438 ,  1.2114148 ,  0.88742393,  0.45057452],
      dtype=float32), array([-0.86929   , -1.8171616 ,  0.43591565, -0.6005701 , -0.38657954,
       -0.67453945,  0.24090974, -0.32868263,  1.8809551 ,  0.4018846 ],
      dtype=float32), array([-0.53186184, -1.4571234 ,  1.7137733 , -1.5512898 ,  0.05082732,
       -0.17809573,  0.3911812 ,  2.0384965 ,  0.7645675 ,  0.3621754 ],
      dtype=float32), array([-0.48199895,  3.2713783 ,  2.0729063 , -1.4789608 , -0.03389227,
        0.34500214,  1.5566995 ,  2.2249734 ,  0.25469473, -4.3693867 ],
      dtype=float32), array([-2.2341025 ,  2.940374  ,  4.9096627 , -0.02052404,  0.8326176 ,
       -3.298926  , -1.1585256 ,  0.69731516,  1.6589822 ,  3.3604858 ],
      dtype=float32), array([-0.48199895,  3.2713783 ,  2.0729063 , -1.4789608 , -0.03389227,
        0.34500214,  1.5566995 ,  2.2249734 ,  0.25469473, -4.3693867 ],
      dtype=float32

In [26]:
questionsTF = question1TF + question2TF


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questionsTF = question1TF + question2TF

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

TypeError: expected string or bytes-like object

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['q2_feats'], df['is_duplicate'], test_size=0.20)