In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Locations of the input files, all relative to the data directory.
DATA_PATH = "./data/"
TRAIN_PATH = DATA_PATH + "train.csv"
TEST_PATH = DATA_PATH + "test.csv"
WORD_EMBED_PATH = DATA_PATH + "word_embed.txt"
CHAR_EMBED_PATH = DATA_PATH + "char_embed.txt"
QUEST_PATH = DATA_PATH + "question.csv"

# Question pairs used for training and for prediction.
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)

# Embedding files are space-separated with no header row; the first column
# becomes the row index.
word_embedding_data = pd.read_csv(WORD_EMBED_PATH, sep=" ", header=None, index_col=0)
char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, sep=" ", header=None, index_col=0)

# Question table: the "words" and "chars" columns are read as space-joined
# strings and turned into lists of tokens.
question_data = pd.read_csv(QUEST_PATH).assign(
    words=lambda frame: frame["words"].str.split(" "),
    chars=lambda frame: frame["chars"].str.split(" "),
)

# Question features

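- Question frequency: how often each question appears across the train and test pairs
- Question LSI features: topic weights from an LSI model fitted on the TF-IDF vectors
- Cosine similarity between the TF-IDF vectors of the two questions in a pair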

In [2]:
# Stack every question id that occurs on either side of a pair, in the order
# train q1, train q2, test q1, test q2.
total_question = pd.concat(
    [pairs[side] for pairs in (train_data, test_data) for side in ("q1", "q2")]
)

# One row per distinct question id with the number of times it occurs.
quest_count = (
    total_question
    .value_counts()
    .rename_axis("qid")
    .reset_index(name="n_quest")
)
quest_count.head()

Unnamed: 0,qid,n_quest
0,Q489328,129
1,Q436579,125
2,Q119369,124
3,Q081677,123
4,Q209532,123


In [3]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.similarities import SparseMatrixSimilarity

In [4]:
# Build the token <-> integer id mapping from the tokenised questions.
word_dict = Dictionary(documents=question_data["words"])

In [5]:
# Bag-of-words form of every question, in the same row order as question_data.
question_bow = list(map(word_dict.doc2bow, question_data["words"]))
question_bow[:5]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1)],
 [(7, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1)],
 [(19, 1), (20, 1), (21, 1), (22, 1)]]

In [6]:
# Fit TF-IDF weights on the bag-of-words corpus of all questions.
word_tfidf = TfidfModel(corpus=question_bow)

In [7]:
# Re-weight each bag-of-words vector with the fitted TF-IDF model.
question_tfidf = [
    word_tfidf[bow_vector]
    for bow_vector in question_bow
]
question_tfidf[:5]

[[(0, 0.2477406991351563),
  (1, 0.21972489049272725),
  (2, 0.23444410678167835),
  (3, 0.3573948169492875),
  (4, 0.8412196051585514)],
 [(5, 0.3232659378436509),
  (6, 0.5943609648008642),
  (7, 0.16686694369784746),
  (8, 0.5021079206166066),
  (9, 0.5121300968563142)],
 [(7, 0.10802589124621184),
  (10, 0.508204869126432),
  (11, 0.5235092764094532),
  (12, 0.554041495508202),
  (13, 0.38604957788467187)],
 [(14, 0.6004184606813706),
  (15, 0.3968047817369808),
  (16, 0.35601914907908444),
  (17, 0.4285470895882856),
  (18, 0.4142962644751883)],
 [(19, 0.9462734193376207),
  (20, 0.25172175657293255),
  (21, 0.14844472800226852),
  (22, 0.13844470322571945)]]

In [8]:
# Number of LSI topics requested from the model.
NUM_LSI = 100

# Fit the LSI model on the TF-IDF-weighted question corpus.
word_lsi = LsiModel(
    corpus=question_tfidf,
    num_topics=NUM_LSI,
    id2word=word_dict,
)

In [9]:
# Project each question's TF-IDF vector into the LSI topic space, one
# question at a time.
question_lsi = [
    word_lsi[tfidf_vector]
    for tfidf_vector in question_tfidf
]
question_lsi[:5]

[[(0, 0.1658017697533896),
  (1, 0.10614563179430812),
  (2, -0.09628887778448074),
  (3, 0.01736849704400836),
  (4, -0.034682407754473885),
  (5, 0.09993798393879869),
  (6, 0.04254547426180555),
  (7, 0.1070328921897651),
  (8, 0.09439876112097248),
  (9, 0.09129468409087246),
  (10, 0.01757949410915195),
  (11, 0.05424882624105699),
  (12, 0.06354477074793581),
  (13, 0.023717276797831675),
  (14, 0.02205009277110225),
  (15, 0.07367513282045349),
  (16, 0.0056007563191569215),
  (17, 0.021500204365267308),
  (18, 0.1390728673007452),
  (19, -0.06646995703261187),
  (20, 0.06103349329917703),
  (21, -0.05532799033588197),
  (22, -0.08855464982254053),
  (23, 0.02535780504613975),
  (24, 0.018341648171933428),
  (25, -0.09340411421024535),
  (26, 0.11724074331062381),
  (27, -0.05666508874444189),
  (28, 0.011757827438216512),
  (29, 0.10448892463402279),
  (30, -0.05581804273906715),
  (31, 0.09944485876001373),
  (32, 0.15869967918124114),
  (33, -0.011802819240693502),
  (34, 0.0

In [10]:
# Build a dense (n_questions, NUM_LSI) matrix keyed by topic id.
# Each entry of question_lsi is a sparse list of (topic_id, weight) pairs, and
# gensim may leave out topics whose weight is (near) zero, or return an empty
# list for a question with no weighted terms. Taking the weights by position
# would then shift values into the wrong topic columns, so every weight is
# written to the column named by its topic id and missing topics stay 0.0.
lsi_matrix = np.zeros((len(question_lsi), NUM_LSI))
for row_pos, topic_weights in enumerate(question_lsi):
    for topic_id, weight in topic_weights:
        lsi_matrix[row_pos, topic_id] = weight

# Put the question id in front of its topic weights; both frames use the
# default positional index, so rows line up one-to-one.
lsi_feature = pd.concat([question_data[["qid"]], pd.DataFrame(lsi_matrix)], axis=1)
lsi_feature.head()

Unnamed: 0,qid,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,Q000000,0.165802,0.106146,-0.096289,0.017368,-0.034682,0.099938,0.042545,0.107033,0.094399,...,0.000276,-0.005352,0.013101,-0.043726,0.018289,0.038202,-0.041938,0.000855,0.000152,0.004717
1,Q000001,0.186338,-0.07775,-0.103032,0.002319,0.203311,0.127353,-0.219649,-0.02903,-0.040761,...,0.088415,-0.122151,-0.063446,0.031962,-0.073047,0.008507,-0.060404,0.072514,-0.111591,0.027551
2,Q000002,0.056107,-0.014552,-0.031327,0.013678,0.028384,0.013188,-0.023608,-0.02316,-0.013257,...,0.021917,-0.029574,-0.018084,0.046179,-0.003465,-0.008996,-0.019373,-0.027318,-0.014738,0.011288
3,Q000003,0.1167,-0.088068,-0.035109,-0.025131,0.019472,-0.124079,0.010816,0.01341,0.068648,...,0.042782,0.013973,-0.081704,-0.023392,-0.012872,-0.027941,0.078527,-0.041033,-0.017079,0.037401
4,Q000004,0.083934,-0.058498,0.029994,-0.009854,-0.017631,0.036869,-0.02674,-0.013804,0.042177,...,-0.009225,0.001859,-0.004401,0.034703,0.020005,-0.002597,0.003319,0.011361,0.000523,0.020902


In [11]:
# Similarity index over the TF-IDF vectors of all questions. The feature
# dimension is the number of entries in the dictionary's document-frequency
# table.
index = SparseMatrixSimilarity(
    corpus=question_tfidf,
    num_features=len(word_dict.dfs),
)

In [25]:
def generate_feature1(data):
    """Look up how often each question of a pair occurs.

    Left-joins ``data`` with the module-level ``quest_count`` table twice,
    once on ``q1`` and once on ``q2``.

    Args:
        data: DataFrame with at least the columns ``q1`` and ``q2``.

    Returns:
        DataFrame with the columns ``q1``, ``q2``, ``num_q1`` and ``num_q2``;
        any other column of ``data`` is dropped.
    """
    # Count for the first question of the pair.
    with_q1 = data.merge(quest_count, how="left", left_on="q1", right_on="qid")
    with_q1 = with_q1[["q1", "q2", "n_quest"]].rename(columns={"n_quest": "num_q1"})

    # Count for the second question of the pair.
    with_q2 = with_q1.merge(quest_count, how="left", left_on="q2", right_on="qid")
    with_q2 = with_q2[["q1", "q2", "num_q1", "n_quest"]].rename(columns={"n_quest": "num_q2"})
    return with_q2

# Question-frequency features for the training pairs, then the test pairs.
train_feature1, test_feature1 = map(generate_feature1, (train_data, test_data))

In [40]:
def generate_feature2(data):
    """Map both questions of each pair to their index in ``question_data``.

    Left-joins ``data`` with a (``index``, ``qid``) lookup built from the
    module-level ``question_data``, once on ``q1`` and once on ``q2``.

    Args:
        data: DataFrame with at least the columns ``q1`` and ``q2``.

    Returns:
        DataFrame keeping the columns ``q1``, ``q2``, ``ind1`` and ``ind2``,
        where ``ind1`` / ``ind2`` are the ``question_data`` index values of
        the matching question ids.
    """
    # reset_index() exposes the index of question_data as an "index" column
    # next to the question id (the row position for a default RangeIndex).
    qid_lookup = question_data.filter(["qid"]).reset_index()

    with_first = data.merge(qid_lookup, how="left", left_on="q1", right_on="qid")
    with_first = with_first.filter(["q1", "q2", "index"]).rename(columns={"index": "ind1"})

    with_both = with_first.merge(qid_lookup, how="left", left_on="q2", right_on="qid")
    with_both = with_both.filter(["q1", "q2", "ind1", "index"]).rename(columns={"index": "ind2"})

    # NOTE: the TF-IDF similarity column is currently disabled. The per-row
    # lookup that was tried here was:
    #     data["sim"] = data.apply(lambda x: index[question_tfidf[x["ind1"]]][x["ind2"]], axis=1)
    return with_both

# Question-index lookups for the training pairs, then the test pairs.
train_feature2, test_feature2 = map(generate_feature2, (train_data, test_data))

Unnamed: 0,q1,q2,ind1,ind2
0,Q397345,Q538594,397345,538594
1,Q193805,Q699273,193805,699273
2,Q085471,Q676160,85471,676160
3,Q189314,Q438123,189314,438123
4,Q267714,Q290126,267714,290126


In [None]:
# Preview the first rows of the index lookup for the training pairs.
train_feature2.head(n=5)