In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
import xgboost as xgb

In [15]:
# Load the invitation training records from the tab-separated file.
# Column names are supplied explicitly, so the first line of the file is read as data.
invited_info_train = pd.read_csv(
    "./bytecup2016data/invited_info_train.txt",
    sep='\t',
    names=['qid', 'uid', 'label'],
)

# Work on an independent copy so the frame as loaded stays untouched.
invited_info_train_copy = invited_info_train.copy()
# 0.5 is written into 'label' below. Cast to float first: assigning a fractional
# value into an integer column relies on implicit upcasting, which recent pandas
# versions deprecate.
invited_info_train_copy['label'] = invited_info_train_copy['label'].astype(float)

# Every (qid, uid) pair that occurs more than once gets label 0.5 ...
is_repeated_pair = invited_info_train_copy.duplicated(['qid', 'uid'], keep=False)
invited_info_train_copy.loc[is_repeated_pair, 'label'] = 0.5
# ... and only the first occurrence of each repeated pair is kept.
# NOTE(review): repeats whose labels all agree are also set to 0.5 — confirm this is intended.
invited_info_train_copy = invited_info_train_copy.drop_duplicates(['qid', 'uid'], keep='first')

invited_info_train_copy.head()

Unnamed: 0,qid,uid,label
0,1c525c9d44135bdc40ddd4b8d63738d8,e6a2ecac7f90d426103de95ba7f6d2b0,0.0
1,43c525e69431f916efcf8d09d99602ac,64e1c3152b0ad8ab1a3b6bbda5d2bbe8,1.0
2,e1a1009e93ea09bdd981029b592d89b8,405c2c59eb73d115b5d73d0d0cfc9c94,0.5
3,60f19c960a9300aabaf95ea49a9fe4c2,11ccc1bae27f606616070eca12441ffb,0.0
4,cb8f85a79be9cd9aae3af4a302b865a6,628b7aa38efb4c1ce999fe5cdf46a03e,0.0


In [3]:
# Read the tab-separated question file; column names are given explicitly.
question_info = pd.read_csv(
    "./bytecup2016data/question_info.txt",
    sep='\t',
    names=['qid', 'qtag', 'qwid', 'qcid', '#upvotes', '#ans', '#tqans'],
)
# Deep copy to work on, leaving the loaded frame as-is.
question_info_copy = question_info.copy(deep=True)
question_info_copy.head()

Unnamed: 0,qid,qtag,qwid,qcid,#upvotes,#ans,#tqans
0,c1c0075239841777d5b01c40b38135d2,0,0/1/2/3/4/5/6/7,0/1/2/3/4/5/6/5/7/8/9/3/10,103,6,5
1,367edcb36424493a7cf80f70903a64cd,1,8/9/10/11/12,11/12/13/14/15/15/16/17/12,173,10,5
2,fb9e401d86f20205d97a25f3aad76a67,2,13/14/15/16/17,18/19/20/21/22/23/24/25,221,5,5
3,e153659c6c654cd12122232fca89f4bc,3,18/19/20/21/22/23/24/21/25,26/27/28/29/30/31/32/33/34/35/36/37/38/39/33/3...,164,11,8
4,c1718491292fd44b9b33397d5e6a80ba,1,26/27/28/29/30/31/32/33/34/35/15,5/42/43/15/44/45/46/47/48/26/49/50/51/52/49/53...,67,3,3


In [4]:
# Read the tab-separated user file; column names are given explicitly.
user_info = pd.read_csv(
    "./bytecup2016data/user_info.txt",
    sep="\t",
    names=['uid', 'utag', 'uwid', 'ucid'],
)
# Deep copy to work on, leaving the loaded frame as-is.
user_info_copy = user_info.copy(deep=True)
user_info_copy.head()

Unnamed: 0,uid,utag,uwid,ucid
0,61fa06d8908d0e4710a599f970f0ab5a,0/1,2327/1083/1083/344/3584/1634,608/182/441/182/441/441/442/263/158/141/878
1,4588a1df2461674252ff01c63b59171a,2/3/4/5/6,54/11880/113/13231/13232/113/8864/444/7404,92/93/160/160/183/1022/1022/183/732/732/183/14...
2,e8a0f6906978da77dab3d1d779bf2904,7,45/13233/1871/13234/2182/3754/444/2644,73/953/15/291/182/168/438/1474/107/377/11/1002...
3,ab01652daaa15bcbbb21b7c02a8f3646,8,76/13235/2149/13236,122/129/1571/1837/749/68/300/1481/354
4,4c3694faef04ea6cce28d9d838fc3dda,9/10/11/12,9465/1492/1734/13237,93/1289/232/305/646/345


In [5]:
# Attach the question columns (matched on 'qid') and then the user columns
# (matched on 'uid') to every training pair. Left joins keep all training rows.
question_user_info = (
    invited_info_train_copy
    .merge(question_info_copy, how='left', on='qid')
    .merge(user_info_copy, how='left', on='uid')
)

In [6]:
# Preview the first five rows of the merged frame.
question_user_info.head(5)

Unnamed: 0,qid,uid,label,qtag,qwid,qcid,#upvotes,#ans,#tqans,utag,uwid,ucid
0,1c525c9d44135bdc40ddd4b8d63738d8,e6a2ecac7f90d426103de95ba7f6d2b0,0.0,3,2199/1087/730/2620/2621/45/18/59/1534/263/2622...,452/5/379/353/771/225/263/4/1635/1636/73/26/27...,122,7,6,18/20,3027/4061/1560/239/9596/14642/6398/19117/286/1...,638/34/1051/131/711/49/86/14/2/1006/334/598/82...
1,43c525e69431f916efcf8d09d99602ac,64e1c3152b0ad8ab1a3b6bbda5d2bbe8,1.0,0,9600/2368/0/289/276/74,88/451/300/380/0/1/179/376/68/123/124/125,145,13,9,27/28/29/30,54/22/1105/1492/1685/1330/2338/67/580/1472/16553,92/93/35/1/232/305/1194/224/192/1135/967/183/1...
2,e1a1009e93ea09bdd981029b592d89b8,405c2c59eb73d115b5d73d0d0cfc9c94,0.5,3,502/714/4531/4532,588/588/761/648/1184/2028/90/472,390,22,20,18/20,29056/29057/13301/2457/2644/1330/1124/1492,312/665/2270/2270/573/305/39/312/1002/135/192/...
3,60f19c960a9300aabaf95ea49a9fe4c2,11ccc1bae27f606616070eca12441ffb,0.0,11,3678/489/3679/3680/3681/123,189/488/269/129/46/1884/1179/196/197,17,1,1,51/52/53/54,33239/14462/501/17157/22475/9438,732/732/606/1511/1511/128/587/179/1051/662/102...
4,cb8f85a79be9cd9aae3af4a302b865a6,628b7aa38efb4c1ce999fe5cdf46a03e,0.0,7,3394/10866/5911/3051/10867/10868/10869/10870/7...,1829/1830/461/167/319/241/49/232/1558/1036/128...,2095,132,21,32,1094/12516/14796/5925/14462/5955/10585/5693/33...,324/248/225/217/68/200/493/1511/1511/5/1642/51...


In [7]:
# Columns whose values are '/'-separated id strings to be split into lists.
name_to_split = [
    'qwid',
    'qcid',
    'uwid',
    'ucid',
]

In [8]:
# Turn each '/'-separated id string in the listed columns into a list of id strings.
# Only values that are still strings are split; anything else (missing values, or
# lists left by an earlier run of this cell) passes through unchanged. With
# `.str.split`, re-running the cell would silently replace the lists with NaN.
for name in name_to_split:
    question_user_info[name] = question_user_info[name].map(
        lambda value: value.split('/') if isinstance(value, str) else value
    )

In [9]:
# Separate frame holding only the 'qwid' and 'uwid' columns, all rows.
qwid_uwid = question_user_info[['qwid', 'uwid']].copy()

In [10]:
# Replace each 'qwid' value with the set of its elements (duplicates collapse).
qwid_uwid['qwid'] = qwid_uwid['qwid'].map(set)

In [11]:
# Replace each 'uwid' value with the set of its elements (duplicates collapse).
qwid_uwid['uwid'] = qwid_uwid['uwid'].map(set)

In [12]:
# Preview the first five rows of the set-valued columns.
qwid_uwid.head(5)

Unnamed: 0,qwid,uwid
0,"{1087, 2622, 2620, 2621, 59, 2623, 18, 45, 263...","{14642, 6398, 4061, 19117, 3027, 27660, 410, 2..."
1,"{9600, 289, 0, 74, 276, 2368}","{580, 22, 16553, 2338, 54, 1330, 1472, 1492, 1..."
2,"{714, 502, 4531, 4532}","{1124, 2457, 2644, 1330, 13301, 1492, 29057, 2..."
3,"{3679, 3681, 3680, 3678, 123, 489}","{14462, 22475, 17157, 9438, 33239, 501}"
4,"{3394, 132, 10866, 7852, 7888, 10868, 10869, 1...","{1094, 5955, 5693, 14462, 12516, 3374, 14796, ..."


In [13]:
# For every row, the elements common to its 'qwid' and 'uwid' sets.
# Built by zipping the two columns instead of DataFrame.apply(axis=1), which
# constructs a Series object for each row. The result is always an object Series
# on qwid_uwid's index, including when the frame has no rows.
common_wid = pd.Series(
    [
        q_ids.intersection(u_ids)
        for q_ids, u_ids in zip(qwid_uwid['qwid'], qwid_uwid['uwid'])
    ],
    index=qwid_uwid.index,
    dtype=object,
)

In [14]:
# Number of shared elements per row.
common_wid_len = common_wid.map(len)