In [2]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [87]:
import os
import pandas as pd
import itertools
import numpy as np
import sys

from operator import itemgetter
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from dateutil.parser import parse as dt_parse
from src.DataReader import DataReader
from src.constants import SPLIT_TIME, STORE_LOCATION, DATA_ROOT, POSTS_FILE, VOTES_FILE

In [4]:
# Folder under which POSTS_FILE and VOTES_FILE are looked up below.
data_directory = DATA_ROOT

In [5]:
# Read the posts file through the project's DataReader; read_data() does the
# actual loading and prints its own progress.
# NOTE(review): the meaning of the second positional argument (True) is not
# visible in this notebook — confirm against src.DataReader.
post_reader = DataReader(os.path.join(data_directory, POSTS_FILE), True)
post_reader.read_data()

Reading file...
Converting...
Progress | ██████████ | 100% || Estimated time remaining: 0.0 seconds
Conversion complete...
Forming data frame...


In [6]:
# Posts frame used throughout the notebook.
# NOTE(review): this reaches into the reader's private `_df` attribute; prefer a
# public accessor if DataReader offers one.
pdf = post_reader._df

In [7]:
# Read the votes file the same way as the posts file.
# NOTE(review): the meaning of the second positional argument (True) is not
# visible in this notebook — confirm against src.DataReader.
vote_reader = DataReader(os.path.join(data_directory, VOTES_FILE), True)
vote_reader.read_data()

Reading file...
Converting...
Progress | █████████- | 90% || Estimated time remaining: 113.0 seconds
Conversion complete...
Forming data frame...


In [8]:
# Votes frame (taken from the reader's private `_df` attribute).
vdf = vote_reader._df
# Keep only the votes with VoteTypeId == 1.
# presumably type 1 marks an "accepted answer" vote, as the variable name
# implies — verify against the dump schema.
accepted_votes = vdf[vdf.VoteTypeId==1]

### Filter questions to ones that have > 3 answers, an accepted answer, *AND* tags

In [9]:
q_many_ans = pdf.Id[pdf.AnswerCount > 3]
q_has_tags = pdf.Id[pdf.Tags.notnull()]
q_acc_ans = pdf.Id[pdf.AcceptedAnswerId.notnull()]

q_ids = (set(q_many_ans) & set(q_acc_ans.values) & set(q_has_tags))

print len(q_ids)

22067


In [10]:
# Posts whose ParentId is one of the selected questions (i.e. their answers),
# restricted to rows with a known owner.
ans_df = pdf[pdf.ParentId.isin(q_ids) & pdf.OwnerUserId.notnull()]

In [11]:
# Left-join the VoteTypeId == 1 votes onto the answers (answer Id == vote PostId).
# Answers without such a vote keep NaN in the vote columns; vote columns whose
# names clash with post columns get the `_vt` suffix.
ans_vote_df = ans_df.merge(accepted_votes, how='left', left_on='Id', right_on='PostId', suffixes=('', '_vt'))

In [12]:
# One row per answer that carries a VoteTypeId == 1 vote: the question it
# belongs to, the answer's author and the answer's creation date.
has_accept_vote = ans_vote_df.VoteTypeId == 1
q_acc_user = ans_vote_df.loc[has_accept_vote, ['ParentId', 'OwnerUserId', 'CreationDate']]

In [13]:
# Date range covered by the selected answers. The Series reductions are
# vectorised and skip missing timestamps, unlike the Python built-ins, which
# iterate the column element by element.
q_acc_user.CreationDate.min(), q_acc_user.CreationDate.max()

(Timestamp('2010-07-20 19:21:52.240000'),
 Timestamp('2016-09-03 22:38:17.880000'))

### Add ranked users by score

In [14]:
def get_scored_users(grp):
    """Map every user in a group of answers to the score of their answer.

    grp: frame with at least the columns `OwnerUserId` and `Score`
        (in this notebook: all answers of one question).

    Returns a dict {OwnerUserId: Score}; a user that occurs several times in
    the group collapses to a single entry.
    """
    score_by_user = grp.set_index('OwnerUserId')['Score']
    return score_by_user.to_dict()

In [15]:
# Group the answers by the question they belong to.
ans_grps = ans_vote_df.groupby(['ParentId'])

In [16]:
# One {user: answer score} dict per question group.
scored_users = ans_grps.apply(get_scored_users)

In [17]:
# Wrap the result in a frame so it can be merged below. No column name is
# given, so the single column is labelled `0` (see the head() output further down).
scored_users_df = pd.DataFrame(scored_users)

In [18]:
# Attach each question's {user: score} dict to its accepted-answer row
# (ParentId on the left is matched against the index of scored_users_df).
# NOTE(review): this reassigns q_acc_user, so re-running the cell merges a
# second time; run it exactly once after the cell that creates q_acc_user.
q_acc_user = q_acc_user.merge(scored_users_df, how='left', left_on='ParentId', right_index=True)

In [23]:
# Renumber the rows 0..n-1; the previous row labels are kept as an `index`
# column (visible in the output below).
# NOTE(review): reassigns q_acc_user, so the cell is not safe to re-run.
q_acc_user = q_acc_user.reset_index()

In [24]:
q_acc_user.head()

Unnamed: 0,index,ParentId,OwnerUserId,CreationDate,0
0,0,5.0,45.0,2010-07-20 19:21:52.240,"{226.0: 16, 45.0: 58, 1102.0: 20, 48.0: 17, 24..."
1,1,1.0,8.0,2010-07-20 19:22:20.193,"{33.0: -1, 35.0: 5, 39.0: 4, 8.0: 117, 117203...."
2,6,8.0,38.0,2010-07-20 19:23:54.230,"{160.0: 2, 38.0: 16, 78024.0: 1, 173.0: 20, 33..."
3,7,2.0,34.0,2010-07-20 19:24:52.200,"{34.0: 26, 99.0: 2, 198.0: 4, 72.0: 5, 174221...."
4,16,20.0,38.0,2010-07-20 19:32:21.590,"{38.0: 11, 72.0: 21, 400.0: 0, 232.0: 15, 210...."


In [25]:
# train_df = train_df.merge(scored_users_df, how='left', left_on='ParentId', right_index=True)

# test_df = test_df.merge(scored_users_df, how='left', left_on='ParentId', right_index=True)

### Add candidates using Jaccard similarity of tags

In [26]:
# Posts created up to and including SPLIT_TIME; the user tag profiles below are
# built from these rows only.
# NOTE(review): the bound is inclusive (<=) here, while the train/test split
# further down sends AnswerCreationDate == SPLIT_TIME to the test set — confirm
# which boundary is intended.
pdf_train = pdf[pdf.CreationDate <= dt_parse(SPLIT_TIME)]

In [27]:
# (user, parent question) pairs for PostTypeId == 2 rows with a known owner.
# presumably PostTypeId 2 = answer and 1 = question — verify against the dump schema.
users_pdf = pdf_train[(pdf_train.OwnerUserId.notnull()) & (pdf_train.PostTypeId==2) ][['OwnerUserId', 'ParentId']]

In [28]:
# Tags and Id of every PostTypeId == 1 row, taken from the full posts frame
# (not limited to the period before SPLIT_TIME).
q_tags_df = pdf[pdf.PostTypeId==1][['Tags', 'Id']]

In [29]:
# Give every (user, parent question) pair the tags of that parent question.
# Left join: pairs whose ParentId has no match in q_tags_df keep NaN tags.
user_tag_df = users_pdf.merge(q_tags_df, how='left', left_on='ParentId', right_on='Id', suffixes=('_us', '_pt'))

In [30]:
# One group per user, holding the tagged questions that user answered.
user_grps = user_tag_df.groupby(['OwnerUserId'])

In [31]:
def get_user_tags(grp):
    """Collect the distinct tags found in a group's `Tags` column.

    grp: frame whose `Tags` column holds iterables of tags; null entries are
        ignored.

    Returns a set containing every tag that occurs in at least one non-null
    entry (an empty set when there is none).
    """
    distinct_tags = set()
    for tag_collection in grp.Tags.dropna():
        distinct_tags.update(tag_collection)
    return distinct_tags

In [32]:
# Tag profile per user: the set of all tags on the questions they answered.
user_tags = user_grps.apply(get_user_tags)

In [33]:
# Frame with a `Tags` column; reset_index() moves OwnerUserId out of the index
# into a column and renumbers the rows 0..n-1.
user_tags_df = pd.DataFrame(user_tags, columns=['Tags']).reset_index()

In [34]:
# Union of all tags across every user's tag set.
# NOTE(review): u_tag_vals is not referenced again in the cells shown here.
u_tag_vals = get_user_tags(user_tags_df)

In [36]:
# Turn each question's tags into a {tag: count} mapping and fit the vectorizer
# on them. This fixes the tag vocabulary (one matrix column per distinct tag)
# that the later transform() calls reuse.
q_tag_cnt = q_tags_df.Tags.apply(lambda x: Counter(x))
vectorizer = DictVectorizer()
q_tag_arr = vectorizer.fit_transform(q_tag_cnt.values)

# Sanity check: one matrix row per question.
print q_tag_cnt.shape, q_tag_arr.shape

(652128,) (652128, 1436)


In [37]:
# Encode every user's tag set with the vocabulary fitted on the question tags.
# Each tag occurs once in a set, so every count is 1.
user_tag_cnt = user_tags_df.Tags.apply(lambda x: Counter(x))
user_tag_arr = vectorizer.transform(user_tag_cnt.values)

# Sanity check: one matrix row per user, same column count as q_tag_arr.
print user_tag_cnt.shape, user_tag_arr.shape

(31834,) (31834, 1436)


In [38]:
# Add the question's tags (and its Id) to every accepted-answer row.
q_acc_user_w_tags = q_acc_user.merge(q_tags_df, how='left', left_on='ParentId', right_on='Id', suffixes=('_qu', '_pt'))

# Encode those tags with the same fitted vocabulary, one matrix row per
# accepted-answer row.
# NOTE(review): Counter(x) raises if a row has no matching tags (NaN after the
# left join) — presumably excluded by the earlier "has tags" filter; confirm.
q_acc_user_tag_cnt = q_acc_user_w_tags.Tags.apply(lambda x: Counter(x))
q_acc_user_tag_arr = vectorizer.transform(q_acc_user_tag_cnt.values)

print q_acc_user_tag_cnt.shape, q_acc_user_tag_arr.shape

(21338,) (21338, 1436)


In [40]:
# Dot product of every question's tag vector with every user's tag vector:
# rows are questions, columns are users, in the row order of
# q_acc_user_tag_arr and user_tag_arr respectively.
# NOTE(review): toarray() makes this dense — 21338 x 31834 entries in the
# recorded run — so it needs several GB of memory.
q_u_dot = q_acc_user_tag_arr.dot(user_tag_arr.T).toarray()

In [41]:
user_tag_arr_exp = user_tag_arr.toarray()
q_acc_user_tag_arr_exp = q_acc_user_tag_arr.toarray()

print 'Starting jc...'

for row_it in range(q_acc_user_tag_arr.shape[0]):
    q_tag_arr_row = q_acc_user_tag_arr_exp[row_it]
    q_user_tag_union = np.logical_or(user_tag_arr_exp, q_tag_arr_row)
    q_u_dot[row_it] = q_u_dot[row_it] / q_user_tag_union.sum(axis=1)
    
    if (row_it+1)%1000==0:
        sys.stdout.write('.')
        sys.stdout.flush()

Starting jc...
.....................

In [45]:
# np.save('data_temp/q_u_dot.npy', q_u_dot)

In [46]:
# Placeholder column of empty strings.
# NOTE(review): a later cell assigns the real score dicts to this same column,
# so this placeholder looks redundant — confirm before removing.
q_acc_user_w_tags['jc_scores'] = ''

In [58]:
# Column j of q_u_dot belongs to row j of user_tags_df (both follow the row
# order of user_tag_arr, and user_tags_df was renumbered 0..n-1 by
# reset_index()). Pull the user ids out once as a plain array and index it by
# position, instead of going through the removed `.ix` indexer on every pass.
candidate_user_ids = user_tags_df.OwnerUserId.values

# One {user id: jaccard score} dict per question, keeping only the users whose
# score is non-zero.
jc_scores_list = []
for indx in range(q_u_dot.shape[0]):
    q_user_nz = np.nonzero(q_u_dot[indx])
    jc_scores_q = q_u_dot[indx][q_user_nz[0]]
    jc_scores_u = candidate_user_ids[q_user_nz[0]]
    tmp_d_items = dict(zip(jc_scores_u, jc_scores_q))

    jc_scores_list.append(tmp_d_items)

    # One dot per 1000 processed questions as a progress indicator.
    if (indx+1)%1000==0:
        sys.stdout.write('.')
        sys.stdout.flush()

.....................

In [59]:
# Attach the per-question score dicts. The list is assigned by position, so it
# relies on jc_scores_list being in the row order of q_acc_user_w_tags (it is
# built by walking the rows of q_u_dot in order).
q_acc_user_w_tags['jc_scores'] = jc_scores_list

### Prep final df

In [65]:
# Rename by label rather than by position: the mapping stays correct if an
# upstream merge changes the column order, and re-running the cell is a no-op.
# `index` and `Tags` keep their names.
q_acc_user_w_tags = q_acc_user_w_tags.rename(columns={
    'ParentId': 'QuestionId',
    'OwnerUserId': 'AcceptedAnswerUserId',
    'CreationDate': 'AnswerCreationDate',
    0: 'AllAnswerScores',  # unnamed column holding the {user: answer score} dicts
    'Id': 'QId',
    'jc_scores': 'JaccardScores',
})

In [66]:
q_acc_user_w_tags.head()

Unnamed: 0,index,QuestionId,AcceptedAnswerUserId,AnswerCreationDate,AllAnswerScores,Tags,QId,JaccardScores
0,0,5.0,45.0,2010-07-20 19:21:52.240,"{226.0: 16, 45.0: 58, 1102.0: 20, 48.0: 17, 24...","[elementary-number-theory, proof-writing, radi...",5,"{221716.0: 0.0769230769231, 238683.0: 0.166666..."
1,1,1.0,8.0,2010-07-20 19:22:20.193,"{33.0: -1, 35.0: 5, 39.0: 4, 8.0: 117, 117203....","[elementary-set-theory, intuition, faq]",1,"{122882.0: 0.25, 90117.0: 0.333333333333, 1392..."
2,6,8.0,38.0,2010-07-20 19:23:54.230,"{160.0: 2, 38.0: 16, 78024.0: 1, 173.0: 20, 33...","[linear-algebra, combinatorics, generating-fun...",8,"{131076.0: 0.2, 163845.0: 0.0625, 8.0: 0.01351..."
3,7,2.0,34.0,2010-07-20 19:24:52.200,"{34.0: 26, 99.0: 2, 198.0: 4, 72.0: 5, 174221....","[calculus, limits, definition]",2,"{65536.0: 0.111111111111, 65537.0: 0.1, 196610..."
4,16,20.0,38.0,2010-07-20 19:32:21.590,"{38.0: 11, 72.0: 21, 400.0: 0, 232.0: 15, 210....","[terminology, definition, number-systems]",20,"{8.0: 0.0277777777778, 9.0: 0.0175438596491, 1..."


In [109]:
def get_final_scored_users(row, accepted_bonus=1000000, answer_score_weight=10,
                           n_jaccard_candidates=10, n_final_users=5):
    """Rank candidate users for one question and return the best few.

    Every candidate gets a combined score made of:
      * `accepted_bonus` for the author of the accepted answer,
      * `answer_score_weight` times the answer score for every user who
        answered the question,
      * the Jaccard score for the `n_jaccard_candidates` users with the
        highest Jaccard score.

    row: object exposing `AcceptedAnswerUserId` (a user id), `AllAnswerScores`
        ({user id: answer score}) and `JaccardScores` ({user id: score}) as
        attributes, e.g. a row of q_acc_user_w_tags.
    accepted_bonus: score given to the accepted answerer before anything else
        is added.
    answer_score_weight: multiplier applied to answer scores.
    n_jaccard_candidates: how many of the top Jaccard users are considered.
    n_final_users: how many users are returned.

    Returns (users, jaccard): `users` is the list of the `n_final_users`
    best user ids, highest combined score first; `jaccard` maps each of those
    users to their Jaccard score, or 0 when they have none.
    """
    jc_users = row.JaccardScores

    final_scores = defaultdict(int)
    # Assigned first and with `=`, so the answer score of the accepted
    # answerer is added on top of the bonus in the loop below.
    final_scores[row.AcceptedAnswerUserId] = accepted_bonus

    for usr, score in row.AllAnswerScores.items():
        final_scores[usr] += score * answer_score_weight

    jc_users_sorted = sorted(jc_users.items(), key=itemgetter(1),
                             reverse=True)[:n_jaccard_candidates]
    for usr, jcs in jc_users_sorted:
        final_scores[usr] += jcs

    final_scores_sorted = sorted(final_scores.items(), key=itemgetter(1),
                                 reverse=True)[:n_final_users]
    final_users_sorted = [usr for usr, _score in final_scores_sorted]

    # Report the raw Jaccard score of every selected user, including the ones
    # that were selected through their answer score only.
    final_jc = {usr: jc_users.get(usr, 0) for usr in final_users_sorted}

    return final_users_sorted, final_jc

In [108]:
# get_final_scored_users(q_acc_user_w_tags.ix[0])

In [117]:
# %%timeit
# Score every question row first, then split the (users, jaccard) pairs into
# two lists that line up with the rows of q_acc_user_w_tags.
scored_rows = []

for indx, row in q_acc_user_w_tags.iterrows():
    scored_rows.append(get_final_scored_users(row))

    # One dot per 1000 processed questions as a progress indicator.
    if (indx + 1) % 1000 == 0:
        sys.stdout.write('.')
        sys.stdout.flush()

final_users_sorted_list = [scored_pair[0] for scored_pair in scored_rows]
final_jc_list = [scored_pair[1] for scored_pair in scored_rows]
    
#         break

.....................

In [120]:
# Attach the ranking results; both lists are assigned by position and were
# built in the row order of q_acc_user_w_tags.
q_acc_user_w_tags['UsersSorted'] = final_users_sorted_list
q_acc_user_w_tags['UsersJaccard'] = final_jc_list

In [123]:
# Keep only the columns that go into the train/test files.
question_final_df = q_acc_user_w_tags[['QuestionId', 'AcceptedAnswerUserId', 'AnswerCreationDate', 'UsersSorted', 'UsersJaccard']]

### Split data by time

In [127]:
train_df = question_final_df[question_final_df.AnswerCreationDate < dt_parse(SPLIT_TIME)]

test_df = question_final_df[question_final_df.AnswerCreationDate >= dt_parse(SPLIT_TIME)]

users_in_test_phase = pdf[(pdf.PostTypeId==2) & (pdf.CreationDate <= dt_parse(SPLIT_TIME))]['OwnerUserId']

test_df = test_df[test_df.AcceptedAnswerUserId.isin(users_in_test_phase.unique())]

print train_df.shape, test_df.shape

(15796, 5) (4146, 5)


In [128]:
train_df.shape[0] + test_df.shape[0]

19942

In [130]:
train_df.head()

Unnamed: 0,QuestionId,AcceptedAnswerUserId,AnswerCreationDate,UsersSorted,UsersJaccard
0,5.0,45.0,2010-07-20 19:21:52.240,"[45.0, 1102.0, 48.0, 226.0, 242.0]","{48.0: 0.571428571429, 242.0: 0.0135135135135,..."
1,1.0,8.0,2010-07-20 19:22:20.193,"[8.0, 49.0, 25.0, 35.0, 39.0]","{8.0: 0.0422535211268, 49.0: 0.6, 35.0: 0.1153..."
2,8.0,38.0,2010-07-20 19:23:54.230,"[38.0, 173.0, 334.0, 53.0, 55.0]","{55.0: 0.444444444444, 334.0: 0.078431372549, ..."
3,2.0,34.0,2010-07-20 19:24:52.200,"[34.0, 56.0, 35.0, 190287.0, 72.0]","{56.0: 0.0714285714286, 72.0: 0.0206896551724,..."
4,20.0,38.0,2010-07-20 19:32:21.590,"[38.0, 72.0, 232.0, 51.0, 210.0]","{72.0: 0.0206896551724, 232.0: 0.0061224489795..."


In [131]:
test_df.head()

Unnamed: 0,QuestionId,AcceptedAnswerUserId,AnswerCreationDate,UsersSorted,UsersJaccard
15796,1307269.0,30953.0,2015-06-01 01:21:56.547,"[30953.0, 223391.0, 11667.0, 152299.0, 60129.0]","{152299.0: 0.0125, 30953.0: 0.0037037037037, 6..."
15797,1307364.0,223498.0,2015-06-01 03:30:05.077,"[223498.0, 264.0, 229072.0, 141600.0, 198240.0]","{264.0: 0.0103448275862, 229072.0: 0.038961038..."
15798,1307398.0,218419.0,2015-06-01 04:22:01.710,"[218419.0, 58320.0, 6312.0, 243183.0, 101504.0]","{58320.0: 0.0131578947368, 6312.0: 0.004048582..."
15799,1307603.0,146393.0,2015-06-01 09:37:28.713,"[146393.0, 195344.0, 112884.0, 21042.0, 65573.0]","{195344.0: 0.0215053763441, 146393.0: 0.010638..."
15800,1307959.0,164025.0,2015-06-01 15:11:06.170,"[164025.0, 12500.0, 44669.0, 191887.0, 141252.0]","{164025.0: 0.02, 12500.0: 0.0384615384615, 446..."


### Save

In [134]:
# Write the training rows without the row index.
# NOTE(review): UsersSorted (lists) and UsersJaccard (dicts) are written as
# their string form, so a reader has to parse them back — confirm the consumer does.
train_df.to_csv(os.path.join(STORE_LOCATION, 'train.csv'), index=False)

In [135]:
# Write the held-out rows without the row index.
# NOTE(review): UsersSorted (lists) and UsersJaccard (dicts) are written as
# their string form, so a reader has to parse them back — confirm the consumer does.
test_df.to_csv(os.path.join(STORE_LOCATION, 'test.csv'), index=False)