In [1]:
# Load IPython's autoreload extension and reload all imported modules before
# each cell executes, so edits to the local `src` package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [38]:
import os
import pandas as pd

from dateutil.parser import parse as dt_parse
from src.DataReader import DataReader
from src.constants import SPLIT_TIME, STORE_LOCATION, DATA_ROOT, POSTS_FILE, VOTES_FILE

In [3]:
# Directory that the posts and votes file names are joined onto below;
# aliased from the project-level DATA_ROOT constant.
data_directory = DATA_ROOT

In [5]:
# Resolve the posts file under the data directory and load it with the
# project's DataReader.
# NOTE(review): the meaning of the second positional argument (True) is
# defined by DataReader and is not visible here — confirm before changing it.
posts_path = os.path.join(data_directory, POSTS_FILE)
post_reader = DataReader(posts_path, True)
post_reader.read_data()

Reading file...
Converting...
Progress | ██████████ | 100% || Estimated time remaining: 0.0 seconds
Conversion complete...
Forming data frame...


In [6]:
# Posts data frame held by the reader. `_df` is a private attribute, so this
# line is coupled to DataReader's internals (presumably populated by
# read_data() — verify against src/DataReader).
pdf = post_reader._df

In [7]:
# Resolve the votes file under the data directory and load it with the
# project's DataReader.
# NOTE(review): the meaning of the second positional argument (True) is
# defined by DataReader and is not visible here — confirm before changing it.
votes_path = os.path.join(data_directory, VOTES_FILE)
vote_reader = DataReader(votes_path, True)
vote_reader.read_data()

Reading file...
Converting...
Progress | █████████- | 90% || Estimated time remaining: 118.555555556 seconds
Conversion complete...
Forming data frame...


In [8]:
# Votes data frame held by the reader (`_df` is a private DataReader attribute).
vdf = vote_reader._df
# Keep only rows with VoteTypeId == 1, which this notebook treats as
# "answer accepted" votes (per the variable name — confirm against the
# vote-type schema of the data set).
accepted_votes = vdf.loc[vdf['VoteTypeId'] == 1]

### Filter questions to ones that have > 3 answers *AND* an accepted answer

In [9]:
q_many_ans = pdf.Id[pdf.AnswerCount > 3]

q_acc_ans = pdf.Id[pdf.AcceptedAnswerId.notnull()]

q_ids = (set(q_many_ans) & set(q_acc_ans.values))

print len(q_ids)

22067


In [10]:
# Posts whose parent is one of the selected questions, i.e. their answers...
is_child_of_selected = pdf['ParentId'].isin(q_ids)
# ...restricted to rows with a known author, since later steps key on OwnerUserId.
has_owner = pdf['OwnerUserId'].notnull()
ans_df = pdf[is_child_of_selected & has_owner]

In [11]:
# Left-join every answer with its accept vote (if any). Answers without one
# keep NaN in the vote columns; vote columns that clash with post columns
# receive the '_vt' suffix, so un-suffixed names (e.g. CreationDate) still
# refer to the post.
ans_vote_df = ans_df.merge(
    accepted_votes,
    how='left',
    left_on='Id',
    right_on='PostId',
    suffixes=('', '_vt'),
)

In [12]:
# One row per accepted answer: the question it belongs to, the answering user,
# and the answer's creation time. Rows without a matching accept vote have
# NaN in VoteTypeId and are dropped by the comparison.
q_acc_user = ans_vote_df.loc[
    ans_vote_df['VoteTypeId'] == 1,
    ['ParentId', 'OwnerUserId', 'CreationDate'],
]

In [13]:
# Date range covered by the accepted answers. Series.min()/max() are
# vectorized and skip missing timestamps (NaT); the builtin min()/max()
# iterate in Python and give order-dependent results when NaT is present.
q_acc_user['CreationDate'].min(), q_acc_user['CreationDate'].max()

(Timestamp('2010-07-20 19:21:52.240000'),
 Timestamp('2016-09-03 22:38:17.880000'))

### Split data by time

In [15]:
train_df = q_acc_user[q_acc_user.CreationDate < dt_parse(SPLIT_TIME)]

test_df = q_acc_user[q_acc_user.CreationDate >= dt_parse(SPLIT_TIME)]

users_in_test_phase = pdf[(pdf.PostTypeId==2) & (pdf.CreationDate <= dt_parse(SPLIT_TIME))]['OwnerUserId']

test_df = test_df[test_df.OwnerUserId.isin(users_in_test_phase.unique())]

print train_df.shape, test_df.shape

(15796, 3) (4146, 3)


### Add each question's answerers with their answer scores (basis for ranking)

In [44]:
def get_scored_users(grp):
    """Map each answering user in a group to the score of their answer.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of one group; must contain 'OwnerUserId' and 'Score' columns.

    Returns
    -------
    dict
        ``{OwnerUserId: Score}``. If a user appears more than once in the
        group, the score from the last such row is kept.
    """
    # Index the rows by user, then let the Score column serialise itself.
    score_by_user = grp.set_index('OwnerUserId')['Score']
    return score_by_user.to_dict()

In [16]:
# Group the answer rows by the question they belong to (ParentId), so each
# group holds all retained answers to one question.
ans_grps = ans_vote_df.groupby(['ParentId'])

In [45]:
# Build the user -> score mapping for every question; the result is indexed
# by ParentId.
scored_users = ans_grps.apply(get_scored_users)

In [46]:
# Wrap the per-question result in a DataFrame so it can be merged on its index.
# NOTE(review): presumably `scored_users` is an unnamed Series of dicts, which
# would make the single column here be labelled 0 — confirm, and consider
# giving it an explicit name before it is written to CSV.
scored_users_df = pd.DataFrame(scored_users)

In [48]:
# Attach each question's user -> score mapping to the training rows by
# matching ParentId against the index of scored_users_df.
# Caution: this rebinds train_df, so re-running the cell without re-running
# the split merges the column in a second time.
train_df = train_df.merge(
    scored_users_df,
    how='left',
    left_on='ParentId',
    right_index=True,
)

In [49]:
# Attach each question's user -> score mapping to the test rows by matching
# ParentId against the index of scored_users_df.
# Caution: this rebinds test_df, so re-running the cell without re-running
# the split merges the column in a second time.
test_df = test_df.merge(
    scored_users_df,
    how='left',
    left_on='ParentId',
    right_index=True,
)

In [50]:
# Persist the training split under STORE_LOCATION (the row index is written
# as the first CSV column, the to_csv default).
train_csv_path = os.path.join(STORE_LOCATION, 'train.csv')
train_df.to_csv(train_csv_path)

In [51]:
# Persist the test split under STORE_LOCATION (the row index is written as
# the first CSV column, the to_csv default).
test_csv_path = os.path.join(STORE_LOCATION, 'test.csv')
test_df.to_csv(test_csv_path)