In [2]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import random
import time
from sklearn.metrics import log_loss

In [3]:
random.seed(2016)

In [4]:
def map_column(table, f):
    labels = sorted(table[f].unique())
    mappings = dict()
    for i in range(len(labels)):
        mappings[labels[i]] = i
    table = table.replace({f: mappings})
    return table

In [7]:
def read_train_test():
    # App
    print('Read apps...')
    app = pd.read_csv("./data/app_events.csv", dtype={'device_id': np.str})
    app['appcounts'] = app.groupby(['event_id'])['app_id'].transform('count')
    app_small = app[['event_id', 'appcounts']].drop_duplicates('event_id', keep='first')

    # Events
    print('Read events...')
    events = pd.read_csv("./data/events.csv", dtype={'device_id': np.str})
    events['counts'] = events.groupby(['device_id'])['event_id'].transform('count')
    events_small = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first')
    e1=pd.merge(events, app_small, how='left', on='event_id', left_index=True)
    e1.loc[e1.isnull()['appcounts'] ==True, 'appcounts']=0
    e1['appcounts1'] = e1.groupby(['device_id'])['appcounts'].transform('sum')
    e1_small = e1[['device_id', 'appcounts1']].drop_duplicates('device_id', keep='first')
    
    
    # Phone brand
    print('Read brands...')
    pbd = pd.read_csv("./data/phone_brand_device_model.csv", dtype={'device_id': np.str})
    pbd.drop_duplicates('device_id', keep='first', inplace=True)
    pbd = map_column(pbd, 'phone_brand')
    pbd = map_column(pbd, 'device_model')

    # Train
    print('Read train...')
    train = pd.read_csv("./data/gender_age_train.csv", dtype={'device_id': np.str})
    train = map_column(train, 'group')
    train = train.drop(['age'], axis=1)
    train = train.drop(['gender'], axis=1)
    train = pd.merge(train, pbd, how='left', on='device_id', left_index=True)
    train = pd.merge(train, events_small, how='left', on='device_id', left_index=True)
    train = pd.merge(train, e1_small, how='left', on='device_id', left_index=True)
    train.fillna(-1, inplace=True)

    # Test
    print('Read test...')
    test = pd.read_csv("./data/gender_age_test.csv", dtype={'device_id': np.str})
    test = pd.merge(test, pbd, how='left', on='device_id', left_index=True)
    test = pd.merge(test, events_small, how='left', on='device_id', left_index=True)
    test = pd.merge(test, e1_small, how='left', on='device_id', left_index=True)
    test.fillna(-1, inplace=True)

    # Features
    features = list(test.columns.values)
    features.remove('device_id')

    return train, test, features

In [8]:
train, test, features = read_train_test()

Read apps...
Read events...
Read brands...
Read train...
Read test...


In [9]:
train

Unnamed: 0,device_id,group,phone_brand,device_model,counts,appcounts1
32462581,-8076087639492063270,10,51,749,-1.0,-1.0
32462581,-2897161552818060146,10,51,749,-1.0,-1.0
24761504,-8260683887967679142,10,51,749,1.0,53.0
32462581,-4938849341048082022,9,51,1524,-1.0,-1.0
32462581,245133531816851882,9,51,753,-1.0,-1.0
32462581,-1297074871525174196,0,7,908,-1.0,-1.0
32462581,236877999787307864,10,117,396,-1.0,-1.0
32462581,-8098239495777311881,10,51,1524,-1.0,-1.0
32462581,176515041953473526,10,13,1246,-1.0,-1.0
32462581,1596610250680140042,4,15,560,-1.0,-1.0
