# Libraries

In [34]:
import os
import warnings
warnings.filterwarnings("ignore")
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from scipy import stats

# Load Data

In [2]:
# Read both input CSVs (paths are relative to the notebook's working directory).
# Note: `usr_prof` is rebuilt from the training split in the "User Profiling"
# section, so the frame loaded here is replaced once that section runs.
usr_prof, Data = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/usrProfile.csv', '../Data/Train_Set.csv')
)

In [5]:
# Columns kept as model inputs (X).
feature_cols = [
    'orderid', 'userid', 'bikeid', 'biketype',
    'starttime', 'start_lat', 'start_lon',
    'distance', 'hour', 'cont_time', 'distance_bins',
]
# Columns kept as prediction targets (y).
target_cols = ['end_lat', 'end_lon']

X = Data[feature_cols]
y = Data[target_cols]

In [56]:
# Split rows 50/50 into train and test; the fixed random_state makes the
# partition repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.5, random_state=2020)
X_train, X_test, y_train, y_test = split_parts

# User Profiling

In [57]:
# Per-user aggregates, computed on the training split only.

# Count of non-null `distance` values for each user.
distance_counts = X_train.pivot_table(index='userid',
                                      values='distance',
                                      aggfunc='count')
dist_hour_count = distance_counts.reset_index()

# Summary statistics of `distance` and `hour` for each user. Passing a list of
# aggregations yields two-level column labels (aggregation, value column).
summary_funcs = ['max', 'min', 'median', 'mean', 'std']
distance_hour_summary = X_train.pivot_table(index='userid',
                                            values=['distance', 'hour'],
                                            aggfunc=summary_funcs)
dist_hour_stats = distance_hour_summary.reset_index()


def mode(x):
    """Return the most frequent value in ``x``.

    Thin wrapper around ``scipy.stats.mode`` intended for use as a
    ``pivot_table`` aggregation function.

    Parameters
    ----------
    x : array-like
        One-dimensional collection of numeric values.

    Returns
    -------
    scalar
        The modal value; on ties ``scipy.stats.mode`` reports the smallest
        of the most common values.
    """
    # Older SciPy returns length-1 arrays here, while SciPy >= 1.11 returns
    # 0-d values (keepdims defaults to False), which made the former
    # ``[0][0]`` indexing fail. ``atleast_1d`` normalises both shapes.
    return np.atleast_1d(stats.mode(x)[0])[0]


def mode_cnt(x):
    """Return how many times the most frequent value occurs in ``x``.

    Thin wrapper around ``scipy.stats.mode`` intended for use as a
    ``pivot_table`` aggregation function.

    Parameters
    ----------
    x : array-like
        One-dimensional collection of numeric values.

    Returns
    -------
    scalar
        The occurrence count of the modal value.
    """
    # Older SciPy returns length-1 arrays here, while SciPy >= 1.11 returns
    # 0-d values (keepdims defaults to False), which made the former
    # ``[1][0]`` indexing fail. ``atleast_1d`` normalises both shapes.
    return np.atleast_1d(stats.mode(x)[1])[0]


# hour: each user's most frequent `hour` value and how often it occurs
hour_mode = (
    X_train
    .pivot_table(index='userid', values='hour', aggfunc=mode)
    .reset_index()
)
hour_mode_cnt = (
    X_train
    .pivot_table(index='userid', values='hour', aggfunc=mode_cnt)
    .reset_index()
)

# distance bins: each user's most frequent `distance_bins` value and its count
dist_bins_mode = (
    X_train
    .pivot_table(index='userid', values='distance_bins', aggfunc=mode)
    .reset_index()
)
dist_bins_mode_cnt = (
    X_train
    .pivot_table(index='userid', values='distance_bins', aggfunc=mode_cnt)
    .reset_index()
)

# coordinates: per-user mean and standard deviation of the start position
start_coord_cols = ['start_lat', 'start_lon']
coord_center = (
    X_train
    .pivot_table(index='userid', values=start_coord_cols,
                 aggfunc=['mean', 'std'])
    .reset_index()
)

In [58]:
from functools import reduce


def _flatten_columns(df):
    """Return ``df`` with MultiIndex column labels collapsed to plain strings.

    Each column tuple is joined with ``'_'`` after dropping empty parts, so
    ``('userid', '')`` becomes ``'userid'`` and ``('max', 'distance')``
    becomes ``'max_distance'``. Frames whose columns are already
    single-level are returned unchanged. Column order is preserved.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        return df
    flat = df.copy(deep=False)
    flat.columns = ['_'.join(str(part) for part in col if part != '')
                    for col in df.columns]
    return flat


# The pivots built with a list of aggregations carry two-level column labels,
# while the others are single-level. Merging frames with different numbers of
# column levels only warned on older pandas and raises MergeError on pandas 2.x,
# so every frame is flattened to single-level labels first. Column order and
# count are unchanged, which keeps any later positional renaming valid.
dfs = [_flatten_columns(df) for df in
       [dist_hour_count, dist_hour_stats, hour_mode, hour_mode_cnt,
        dist_bins_mode, dist_bins_mode_cnt, coord_center]]

# Inner-join all per-user tables on `userid`, left to right.
usr_prof = reduce(lambda left, right: pd.merge(left, right, on='userid'), dfs)

In [59]:
# Positional rename of the merged profile: the labels below must stay in the
# same order as the merged columns (count, distance/hour statistics,
# hour mode + count, distance-bin mode + count, coordinate mean + std).
profile_columns = [
    'userid', 'count',
    'maxDist', 'maxHour',
    'minDist', 'minHour',
    'medDist', 'medHour',
    'meanDist', 'meanHour',
    'stdDist', 'stdHour',
    'modeHour', 'modeHourCount',
    'modeDistBins', 'modeDistBinsCount',
    'coordCenterLat', 'coordCenterLon',
    'coordStdLat', 'coordStdLon',
]
usr_prof.columns = profile_columns

# Share of each user's rows that fall on their modal hour / modal distance bin.
rows_per_user = usr_prof['count']
usr_prof['modeHourRatio'] = usr_prof['modeHourCount'].div(rows_per_user)
usr_prof['modeDistBinsRatio'] = usr_prof['modeDistBinsCount'].div(rows_per_user)

In [60]:
# Preview the first five rows of the assembled per-user profile.
usr_prof.head(n=5)

Unnamed: 0,userid,count,maxDist,maxHour,minDist,minHour,medDist,medHour,meanDist,meanHour,...,modeHour,modeHourCount,modeDistBins,modeDistBinsCount,coordCenterLat,coordCenterLon,coordStdLat,coordStdLon,modeHourRatio,modeDistBinsRatio
0,1,5,949.330649,17,279.600909,8,719.112698,8.0,664.268117,10.8,...,8,3,2,4,39.91951,116.348099,0.002569,0.021487,0.6,0.8
1,2,1,327.080882,18,327.080882,18,327.080882,18.0,327.080882,18.0,...,18,1,1,1,39.938736,116.461258,,,1.0,1.0
2,3,12,2468.04227,22,660.423121,8,731.26919,9.0,971.144149,12.666667,...,8,5,2,8,39.900856,116.320839,0.043567,0.041888,0.416667,0.666667
3,5,7,1566.851357,18,327.044707,10,558.804131,17.0,687.10564,15.285714,...,18,3,1,3,39.994452,116.48225,0.006768,0.003695,0.428571,0.428571
4,9,5,838.353915,19,743.106111,8,743.106111,12.0,762.155672,12.6,...,8,2,2,5,39.974716,116.422531,0.002257,0.004165,0.4,1.0


In [61]:
# Distinct user ids present in the training-derived profile and in the test
# split, each held as the keys view of a Counter built from the `userid` column.
trainid, testid = (
    Counter(frame['userid']).keys()
    for frame in (usr_prof, X_test)
)

In [62]:
# Report how many test-set users also appear in the training-derived profile.
print('Train set ids: {:d}, Test set ids: {:d}'.format(len(trainid),
                                                       len(testid)))

# Build each set once and reuse it for both comparisons.
train_id_set = set(trainid)
test_id_set = set(testid)

# Test ids that also have a training profile.
intersect_cnt = len(test_id_set & train_id_set)
intersect_ratio = intersect_cnt / len(testid)
# Test ids with no training profile (test minus train).
diff_cnt = len(test_id_set - train_id_set)
diff_ratio = diff_cnt / len(testid)

print('ids both in Train and Test set: {:d}, ratio: {:.2f}%'.format(
    intersect_cnt, 100 * intersect_ratio))
# The difference is test minus train, so the label names Test-only ids
# (the previous wording had the direction reversed).
print('ids in Test not in Train set: {:d}, ratio: {:.2f}%'.format(
    diff_cnt, 100 * diff_ratio))

Train set ids: 317699, Test set ids: 317574
ids both in Train and Test set: 285583, ratio: 89.93%
ids in Train not in Test set: 31991, ratio: 10.07%
