In [None]:
import importlib
import json
import datetime

import numpy as np
import pandas as pd
import tqdm

from nile.api.v1 import filters as nf
from nile.api.v1 import aggregators as na
from nile.api.v1 import extractors as ne
from qb2.api.v1 import filters as qf
from qb2.api.v1 import extractors as qe
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

from projects.common.nile import dates
from projects.common.nile import test_utils
from projects.driver_scoring import default_values
from projects.driver_scoring import project_config
from projects.driver_scoring import common as common_ds

In [None]:
# Raise pandas' display limits so long/wide frames are rendered without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
import modin.pandas as mpd

In [None]:
# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel.
importlib.reload(project_config)
importlib.reload(default_values)
importlib.reload(common_ds)

In [None]:
# Cluster handle used for every table read in this notebook.
# NOTE(review): parallel_operations_limit=5 presumably caps concurrent cluster operations — confirm.
cluster = project_config.get_project_cluster(parallel_operations_limit=5)

In [None]:
# Weekly snapshot dates. ISO-formatted strings sort chronologically, so the
# last element after sorting is the most recent snapshot, used as the test date.
DATES = [
    '2020-10-04',
    '2020-10-11',
    '2020-10-18',
    '2020-10-25',
    '2020-11-01',
    '2020-11-08',
    '2020-11-15',
]
DATES.sort()
LAST_DATE = DATES[len(DATES) - 1]
TEST_DATES = [LAST_DATE]

### Reduce memory consumption

In [None]:
# Read only the first 10000 rows of the earliest features table: the sample is
# needed solely to discover the table's column names.
_features_sample = cluster.read(
    f'//home/taxi_ml/dev/driver_scoring/driver_features_{DATES[0]}'
)[:10000].as_dataframe()

feature_cols_features = [
    test_utils.to_string(column) for column in set(_features_sample.columns)
]

In [None]:
# Explicit column -> dtype mapping for the features table, so that it is loaded
# with compact dtypes (uint16 counters, float32 metrics) instead of the defaults.
#
# String columns use the builtin `object`: the `np.object` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
str_cols = {'unique_driver_id': object, 'utc_last_order_dttm': object}
int_cols = {}
dtypes_features = {}

for standart in ['econom', 'higher', 'ultima', 'child', 'all', 'other']:
    for window in ['35', '70', '140']:

        int_cols.update(
            {
                'telemetry_order_cnt_{}'.format(window): np.uint16,
                'success_order_cnt_{}_{}'.format(standart, window): np.uint16,
                'order_total_cnt_{}'.format(window): np.uint16,
                'order_actual_cnt_{}'.format(window): np.uint16
            }
        )

        str_cols.update({'city_most_freq_{}'.format(window): object})

# Every discovered column that is not explicitly typed as string or integer is
# loaded as float32. Filtering here (rather than popping afterwards) avoids a
# KeyError when an expected string/integer column is absent from the sampled
# column list.
float_cols = {
    feat: np.float32
    for feat in feature_cols_features
    if feat not in str_cols and feat not in int_cols
}

dtypes_features.update(float_cols)
dtypes_features.update(str_cols)
dtypes_features.update(int_cols)

In [None]:
# Read only the first 10000 rows of the latest targets table: the sample is
# needed solely to discover the table's column names.
_targets_sample = cluster.read(
    f'//home/taxi_ml/dev/driver_scoring/driver_targets_{LAST_DATE}_v1'
)[:10000].as_dataframe()

feature_cols_target = [
    test_utils.to_string(column) for column in set(_targets_sample.columns)
]

In [None]:
# Explicit column -> dtype mapping for the targets table. It mirrors the
# features mapping but also covers the longer 280- and 560-day windows.
#
# String columns use the builtin `object`: the `np.object` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
#
# NOTE: this cell rebinds `str_cols` / `int_cols` / `float_cols`; later cells
# that read `str_cols` see the values built here.
str_cols = {'unique_driver_id': object, 'utc_last_order_dttm': object}
int_cols = {}
dtypes_target = {}

for standart in ['econom', 'higher', 'ultima', 'child', 'all', 'other']:
    for window in ['35', '70', '140', '280', '560']:

        int_cols.update(
            {
                'telemetry_order_cnt_{}'.format(window): np.uint16,
                'success_order_cnt_{}_{}'.format(standart, window): np.uint16,
                'order_total_cnt_{}'.format(window): np.uint16,
                'order_actual_cnt_{}'.format(window): np.uint16
            }
        )

        str_cols.update({'city_most_freq_{}'.format(window): object})

# Every discovered column that is not explicitly typed as string or integer is
# loaded as float32. Filtering here (rather than popping afterwards) avoids a
# KeyError when an expected string/integer column is absent from the sampled
# column list.
float_cols = {
    feat: np.float32
    for feat in feature_cols_target
    if feat not in str_cols and feat not in int_cols
}

dtypes_target.update(float_cols)
dtypes_target.update(str_cols)
dtypes_target.update(int_cols)

### Analyze driver metrics

In [None]:
# Load the latest features snapshot using the compact dtypes from `dtypes_features`.
features = cluster.read(
    f'//home/taxi_ml/dev/driver_scoring/driver_features_{LAST_DATE}'
).as_dataframe(dtypes=dtypes_features)

# Pass every string-typed column that is present through test_utils.to_string.
# `str_cols` is whatever the most recently executed dtype cell left behind;
# columns missing from this table are simply skipped.
for col in [name for name in str_cols if name in features.columns]:
    features[col] = features[col].apply(test_utils.to_string)

In [None]:
# Load the latest targets snapshot using the compact dtypes from `dtypes_target`.
targets = cluster.read(
    f'//home/taxi_ml/dev/driver_scoring/driver_targets_{LAST_DATE}_v1'
).as_dataframe(dtypes=dtypes_target)

# Pass every string-typed column that is present through test_utils.to_string;
# columns missing from this table are simply skipped.
for col in [name for name in str_cols if name in targets.columns]:
    targets[col] = targets[col].apply(test_utils.to_string)

In [None]:
# Version suffix of the prediction tables that are read below.
score_version = 'v1'

In [None]:
def _read_scores(standart):
    """Read the predictions table of one tariff class.

    Returns a frame with the columns `unique_driver_id` (renamed from `DocId`)
    and `score_<standart>` (renamed from `Probability`, loaded as float32).
    """
    return (
        cluster
        .read(f'//home/taxi_ml/dev/driver_scoring/predictions/{standart}_{LAST_DATE}_{score_version}')
        # Builtin `object` replaces `np.object`, which was removed in NumPy 1.24.
        .as_dataframe(dtypes={'DocId': object, 'Probability': np.float32})
        .rename({'DocId': 'unique_driver_id', 'Probability': f'score_{standart}'}, axis=1)
    )


scores_econom = _read_scores('econom')
scores_higher = _read_scores('higher')
scores_ultima = _read_scores('ultima')
scores_child = _read_scores('child')
scores_all = _read_scores('all')

# Keep only the ids that have a prediction in every tariff class (inner joins).
scores = scores_econom
for _class_scores in (scores_higher, scores_ultima, scores_child, scores_all):
    scores = pd.merge(scores, _class_scores, how='inner', on='unique_driver_id')

# The raw id is a bytes value; decode it and keep only the part before the first
# underscore. This is done after the merges, so the join uses the full id.
scores['unique_driver_id'] = scores['unique_driver_id'].apply(
    lambda x: x.decode('utf-8').split('_')[0]
)

In [None]:
# One row per features row: attach the targets, then the model scores, by driver id.
# Columns present in both features and targets get the '_feature' / '_target' suffix;
# left joins keep drivers that have no targets or no scores.
df = (
    features
    .merge(
        targets,
        how='left',
        on='unique_driver_id',
        suffixes=('_feature', '_target'),
    )
    .merge(scores, how='left', on='unique_driver_id')
)

In [None]:
# Total successful 'higher' and 'econom' orders (140-day feature window) per
# most-frequent city, with the cities having the most 'higher' orders first.
city_df = (
    df.loc[:, [
        'city_most_freq_140_feature',
        'success_order_cnt_higher_140_feature',
        'success_order_cnt_econom_140_feature',
    ]]
    .groupby(by='city_most_freq_140_feature')
    .sum()
    .sort_values(by='success_order_cnt_higher_140_feature', ascending=False)
)

In [None]:
# for row in city_df.itertuples():
#     print(row.Index.decode('utf-8'),
#           row.success_order_cnt_higher_140_feature,
#           row.success_order_cnt_econom_140_feature)

In [None]:
# Cities with at least 100 successful 'higher' orders in the 140-day feature window.
higher_cities = set(
    city_df.loc[city_df['success_order_cnt_higher_140_feature'].ge(100)].index.tolist()
)

In [None]:
# with open('higher_cities.json', 'w') as f:
#     json.dump(list(higher_cities), f)

In [None]:
# Add a `<score>_reverse` column (1 - score) for every raw model score.
# Only raw scores are reversed: skipping '*_reverse' columns and 'score_random'
# keeps this cell idempotent, so re-running it does not create
# 'score_*_reverse_reverse' or 'score_random_reverse' columns.
raw_score_cols = [
    col for col in df.columns
    if col.startswith('score_')
    and not col.endswith('_reverse')
    and col != 'score_random'
]
for col in raw_score_cols:
    df[col + '_reverse'] = 1 - df[col]

# Uniform random baseline score. The generator is seeded so that the baseline
# (and every metric derived from it) is reproducible between runs.
df['score_random'] = np.random.RandomState(42).uniform(size=len(df))

In [None]:
# for col in df.columns:
#     if col.startswith('less_5_'):
#         postfix = col.replace('less_5_order_frac', '')
#         df['rating_4_order_frac' + postfix] = df[col]
#         df[col] = (
#             df[col] + df['bad_order_frac' + postfix]
#         )

#### Common stats about the dataset

In [None]:
# print(df.shape[0], 'drivers total\n')

# print(
#     df[(df['success_order_cnt_all_140_feature']>10)].shape[0],
#     'drivers with at least 1 trip in the last 140 days\n'
# )

# print(
#     df[(df['success_order_cnt_all_70_feature']>10)].shape[0],
#     'drivers with at least 1 trip in the last 70 days\n'
# )

# print(
#     df[(df['success_order_cnt_all_35_feature']>10)].shape[0],
#     'drivers with at least 1 trip in the last 35 days\n'
# )

# print(
#     df[(df['success_order_cnt_all_140_target']>10)].shape[0],
#     'drivers with at least 1 trip from 2020-02-09 to 2020-03-09\n'
# )

# print(
#     df[(df['success_order_cnt_econom_140_target']>0)].shape[0],
#     'drivers with at least 1 econom trip from 2020-02-09 to 2020-03-09\n'
# )

# print(
#     df[(df['success_order_cnt_higher_140_target']>0)].shape[0],
#     'drivers with at least 1 higher trip from 2020-02-09 to 2020-03-09\n'
# )

# print(
#     df[(df['success_order_cnt_ultima_140_target']>0)].shape[0],
#     'drivers with at least 1 ultima trip from 2020-02-09 to 2020-03-09\n',
# )

In [None]:
# Peek at the first rows of the merged frame.
df.head()

In [None]:
# Columns for the 280- and 560-day windows come out of the merge without a
# suffix; collect them so they can be given the '_target' suffix.
# Columns that already end in '_target' are skipped, which keeps the rename
# idempotent: re-running these cells does not produce '*_target_target' names.
rename_dict = {
    col: col + '_target'
    for col in df.columns
    if ('280' in col or '560' in col) and not col.endswith('_target')
}

In [None]:
# Apply the column renames collected in `rename_dict`, modifying `df` in place.
df.rename(rename_dict, axis=1, inplace=True)

In [None]:
# Drivers eligible for the comparison:
#   * order counters present (comparisons with NaN are False, so rows without
#     feature or target data drop out),
#   * at least one successful 'higher' order in the 560-day target window,
#   * rating and all compared scores available,
#   * most frequent city is one of `higher_cities`,
#   * current rating of at least 4.8.
_eligible = (
    df['success_order_cnt_all_140_feature'].ge(0)
    & df['success_order_cnt_econom_140_feature'].ge(0)
    & df['success_order_cnt_all_140_target'].ge(0)
    & df['success_order_cnt_higher_560_target'].ge(1)
    & df['current_rating_feature'].notnull()
    & df['score_econom_reverse'].notnull()
    & df['score_higher_reverse'].notnull()
    & df['score_random'].notnull()
    & df['city_most_freq_140_feature'].isin(higher_cities)
    & df['current_rating_feature'].ge(4.8)
)
unconstrained_clean_df = df[_eligible]

In [None]:
# (rows, columns) remaining after the eligibility filter.
unconstrained_clean_df.shape

In [None]:
def calculate_metric(df, perc, score_name, metric_names, total_drivers, total_orders):
    """Summarise the drivers whose score is at or above a percentile threshold.

    The threshold is the `perc`-th percentile of `df[score_name]`; drivers with
    a score greater than or equal to it are kept.

    Args:
        df: frame with the score column, the metric columns and
            'success_order_cnt_all_560_target'.
        perc: percentile (0-100) of the score used as the cut-off.
        score_name: name of the score column.
        metric_names: per-driver metric columns to average, weighted by orders.
        total_drivers: denominator for the kept-driver fraction.
        total_orders: denominator for the kept-order fraction.

    Returns:
        dict with 'min_score', 'driver_frac', 'driver_cnt', 'order_cnt',
        'order_frac', 'score_name', followed by '<metric>_metric_value'
        (order-weighted mean, rounded to 4 digits) for every metric and then
        '<metric>_order_cnt' (integer part of the order-weighted metric sum)
        for every metric.
    """
    orders_col = 'success_order_cnt_all_560_target'

    threshold = np.percentile(df[score_name], perc)
    kept = df[df[score_name] >= threshold]
    kept_orders = kept[orders_col]
    kept_order_total = kept_orders.sum()
    kept_drivers = kept.shape[0]

    # Order-weighted sum of each metric over the kept drivers.
    weighted_sums = {
        name: (kept[name] * kept_orders).sum() for name in metric_names
    }

    result_dict = {
        'min_score': round(threshold, 4),
        'driver_frac': round(kept_drivers / total_drivers, 4),
        'driver_cnt': kept_drivers,
        'order_cnt': int(kept_order_total),
        'order_frac': round(kept_order_total / total_orders, 4),
        'score_name': score_name,
    }
    # Key order matters for the resulting frame: all metric values first,
    # then all metric order counts.
    for name in metric_names:
        result_dict[name + '_metric_value'] = round(
            weighted_sums[name] / kept_order_total, 4
        )
    for name in metric_names:
        result_dict[name + '_order_cnt'] = int(weighted_sums[name])

    return result_dict

In [None]:
# Sanity check: with perc=0 the threshold is the minimum score, so no eligible
# driver is filtered out and the metrics cover the whole eligible population.
calculate_metric(df=unconstrained_clean_df,
                 perc=0,
                 score_name='score_higher_reverse',
                 metric_names=['bad_order_frac_all_560_target',
                               'less_5_order_frac_all_560_target'],
                 total_drivers=unconstrained_clean_df.shape[0],
                 total_orders=unconstrained_clean_df['success_order_cnt_all_560_target'].sum())

In [None]:
# Sweep over (min_rating, min_orders, score percentile) and evaluate every
# scoring strategy on each slice. Fractions are always taken relative to the
# whole eligible population, not to the slice.
result_list = []
metric_names = ['bad_order_frac_all_560_target',
                'less_5_order_frac_all_560_target']
total_drivers = unconstrained_clean_df.shape[0]
total_orders = unconstrained_clean_df['success_order_cnt_all_560_target'].sum()

# Rows are appended in this order for every percentile:
# random baseline, rating heuristic, econom model, higher model.
score_names = [
    'score_random',
    'current_rating_feature',
    'score_econom_reverse',
    'score_higher_reverse',
]

for min_rating in tqdm.tqdm_notebook(range(4800, 4801, 25)):
    # Ratings are iterated as integers (thousandths) to avoid float steps.
    min_rating = min_rating / 1000
    for min_orders in tqdm.tqdm_notebook(range(0, 141, 10)):

        enough_orders = (
            unconstrained_clean_df['success_order_cnt_all_140_feature'] >= min_orders
        )
        good_rating = unconstrained_clean_df['current_rating_feature'] >= min_rating
        clean_df = unconstrained_clean_df[enough_orders & good_rating]

        update_dict = {'min_orders': min_orders, 'min_rating': min_rating}

        for perc in tqdm.tqdm_notebook(range(0, 101, 10)):
            rows = [
                calculate_metric(
                    df=clean_df,
                    perc=perc,
                    score_name=score_name,
                    metric_names=metric_names,
                    total_drivers=total_drivers,
                    total_orders=total_orders,
                )
                for score_name in score_names
            ]
            for row in rows:
                row.update(update_dict)
            result_list.extend(rows)

In [None]:
# One row per (min_rating, min_orders, percentile, score) combination.
result_df = pd.DataFrame(result_list)

In [None]:
# Shorten the metric column names produced by calculate_metric (in place).
result_df.rename(
    columns={
        'bad_order_frac_all_560_target_metric_value': 'bad_order_frac',
        'less_5_order_frac_all_560_target_metric_value': 'less_5_order_frac',
        'bad_order_frac_all_560_target_order_cnt': 'bad_order_cnt',
        'less_5_order_frac_all_560_target_order_cnt': 'less_5_order_cnt',
    },
    inplace=True,
)

In [None]:
# Keep only the reporting columns, in the order they should be displayed.
result_df = result_df[[
    'score_name', 'min_score', 'min_rating', 'min_orders',
    'bad_order_cnt', 'bad_order_frac', 'less_5_order_cnt', 'less_5_order_frac',
    'order_cnt', 'order_frac', 'driver_cnt', 'driver_frac'
]]

In [None]:
# Short display labels for the compared scores.
# NOTE: this rebinds `rename_dict`, which earlier in the notebook holds the
# column renames applied to `df`; do not re-run that earlier rename cell after this one.
rename_dict = {'score_econom_reverse': 'ml',
               'current_rating_feature': 'rating',
               'score_random': 'random'}

In [None]:
# Drop the rows of the 'higher' model score; it is not part of the comparison below.
result_df = result_df[result_df['score_name']!='score_higher_reverse']

In [None]:
# Replace the score column names with their short display labels.
# `assign` builds a new frame instead of writing into the filtered slice (no
# SettingWithCopyWarning), and names without a mapping are kept as they are, so
# re-running the cell on already-relabelled rows is a no-op rather than a KeyError.
result_df = result_df.assign(
    score_name=result_df['score_name'].map(lambda x: rename_dict.get(x, x))
)

In [None]:
# Rows that keep the entire supply (order_frac == 1) at the 4.8 rating floor.
result_df.loc[
    result_df['min_rating'].eq(4.8)
    & result_df['order_frac'].eq(1)
]

In [None]:
# Configurations with a defect rate of at most 0.0064, largest retained supply first.
# (An extra `order_frac >= 0.75` constraint was tried here and is currently disabled.)
result_df.loc[
    result_df['min_rating'].eq(4.8)
    & result_df['bad_order_frac'].le(0.0064)
].sort_values(by='order_frac', ascending=False)

In [None]:
# Configurations with a defect rate of at most 0.007, largest retained supply first.
# (An extra `order_frac >= 0.75` constraint was tried here and is currently disabled.)
result_df.loc[
    result_df['min_rating'].eq(4.8)
    & result_df['bad_order_frac'].le(0.007)
].sort_values(by='order_frac', ascending=False)

In [None]:
# result_df.to_excel('scoring_constraints_analytics.xls')

In [None]:
# Pick one (min_rating, min_orders) slice of the sweep and split it into one
# curve per strategy, each ordered by its score threshold.
min_rating = 4.8
min_orders = 140

_slice = result_df[
    (result_df['min_rating'] == min_rating)
    & (result_df['min_orders'] == min_orders)
]

rating, random, ml_econom = (
    _slice[_slice['score_name'] == label].sort_values('min_score')
    for label in ('rating', 'random', 'ml')
)

# ml_higher = result_df[
#     (result_df['min_rating']==min_rating)
#     & (result_df['score_name']=='score_higher_reverse')
#     & (result_df['min_orders']==min_orders)
# ].sort_values('min_score')

In [None]:
# ml_econom

In [None]:
# rating

In [None]:
# Retained supply vs. defect rate for the three strategies on the slice chosen
# via `min_rating` / `min_orders`. The last two points of every curve (the
# highest thresholds) are excluded from both the axis limits and the lines.
fig, ax = plt.subplots(figsize=(16, 12))
ax.grid()
ax.set_yticks(np.arange(0, 1, step=0.05))

# (curve without its last two points, line colour, legend label), in draw order.
curves = [
    (rating.iloc[:-2], 'b', 'Rating'),
    (ml_econom.iloc[:-2], 'g', 'ML'),
    (random.iloc[:-2], 'r', 'Random'),
]

x_min = min(curve['bad_order_frac'].min() for curve, _, _ in curves)
x_max = max(curve['bad_order_frac'].max() for curve, _, _ in curves)
length = x_max - x_min

# Pad the x-axis by a third of the data range on each side.
ax.set_xlim((x_min - length / 3, x_max + length / 3))

ax.set_title(f'Algorithm Comparison ({LAST_DATE})', fontsize=20)
ax.set_xlabel(
    'Defect Rate (#trips with rating < 4 / #trips)',
    fontsize=18,
)
ax.set_ylabel(
    f'Fraction of Total Supply ({total_orders / 1e6:.1f}M orders)',
    fontsize=18,
)

for curve, color, label in curves:
    ax.plot(
        curve['bad_order_frac'],
        curve['order_frac'],
        color=color,
        lw=2,
        label=label,
    )

ax.legend(fontsize=20, loc=2)
ax.grid(ls='-.')

plt.savefig(f'supply_vs_dr_{LAST_DATE}.png', dpi=150)

In [None]:
# Pearson correlation of every numeric column with the 140-day defect-rate targets.
# Numeric (and boolean) columns are selected explicitly: pandas < 2.0 silently
# ignored object-typed columns such as `unique_driver_id` in `corr`, whereas
# pandas >= 2.0 raises on them. The explicit selection gives the same result on both.
corr_df = (
    unconstrained_clean_df
    .select_dtypes(include=['number', 'bool'])
    .corr('pearson')
)[[
    'bad_order_frac_higher_140_target',
    'bad_order_frac_econom_140_target',
    'bad_order_frac_ultima_140_target',
    'bad_order_frac_child_140_target',
    'bad_order_frac_all_140_target'
]]

In [None]:
# Rank the non-target columns by the absolute value of their correlation with
# the overall 140-day defect rate, strongest first.
(
    corr_df.loc[
        ~corr_df.index.str.endswith('target'),
        'bad_order_frac_all_140_target',
    ]
    .abs()
    .sort_values(ascending=False)
    .round(4)
)

### Choose the minimum number of orders

In [None]:
# Distribution of successful econom orders per driver over the 140-day feature window.
df['success_order_cnt_econom_140_feature'].hist(bins=100)

In [None]:
# Percentiles of the 140-day successful-order count per tariff class, computed
# over the drivers that have at least one such order.
for standart in ['econom', 'higher', 'ultima', 'all']:
    order_col = f'success_order_cnt_{standart}_140_feature'
    positive_orders = df.loc[df[order_col] > 0, order_col]
    for perc in [5, 25, 50, 75, 95]:
        print(standart, perc, np.percentile(positive_orders, perc))

In [None]:
# Distribution of successful ultima orders (140-day feature window) among
# drivers with at least one such order.
df[df['success_order_cnt_ultima_140_feature']>0]['success_order_cnt_ultima_140_feature'].hist(bins=100)

In [None]:
def _order_weighted_defect_rate(frame):
    """Return the 140-day target defect rate of `frame`, weighted by target orders."""
    target_orders = frame['success_order_cnt_all_140_target']
    weighted_bad = (frame['bad_order_frac_all_140_target'] * target_orders).sum()
    return weighted_bad / target_orders.sum()


num_trips_list, less_dr_list, more_dr_list, diff_list = [], [], [], []
less_drivers_list, more_drivers_list = [], []
success_orders_less_list, success_orders_more_list = [], []

_feature_orders = unconstrained_clean_df['success_order_cnt_all_140_feature']
_rating_ok = unconstrained_clean_df['current_rating_feature'] >= 4.8

# For every candidate threshold, split the drivers into those below and those
# at/above it (by orders in the feature window) and compare the two groups.
for i in range(1, 147, 5):
    less_df = unconstrained_clean_df[(_feature_orders < i) & _rating_ok]
    more_df = unconstrained_clean_df[(_feature_orders >= i) & _rating_ok]

    less_dr = _order_weighted_defect_rate(less_df)
    more_dr = _order_weighted_defect_rate(more_df)

    num_trips_list.append(i)
    less_dr_list.append(round(less_dr, 5))
    more_dr_list.append(round(more_dr, 5))
    diff_list.append(round(round(less_dr, 5) - round(more_dr, 5), 3))
    less_drivers_list.append(less_df.shape[0])
    more_drivers_list.append(more_df.shape[0])
    # Supply is measured on the feature window, defect rate on the target window.
    success_orders_less_list.append(less_df['success_order_cnt_all_140_feature'].sum())
    success_orders_more_list.append(more_df['success_order_cnt_all_140_feature'].sum())
    
    

In [None]:
# One row per candidate minimum-orders threshold.
analytics_df = pd.DataFrame(dict(
    min_orders=num_trips_list,
    defect_rate_less=less_dr_list,
    defect_rate_more=more_dr_list,
    diff=diff_list,
    num_drivers_less=less_drivers_list,
    num_drivers_more=more_drivers_list,
    success_orders_less=success_orders_less_list,
    success_orders_more=success_orders_more_list,
))

In [None]:
# Persist the threshold comparison next to the notebook, one file per snapshot date.
analytics_df.to_csv(f'analytics_{LAST_DATE}.csv', index=False)