In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools

In [41]:
# Load the raw training and test tables from CSV files under data/.
train = pd.read_csv('data/train_music.csv')
test = pd.read_csv('data/test_music.csv')

In [42]:
# Column -> kind code. prepare_types casts 'c' columns to int and 'n' columns
# to float; 'c' columns other than 'id' are later used as group-by keys.
device_features = dict(
    id='n',
    device_type='c',
    manufacturer_category='c',
    os_category='c',
    sim_count='c',
)

In [43]:
# Column -> kind code ('c' = cast to int, 'n' = cast to float) for the
# general subscriber columns.
general_features = dict(
    tp_flag='c',
    lt='n',
    block_flag='c',
    days_exp='n',
    service_1_flag='c',
    service_1_count='n',
    service_2_flag='c',
    service_3_flag='c',
    is_obl_center='c',
    is_my_vf='c',
)

In [44]:
# Column -> kind code ('c' = cast to int, 'n' = cast to float) for the
# balance / payment / inactivity columns.
user_activity_features = dict(
    balance_sum='n',
    paym_last_days='n',
    inact_days_count='c',
)


In [45]:
# Base names of the columns that exist once per suffix (_m1, _m2, _m3),
# mapped to their kind code ('c' = cast to int, 'n' = cast to float).
dynamics_features_generic = dict(
    service_P_flag='c',
    block_all_dur='n',
    block_count='n',
    all_cost='n',
    all_home_clc='n',
    all_roam_cost='n',
    sms_cost='n',
    sms_roam_cost='n',
    content_cost='n',
    abon_cost='n',
    abon_part='n',
    act_days_count='n',
    com_num_cost='n',
    conn_com_cost='n',
    paym_el_count='n',
    paym_el_sum='n',
    paym_sum='n',
    pay_in_P2P_cost='n',
    pay_out_P2P_cost='n',
    paym_count='n',
)

# Expand every base name into its three suffixed column names, keeping the kind.
dynamics_features_m1, dynamics_features_m2, dynamics_features_m3 = (
    {template.format(base): kind for base, kind in dynamics_features_generic.items()}
    for template in ('{}_m1', '{}_m2', '{}_m3')
)


In [46]:
# Merge the per-group kind maps into one; groups are visited in this order,
# and a later group would override the kind of a repeated column name.
all_features = {
    name: kind
    for group in (
        device_features,
        general_features,
        user_activity_features,
        dynamics_features_m1,
        dynamics_features_m2,
        dynamics_features_m3,
    )
    for name, kind in group.items()
}

# Columns present in the training table that none of the maps above describe.
other_features = set(train.columns) - set(all_features)

In [47]:
# Every column not described explicitly is treated as numeric ('n').
# `other_features` is a set of strings, whose iteration order changes between
# interpreter runs; sorting keeps the feature (and therefore column) order
# reproducible across kernel restarts.
all_features = {
    **all_features,
    **{f: 'n' for f in sorted(other_features)}
}
# 'target' is handled separately from the features (it is re-attached when the
# train columns are selected and is used to split train from test rows).
del all_features['target']

In [48]:
def drop_outliers(data, qt=0.99):
    """Keep only the values strictly below the `qt` quantile of `data`.

    Values equal to the quantile itself are dropped as well.
    """
    threshold = data.quantile(qt)
    below_threshold = data < threshold
    return data[below_threshold]

In [50]:
def prepare_types(data, features):
    """Return a copy of `data` with columns cast according to `features`.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    features : dict
        Maps column name -> kind code. 'c' columns are cast to int (except a
        column named 'id', which is left untouched when flagged 'c'), 'n'
        columns are cast to float. Any other code leaves the column as is.

    Returns
    -------
    pandas.DataFrame
        New frame with the requested dtypes.

    Raises
    ------
    KeyError
        If `features` names a column that `data` does not have.
    ValueError
        If a 'c' column holds values that cannot be represented as int
        (for example NaN).
    """
    dtype_by_column = {}
    for name, kind in features.items():
        # Compare by value: `is` on string literals depends on interning.
        if kind == 'c':
            if name != 'id':
                dtype_by_column[name] = int
        elif kind == 'n':
            dtype_by_column[name] = float
    # astype with a mapping replaces the columns outright; assigning through
    # `df.loc[:, col] = ...` may write in place and keep the previous dtype.
    return data.astype(dtype_by_column)

In [51]:
def fill_na(data):
    """Return a copy of `data` with missing values filled in selected columns.

    Each listed column gets its own fill value (0 or 100); all other columns
    are returned unchanged. The input frame is not modified.
    """
    fill_values = {
        'sim_count': 100,
        'tp_flag': 0,
        'service_1_flag': 100,
        'service_2_flag': 100,
        'service_3_flag': 100,
        'is_obl_center': 100,
        'inact_days_count': 0,
        'service_P_flag_m1': 100,
        'service_P_flag_m2': 100,
        'service_P_flag_m3': 100,
    }
    filled = data.copy()
    for column, value in fill_values.items():
        filled.loc[:, column] = filled.loc[:, column].fillna(value)

    return filled

In [52]:
# Fill the known-missing flag/count columns of the training table.
train = train.pipe(fill_na)

In [53]:
# Same missing-value fill for the test table.
test = test.pipe(fill_na)

In [54]:
# Cast training columns to int/float according to the kind map.
train = train.pipe(prepare_types, all_features)

In [55]:
# Cast test columns to int/float according to the kind map.
test = test.pipe(prepare_types, all_features)

In [56]:
# Reorder training columns: 'target' first, then the features in map order.
train = train[['target'] + [name for name in all_features if name != 'target']]

In [57]:
# Keep only the feature columns of the test table, in map order.
test = test[list(all_features)]

In [58]:
# Stack train and test so that group statistics and category encodings are
# computed over both. `test` has no 'target' column, so its rows get NaN there;
# that NaN is what separates the two parts again later.
# The row labels of both frames are kept as they are (no ignore_index).
# Author's open note: replacing np.inf with 0 here (`.replace([np.inf], 0)`)
# was considered and flagged as "check this later, potentially bad".
train_test = pd.concat([train, test], sort=False)

In [19]:
def receipt_diff_process(data, receipt):
    """Build "group statistic minus own value" features for one receipt.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the group-by columns and the columns to aggregate.
    receipt : tuple
        `(target_group, rec)`: `target_group` is a list of group-by column
        names, `rec` maps a column name to a list of aggregation names
        (e.g. `['mean', 'median']`).

    Returns
    -------
    pandas.DataFrame
        One column per (column, aggregation) pair, named
        `<column>_diff_<column>_<aggregation>_group_<group columns joined by _>`,
        holding the group aggregate minus the row's own value.
    """
    target_group, rec = receipt
    subset = data[[*target_group, *rec]]
    group_stats = subset.groupby(target_group).agg(rec)
    # Flatten the (column, aggregation) column pairs to "<column>_<aggregation>".
    # Iterating the column index directly avoids the legacy `.ravel()` call.
    group_stats.columns = ["_".join(pair) for pair in group_stats.columns]
    merged = subset.merge(group_stats, left_on=target_group, right_index=True, how='left')

    group_label = "_".join(target_group)
    diffs = {}
    for source_col, aggregations in rec.items():
        for aggregation in aggregations:
            stat_col = "{}_{}".format(source_col, aggregation)
            out_name = "{}_diff_{}_group_{}".format(source_col, stat_col, group_label)
            diffs[out_name] = merged[stat_col] - merged[source_col]
    return pd.DataFrame(diffs)

In [21]:
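# Hand-written example of the receipt format, kept for reference only:
# each receipt is (group-by columns, {column: [aggregations]}).
# receipts = [
#     (['manufacturer_category'],{
#         'lt': ['mean', 'median'],
#         'balance_sum': ['mean', 'median'],
#         'service_1_count': ['mean', 'median']
#     }),
#     (['manufacturer_category', 'device_type'],{
#         'lt': ['mean', 'median']
#     })
# ]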

In [59]:
# Split the kind map (excluding 'id') into group-by candidates ('c') and
# columns to aggregate ('n', each with mean and median).
# Kind codes are compared with ==; `is` on string literals relies on interning
# and raises a SyntaxWarning on current Python versions.
cats = {x: y for x, y in all_features.items() if x != 'id' and y == 'c'}
nums = {x: ['mean', 'median'] for x, y in all_features.items() if x != 'id' and y == 'n'}

In [60]:
# One receipt per 'c' column: group by that single column and aggregate
# every 'n' column.
receipts = [([group_col], nums) for group_col in cats]

In [29]:
# Compute the group-difference features for every receipt; each result is a
# frame collected here and joined column-wise in a later cell.
features = []
for receipt in receipts:
    group_cols = receipt[0]
    print("Processing {}".format(group_cols))
    features.append(receipt_diff_process(train_test, receipt))

Processing ['device_type']
Processing ['manufacturer_category']
Processing ['os_category']
Processing ['sim_count']
Processing ['tp_flag']
Processing ['block_flag']
Processing ['service_1_flag']
Processing ['service_2_flag']
Processing ['service_3_flag']
Processing ['is_obl_center']
Processing ['is_my_vf']
Processing ['inact_days_count']
Processing ['service_P_flag_m1']
Processing ['service_P_flag_m2']
Processing ['service_P_flag_m3']


In [30]:
# Join the per-receipt frames side by side into one wide frame.
# NOTE: `features` is rebound from a list to a DataFrame here, so re-running
# this cell without re-running the loop that builds the list passes a
# DataFrame, not a list, to pd.concat.
features = pd.concat(features, axis='columns')

In [36]:
# Append the group-difference features to the stacked table.
# NOTE: the output saved with this cell is a NameError for `features`, i.e. it
# was last run in a kernel where the feature-building cells had not been run.
# Re-running this cell appends the same columns again.
train_test = pd.concat([train_test, features], axis='columns')

NameError: name 'features' is not defined

In [61]:
def combine_cat_features(data, features, interactions=2):
    """Build string-concatenated combinations of the given columns.

    For every combination of `interactions` columns drawn from `features`
    (in the order produced by itertools.combinations), the column values are
    converted to str and joined with "_". The new column is named after the
    joined column names.

    Returns a DataFrame with one column per combination.
    """
    generated = {}
    for combo in itertools.combinations(features, interactions):
        as_text = [data[column].astype(str) for column in combo]
        joined = as_text[0]
        for part in as_text[1:]:
            joined = joined + "_" + part
        generated["_".join(combo)] = joined
    return pd.DataFrame(generated)

In [62]:
# Pairwise string combinations of all 'c' columns.
gen = combine_cat_features(train_test, list(cats))

In [63]:
# Replace each combined string column by integer codes; pd.factorize returns
# (codes, uniques) and only the codes are kept.
factored = gen.apply(lambda column: pd.factorize(column)[0])

In [35]:
# Append the integer-coded category combinations to the stacked table.
# Re-running this cell appends the same columns again.
train_test = pd.concat([train_test, factored], axis='columns')

In [36]:
# Append the pairwise numeric features.
# NOTE: `num50_f` is only defined in a cell further down this notebook, so
# this cell fails on a top-to-bottom run from a fresh kernel.
train_test = pd.concat([train_test, num50_f], axis='columns')

In [37]:
# Split the stacked table back apart: rows with a target value are training
# rows, rows whose target is missing are test rows (target column dropped).
train = train_test[train_test['target'].notna()]
test = train_test[train_test['target'].isna()].drop(columns=["target"])

In [38]:
# Persist the engineered training table.
train.to_pickle('data/train.pkl')

In [39]:
# Persist the engineered test table.
test.to_pickle('data/test.pkl')

In [25]:
# Hand-listed feature names, restricted to those actually present in `train`.
# NOTE(review): presumably the 50 top-ranked features of an earlier model fit;
# the ranking itself is not part of this notebook.
current_cols = list(train.columns)
top50_fit = [
    name
    for name in [
        'lt',
        'balance_sum',
        'content_count_m1',
        'content_count_m3',
        'data_type_2_m1',
        'days_exp',
        'data_type_3_m1',
        'count_app_4',
        'data_type_2_m2',
        'vol_app_7',
        'content_count_m2',
        'data_type_1_m1',
        'os_category_is_my_vf',
        'all_cost_m1',
        'count_app_5',
        'service_1_count',
        'count_url_category_2',
        'all_count_m1',
        'vol_app_4',
        'short_out_calls_part_m3',
        'sms_in_count_m1',
        'data_type_2_m3',
        'count_sms_source_4',
        'data_type_3_m3',
        'vol_app_5',
        'data_type_3_m2',
        'paym_last_days',
        'count_app_1',
        'short_out_calls_part_m1',
        'vol_app_1',
        'short_in_calls_part_m1',
        'all_cost_m3',
        'count_act_type_1',
        'manufacturer_category_is_my_vf',
        'sms_in_count_m3',
        'voice_onnet_in_night_rest_dur_m1',
        'sms_in_count_m2',
        'vol_app_10',
        'paym_el_sum_m3',
        'paym_sum_m1',
        'conn_com_cost_m3',
        'paym_sum_m3',
        'voice_all_in_dur_m3',
        'all_count_m3',
        'short_in_calls_part_m3',
        'com_num_cost_m2',
        'manufacturer_category_service_P_flag_m3',
        'short_out_calls_part_m2',
        'voice_omo_in_night_work_dur_m2',
        'paym_sum_m2',
    ]
    if name in current_cols
]

In [26]:
# Keep only the non-integer columns of the selected features (integer columns
# are the int-cast / factorized categorical ones).
# The check uses is_integer_dtype instead of `dtype != int`: what `int` maps to
# is platform dependent, so the comparison could let int64 columns through on
# platforms where it means a 32-bit integer.
num50 = [
    column
    for column, dtype in train[top50_fit].dtypes.items()
    if not pd.api.types.is_integer_dtype(dtype)
]

In [29]:
def combine_div_num_features(data, features, op):
    """Apply a binary operation to every pair of the given columns.

    For each pair (first, second) from itertools.combinations(features, 2),
    the result of `op(data[first], data[second])` is stored under the name
    "<first>_div_by_<second>" (the name is fixed, whatever `op` computes).

    Returns a DataFrame with one column per pair.
    """
    generated = {}
    for first, second in itertools.combinations(features, 2):
        pair_name = "_div_by_".join((first, second))
        generated[pair_name] = op(data[first], data[second])
    return pd.DataFrame(generated)

In [32]:
# Pairwise features over the selected numeric columns.
# NOTE(review): the operation passed here is a subtraction, while the variable
# and the generated column names say "div" -- confirm which one is intended.
num50_divs = combine_div_num_features(train_test, num50, lambda left, right: left - right)

In [33]:
# Turn infinite values into missing values. Both signs are replaced; the
# earlier version only handled +inf and let -inf through.
num50_f = num50_divs.replace([np.inf, -np.inf], np.nan)