In [1]:
import pandas as pd
import numpy as np
import sklearn

import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

import xgboost as xgb

N_FOLDS = 3

In [2]:
train_data_def = pd.read_csv('train/train.csv', sep=';')

train_start = '2020-06'
test_start = '2020-07'

train_data = train_data_def[train_data_def['order_completed_at']==train_start]
print(train_data['phone_id'].nunique())
print(train_data['target'].value_counts())

test_data = train_data_def[train_data_def['order_completed_at']==test_start]
print(test_data['phone_id'].nunique())
print(test_data['target'].value_counts())

train_data.loc[:, 'fold_num'] = [np.random.randint(1,N_FOLDS+1) for i in range(len(train_data))]
train_data['fold_num'].value_counts()
train_data['month']=6

test_data.loc[:, 'fold_num'] = [np.random.randint(1,N_FOLDS+1) for i in range(len(test_data))]
test_data['fold_num'].value_counts()
test_data['month']=7


train_folds = []
test_folds = []

for i in range(1, N_FOLDS+1):
    local_data_train = train_data.loc[train_data['fold_num']==i]
    local_data_test = test_data.loc[test_data['fold_num']==i]

    del local_data_train['fold_num']
    del local_data_test['fold_num']
    
    del local_data_train['order_completed_at']
    del local_data_test['order_completed_at']
    
    train_folds.append(local_data_train.reset_index(drop=True))
    test_folds.append(local_data_test.reset_index(drop=True))


229989
0.0    115137
1.0    114852
Name: target, dtype: int64
257197
0.0    136768
1.0    120429
Name: target, dtype: int64


In [3]:
def get_X_y(folds, i, fea):
    local_fea = train_folds[i].merge(fea, on =['phone_id', 'month'])
    del local_fea['phone_id']
    del local_fea['month']
    
    return local_fea.drop('target', axis=1).values, local_fea['target'].values

In [4]:
fea = pd.read_pickle('fea_all_2_ship_plus_order_plus_phone.pkl')

In [None]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

def hyperopt_xgb_score(params):
    f = open('text.txt', 'w')
    print('fit start', end = ' ')
    clf = xgb.XGBClassifier(**params)
    
    scores = []
    for i in range(N_FOLDS):

        X_tr, y_tr = get_X_y(train_folds, i, fea)
        X_te, y_te = get_X_y(test_folds, i, fea)    

        clf.fit(X_tr, y_tr)
        preds = clf.predict(X_te)
        scores.append(f1_score(y_te, preds))

        


    current_score = np.mean(scores)
    f.write(str(current_score))
    f.write(str(clf.get_params()) + '\n')
    print(f'current_score - {current_score}')
    return -current_score
 
 
simple_space_xgb = {
            'n_estimators': hp.quniform('n_estimators', 30, 70, 100),
            'eta': hp.quniform('eta', 0.025, 0.05, 0.01),
            'max_depth':  hp.choice('max_depth', np.arange(7, 9, dtype=int)),
}

space_xgb2 = {
            'n_estimators': hp.choice('n_estimators', range(70, 300, 20)),
            'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
            'max_depth':  hp.choice('max_depth', np.arange(10, 16, 2,dtype=int)),
#             'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
#             'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
#             'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.9, 1, 0.8),
            'eval_metric': 'auc',
            'objective': 'binary:logistic',
            # Increase this number if you have more cores. Otherwise, remove it and it will default
            # to the maxium number.
            'nthread': 6,
            'booster': 'gbtree',
            'tree_method': 'exact',
        }


 
best = fmin(fn=hyperopt_xgb_score, space=space_xgb2, algo=tpe.suggest, max_evals=10)
print('best:')
print(best.values)

fit start current_score - 0.9465582876711816
fit start 

In [181]:
clf = xgb.XGBClassifier()
clf.get_params()

{'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'objective': 'binary:logistic',
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [33]:
# clf = xgb.XGBClassifier(**p)
clf.feature_importances_

array([0.01822638, 0.78845894, 0.01384963, 0.01517677, 0.01507491,
       0.01379641, 0.01329169, 0.02083668, 0.01533451, 0.01530199,
       0.01558643, 0.016531  , 0.03853457], dtype=float32)

In [37]:
clf = xgb.XGBClassifier(**p)

Unnamed: 0.1,Unnamed: 0,order_id_nunique,total_cost_sum,total_cost_min,total_cost_max,total_cost_mean,total_cost_std,rate_mean,promo_total_sum,promo_total_mean,total_weight_sum,total_weight_mean,month,phone_id,month_after_first
0,0,1,98.0,98.0,98.0,98.0,0.000000,0.0,0.000000,0.000000,22024,22024.0,1,2,0.0
1,1,2,366.0,158.0,208.0,183.0,35.355339,0.0,0.000000,0.000000,67946,33973.0,2,2,1.0
2,2,1,233.0,233.0,233.0,233.0,0.000000,0.0,0.000000,0.000000,34195,34195.0,3,2,2.0
3,3,1,248.0,248.0,248.0,248.0,0.000000,0.0,0.000000,0.000000,40777,40777.0,5,2,4.0
4,4,2,0.0,0.0,0.0,0.0,0.000000,5.0,0.000000,0.000000,31265,15632.5,3,3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155784,1155784,1,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,5320,5320.0,8,719367,0.0
1155785,1155785,1,0.0,0.0,0.0,0.0,0.000000,5.0,0.000000,0.000000,12270,12270.0,8,719369,0.0
1155786,1155786,1,0.0,0.0,0.0,0.0,0.000000,5.0,-163.770004,-163.770004,14039,14039.0,8,719370,0.0
1155787,1155787,1,0.0,0.0,0.0,0.0,0.000000,0.0,-193.270004,-193.270004,18470,18470.0,8,719373,0.0


In [63]:
test_data 
fea = pd.read_csv('fea/f_shipments.csv')

data = test_data.merge(fea, on=['phone_id', 'month'])
data.head()

Unnamed: 0.1,phone_id,order_completed_at,target,fold_num,month,Unnamed: 0,order_id_nunique,total_cost_sum,total_cost_min,total_cost_max,total_cost_mean,total_cost_std,rate_mean,promo_total_sum,promo_total_mean,total_weight_sum,total_weight_mean,month_after_first
0,101944,2020-07,0.0,1,7,331935,7,1147.0,49.0,248.0,163.857143,67.008528,0.0,0.0,0.0,248820,35545.714286,6.0
1,342522,2020-07,0.0,3,7,820858,1,199.0,199.0,199.0,199.0,0.0,0.0,-300.0,-300.0,15150,15150.0,6.0
2,457799,2020-07,1.0,3,7,943757,1,98.0,98.0,98.0,98.0,0.0,5.0,0.0,0.0,25000,25000.0,0.0
3,40738,2020-07,0.0,1,7,142539,1,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,99480,33160.0,3.0
4,174054,2020-07,1.0,1,7,518599,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10781,10781.0,3.0


In [64]:
del data['Unnamed: 0']
del data['fold_num']
del data['month']
del data['phone_id']
del data['order_completed_at']


In [65]:
data.columns

Index(['target', 'order_id_nunique', 'total_cost_sum', 'total_cost_min',
       'total_cost_max', 'total_cost_mean', 'total_cost_std', 'rate_mean',
       'promo_total_sum', 'promo_total_mean', 'total_weight_sum',
       'total_weight_mean', 'month_after_first'],
      dtype='object')

In [66]:
X = data.drop('target', axis = 1).values 
y = data['target'].values

clf = xgb.XGBClassifier(**p)
clf.fit(X, y)
clf.feature_importances_

array([0.8651687 , 0.00828489, 0.01012916, 0.01008234, 0.00951171,
       0.00892515, 0.01361466, 0.00913344, 0.00977358, 0.0101423 ,
       0.01066739, 0.03456669], dtype=float32)

In [69]:
test = pd.read_csv('sample_submission.csv', sep = ';')
test.head()

Unnamed: 0,Id,Predicted
0,19843,
1,471287,
2,342522,
3,457799,
4,233778,


In [78]:
test = test.rename(columns={'Id':'phone_id'})
test['month'] = 8


data = test.merge(fea, on=['phone_id', 'month'], how = 'left')
data.head()


Unnamed: 0.1,phone_id,Predicted,month,Unnamed: 0,order_id_nunique,total_cost_sum,total_cost_min,total_cost_max,total_cost_mean,total_cost_std,rate_mean,promo_total_sum,promo_total_mean,total_weight_sum,total_weight_mean,month_after_first
0,19843,,8,70452.0,2.0,147.0,49.0,98.0,73.5,34.648232,2.5,-550.0,-275.0,19204.0,9602.0,7.0
1,471287,,8,957255.0,1.0,199.0,199.0,199.0,199.0,0.0,5.0,0.0,0.0,23100.0,23100.0,0.0
2,342522,,8,,,,,,,,,,,,,
3,457799,,8,943758.0,2.0,238.0,80.0,158.0,119.0,55.154329,5.0,0.0,0.0,61800.0,30900.0,1.0
4,233778,,8,639744.0,2.0,199.0,0.0,199.0,99.5,140.714249,0.0,0.0,0.0,21437.0,10718.5,3.0


In [82]:
# del data['Unnamed: 0']
# del data['month']
# del data['phone_id']
# del data['order_completed_at']

data = data.dropna(subset=['order_id_nunique'])

In [86]:
X = data.drop(['phone_id', 'Predicted'], axis=1).values
preds = clf.predict(X)
data['Predicted'] = preds

In [90]:
data['Predicted'].value_counts()

1.0    106058
0.0     53154
Name: Predicted, dtype: int64

In [92]:
data.head()

Unnamed: 0,phone_id,Predicted,order_id_nunique,total_cost_sum,total_cost_min,total_cost_max,total_cost_mean,total_cost_std,rate_mean,promo_total_sum,promo_total_mean,total_weight_sum,total_weight_mean,month_after_first
0,19843,1.0,2.0,147.0,49.0,98.0,73.5,34.648232,2.5,-550.0,-275.0,19204.0,9602.0,7.0
1,471287,1.0,1.0,199.0,199.0,199.0,199.0,0.0,5.0,0.0,0.0,23100.0,23100.0,0.0
3,457799,1.0,2.0,238.0,80.0,158.0,119.0,55.154329,5.0,0.0,0.0,61800.0,30900.0,1.0
4,233778,1.0,2.0,199.0,0.0,199.0,99.5,140.714249,0.0,0.0,0.0,21437.0,10718.5,3.0
6,695176,1.0,1.0,98.0,98.0,98.0,98.0,0.0,4.0,-250.0,-250.0,16181.0,16181.0,0.0


In [None]:
test

In [105]:
test = pd.read_csv('sample_submission.csv', sep = ';')
test = test[['Id']]
test = test.merge(data[['phone_id', 'Predicted']], left_on = 'Id', right_on='phone_id', how='left')
del test['phone_id']
test.head()

Unnamed: 0,Id,Predicted
0,19843,1.0
1,471287,1.0
2,342522,
3,457799,1.0
4,233778,1.0


In [106]:
test.loc[test['Predicted'].isna(), 'Predicted'] = 0 
test.head()

Unnamed: 0,Id,Predicted
0,19843,1.0
1,471287,1.0
2,342522,0.0
3,457799,1.0
4,233778,1.0


In [124]:
len(test), len( pd.read_csv('sample_submission.csv', sep = ';'))

(214609, 214609)

In [101]:
# test['Predicted'] = test['Predicted'].astype(int)

In [113]:
test['Predicted'] = test['Predicted'].astype(int).astype(str)

In [123]:
test['Id'] = test['Id'].astype(str)
test['Predicted'][2]

'0'

In [118]:
test.to_csv('HackIt_test_sabmission.csv', sep=',', index=False)

In [109]:
test.to_excel('HackIt_test_sabmission.xlsx', index=False)