In [1]:
import os
import unittest
import pickle
import numpy as np
import pandas as pd
import lightgbm

from preprocessor import Preprocessor
from dropout_predictor import DropoutPredictor

In [3]:
# Resolve the base directory once; the input/ and output/ folders sit directly beneath it.
_BASE_DIR = os.path.abspath(os.path.dirname('./'))
INPUT_DIR = os.path.join(_BASE_DIR, 'input')
OUTPUT_DIR = os.path.join(_BASE_DIR, 'output')

# Column names passed to Preprocessor as `categorical_columns`:
# six named attributes followed by option columns numbered 11 through 20.
categorical_columns = (
    ['性別', '都道府県', '誕生月', '支払い方法', '商品名', 'キャンペーンコード']
    + ['オプション{}'.format(number) for number in range(11, 21)]
)

# Show both resolved directories, one per line.
print(INPUT_DIR, OUTPUT_DIR, sep='\n')

/test_model/test_model/input
/test_model/test_model/output


In [7]:
def test_0_train_preprocess():
    """Run the preprocessing step on the training CSV.

    Constructs a ``Preprocessor`` with ``purpose='train'`` and no previously
    pickled instance to load, then calls ``preprocess()``. Every path handed
    to the preprocessor is rooted at ``INPUT_DIR``.
    """
    settings = dict(
        input_path='{}/train.csv'.format(INPUT_DIR),
        output_path='{}/train/preprocessed.csv'.format(INPUT_DIR),
        pickle_path='{}/train_preprocess.pkl'.format(INPUT_DIR),
        purpose='train',
        load_pickle_path=None,  # training pass: no earlier pickle is supplied
        categorical_columns=categorical_columns,
    )
    Preprocessor(**settings).preprocess()

In [8]:
# Step 0: run preprocessing on the training data.
test_0_train_preprocess()

2020-04-24T03:03:39+0000 _base.py preprocess [INFO] xxxxxxxx start preprocess
2020-04-24T03:04:10+0000 _base.py preprocess [INFO] xxxxxxxx complete load data
2020-04-24T03:04:20+0000 _base.py preprocess [INFO] xxxxxxxx complete transform
2020-04-24T03:04:20+0000 _base.py preprocess [INFO] xxxxxxxx complete output
2020-04-24T03:04:20+0000 _base.py preprocess [INFO] xxxxxxxx complete save instance
2020-04-24T03:04:21+0000 _base.py preprocess [INFO] xxxxxxxx complete preprocess


In [11]:
def test_1_train_model():
    """Train a ``DropoutPredictor`` from scratch.

    The predictor is given the ``train`` folder under ``INPUT_DIR`` as its
    input directory, ``parameter.json`` under ``INPUT_DIR`` as its parameter
    file, and the ``model`` folder under ``OUTPUT_DIR`` as the save location.
    No pretrained model directory is supplied.
    """
    predictor = DropoutPredictor(
        input_dir='{}/train'.format(INPUT_DIR),
        param_path='{}/parameter.json'.format(INPUT_DIR),
        save_model_dir='{}/model'.format(OUTPUT_DIR),
        pretrain_model_dir=None,
    )
    predictor.train()

In [38]:
# Step 1: train the model.
test_1_train_model()

2020-04-24T07:25:08+0000 _base.py train [INFO] test00001 start training
2020-04-24T07:25:08+0000 _base.py train [INFO] test00001 start training
2020-04-24T07:25:08+0000 _base.py train [INFO] test00001 start training
2020-04-24T07:25:08+0000 _base.py train [INFO] test00001 {"process_id": "test00001", "reg_alpha": 1.0, "reg_lambda": 5.0, "n_estimators": 300, "boosting_type": "gbdt", "class_weight": null, "colsample_bytree": 1.0, "learning_rate": 0.1, "max_depth": -1, "min_child_samples": 20, "min_child_weight": 0.001, "min_split_gain": 0.0, "num_leaves": 31, "random_state": null, "subsample": 1.0, "objective": "binary", "metric": "auc", "is_finetune": false, "num_boost_round": 100, "early_stopping_rounds": 10}
2020-04-24T07:25:08+0000 _base.py train [INFO] test00001 {"process_id": "test00001", "reg_alpha": 1.0, "reg_lambda": 5.0, "n_estimators": 300, "boosting_type": "gbdt", "class_weight": null, "colsample_bytree": 1.0, "learning_rate": 0.1, "max_depth": -1, "min_child_samples": 20, "mi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
def test_2_test_preprocess():
    """Run the preprocessing step on the test CSV.

    Constructs a ``Preprocessor`` with ``purpose='predict'`` and passes the
    pickle path used by the training preprocessing step
    (``train_preprocess.pkl``) as ``load_pickle_path``, then calls
    ``preprocess()``. Every path is rooted at ``INPUT_DIR``.
    """
    settings = dict(
        input_path='{}/test.csv'.format(INPUT_DIR),
        output_path='{}/test/preprocessed.csv'.format(INPUT_DIR),
        pickle_path='{}/test_preprocess.pkl'.format(INPUT_DIR),
        purpose='predict',
        # Same path that test_0_train_preprocess passes as its pickle_path.
        load_pickle_path='{}/train_preprocess.pkl'.format(INPUT_DIR),
        categorical_columns=categorical_columns,
    )
    Preprocessor(**settings).preprocess()

In [16]:
# Step 2: run preprocessing on the test data.
test_2_test_preprocess()

2020-04-24T03:28:05+0000 _base.py preprocess [INFO] xxxxxxxx start preprocess
2020-04-24T03:28:19+0000 _base.py preprocess [INFO] xxxxxxxx complete load data
2020-04-24T03:28:25+0000 _base.py preprocess [INFO] xxxxxxxx complete transform
2020-04-24T03:28:25+0000 _base.py preprocess [INFO] xxxxxxxx complete output
2020-04-24T03:28:25+0000 _base.py preprocess [INFO] xxxxxxxx complete save instance
2020-04-24T03:28:25+0000 _base.py preprocess [INFO] xxxxxxxx complete preprocess


In [23]:
# Read the preprocessed test CSV without treating any row as a header, then preview it.
test_df = pd.read_csv(
    '{}/test/preprocessed.csv'.format(INPUT_DIR),
    header=None,
)
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,31,40,3700.35,26431,1.975,8,5256.325,26323,1,22,...,3,2,1,2,1,1,1,2,1,1
1,85,12,6125.083333,10630,2.666667,7,17176.0,56373,1,20,...,1,1,1,0,0,1,1,0,0,0
2,65,32,3664.71875,21078,2.1875,6,7649.25,63138,1,33,...,0,1,3,1,0,3,3,1,0,2
3,53,29,3560.103448,33230,2.034483,6,7667.206897,99371,2,12,...,2,4,1,1,3,0,0,2,0,0
4,78,33,2691.848485,22125,2.69697,8,8057.060606,66146,2,22,...,2,1,2,0,0,1,0,1,1,0


In [25]:
# Read the preprocessed training CSV without treating any row as a header, then preview it.
train_df = pd.read_csv(
    '{}/train/preprocessed.csv'.format(INPUT_DIR),
    header=None,
)
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,31,1,29,3744.689655,14015,2.0,7,8575.758621,91419,1,...,2,0,0,0,1,4,0,0,3,1
1,85,0,12,5817.416667,16814,2.0,6,12165.25,47196,1,...,0,1,0,0,0,0,2,0,1,0
2,65,1,37,2622.27027,13713,2.0,6,5005.027027,53413,1,...,1,1,0,4,2,1,2,2,1,0
3,53,1,35,3188.142857,32254,2.542857,9,11800.657143,257786,2,...,2,2,1,3,0,1,1,0,0,0
4,78,1,46,2026.413043,8292,2.26087,7,4581.826087,24440,2,...,1,7,2,0,4,0,1,1,2,3


In [27]:
# Every column except the first one; test_3_test_predict uses column 0 as the id
# and feeds the remaining columns to the model.
test_df[test_df.columns[1:]]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,27,28,29,30,31,32,33,34,35,36
0,40,3700.350000,26431,1.975000,8,5256.325000,26323,1,22,1,...,3,2,1,2,1,1,1,2,1,1
1,12,6125.083333,10630,2.666667,7,17176.000000,56373,1,20,3,...,1,1,1,0,0,1,1,0,0,0
2,32,3664.718750,21078,2.187500,6,7649.250000,63138,1,33,3,...,0,1,3,1,0,3,3,1,0,2
3,29,3560.103448,33230,2.034483,6,7667.206897,99371,2,12,5,...,2,4,1,1,3,0,0,2,0,0
4,33,2691.848485,22125,2.696970,8,8057.060606,66146,2,22,10,...,2,1,2,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,32,1917.562500,9649,1.906250,7,3349.343750,31302,1,23,3,...,1,1,2,2,0,1,2,2,1,3
96,7,4222.571429,7193,3.142857,6,9856.571429,14749,1,22,4,...,0,0,0,0,0,0,1,0,2,1
97,18,5184.611111,8236,2.222222,6,11916.500000,49019,1,9,3,...,1,0,0,1,0,1,0,0,1,1
98,32,3220.406250,16707,2.281250,6,5417.781250,38112,2,16,10,...,0,3,0,1,1,2,3,1,1,1


In [34]:
def test_3_test_predict():
    """Score the preprocessed test set with the saved LightGBM model.

    Loads a ``lightgbm.Booster`` from ``model/model.txt`` under ``OUTPUT_DIR``,
    reads the header-less CSV ``test/preprocessed.csv`` under ``INPUT_DIR``,
    uses column 0 as the row id and the remaining columns as model input, and
    writes ``test/prediction.csv`` under ``OUTPUT_DIR`` with the columns
    ``id``, ``prediction`` and ``probability``.

    The destination directory is created when it does not exist yet, so the
    function also works on a fresh checkout.
    """
    model_path = OUTPUT_DIR + '/model/model.txt'
    input_path = INPUT_DIR + '/test/preprocessed.csv'
    output_path = OUTPUT_DIR + '/test/prediction.csv'

    model = lightgbm.Booster(model_file=model_path)

    df = pd.read_csv(input_path, header=None)
    ids = df.iloc[:, 0].tolist()  # first column carries the row identifier
    data = df.iloc[:, 1:]         # everything else is passed to the model

    predict_proba = model.predict(data)
    # np.round rounds halves to the nearest even value, so a score of exactly
    # 0.5 becomes class 0; any other score goes to the nearer of 0 and 1.
    predictions = np.round(predict_proba).astype(int)

    # DataFrame.to_csv does not create missing parent directories; make sure
    # the target folder exists so the write cannot fail after scoring is done.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    pd.DataFrame(
        {'id': ids, 'prediction': predictions, 'probability': predict_proba},
        columns=['id', 'prediction', 'probability']
    ).to_csv(output_path, header=True, index=False)

In [36]:
# Step 3: score the preprocessed test data and write the prediction CSV.
test_3_test_predict()

In [37]:
# Read back the CSV written by test_3_test_predict and preview the first rows.
prediction_df = pd.read_csv('{}/test/prediction.csv'.format(OUTPUT_DIR))
prediction_df.head()

Unnamed: 0,id,prediction,probability
0,31,1,0.50965
1,85,0,0.177146
2,65,1,0.50965
3,53,1,0.50965
4,78,1,0.50965
