In [1]:
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names, so use whichever name this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [24]:
# Dataset folder. Defaults to the original local path, but can be redirected with
# the KAERI_DATA_DIR environment variable so the notebook runs on other machines.
root_dir = os.environ.get('KAERI_DATA_DIR', 'D:/datasets/KAERI_dataset')

# Feature tables for train/test and the target table for the training ids.
train_f = pd.read_csv(os.path.join(root_dir, 'train_features.csv'))
train_t = pd.read_csv(os.path.join(root_dir, 'train_target.csv'))
test_f = pd.read_csv(os.path.join(root_dir, 'test_features.csv'))

In [3]:
# (rows, columns) of the training feature table.
train_f.shape

(1050000, 6)

In [4]:
# Preview the first rows of the training features.
train_f.head()

Unnamed: 0,id,Time,S1,S2,S3,S4
0,0,0.0,0.0,0.0,0.0,0.0
1,0,4e-06,0.0,0.0,0.0,0.0
2,0,8e-06,0.0,0.0,0.0,0.0
3,0,1.2e-05,0.0,0.0,0.0,0.0
4,0,1.6e-05,0.0,0.0,0.0,0.0


In [5]:
# Preview the first rows of the training targets.
train_t.head()

Unnamed: 0,id,X,Y,M,V
0,0,0.0,-400.0,50.0,0.4
1,1,400.0,0.0,100.0,1.0
2,2,-300.0,-200.0,25.0,0.4
3,3,200.0,-100.0,150.0,0.4
4,4,-300.0,-100.0,150.0,0.4


In [6]:
# Preview the first rows of the test features.
test_f.head()

Unnamed: 0,id,Time,S1,S2,S3,S4
0,2800,0.0,0.0,0.0,0.0,0.0
1,2800,4e-06,0.0,0.0,0.0,0.0
2,2800,8e-06,0.0,0.0,0.0,0.0
3,2800,1.2e-05,0.0,0.0,0.0,0.0
4,2800,1.6e-05,0.0,0.0,0.0,0.0


### Exploring the feature and target tables

In [7]:
# Number of rows recorded for each id.
train_f['id'].value_counts()

2047    375
1333    375
2361    375
312     375
1335    375
       ... 
1645    375
621     375
2668    375
1644    375
0       375
Name: id, Length: 2800, dtype: int64

In [8]:
# Missing-value count per column.
train_f.isnull().sum()

id      0
Time    0
S1      0
S2      0
S3      0
S4      0
dtype: int64

In [9]:
# Number of non-null Time entries for each id.
train_f.groupby('id')['Time'].count()

id
0       375
1       375
2       375
3       375
4       375
       ... 
2795    375
2796    375
2797    375
2798    375
2799    375
Name: Time, Length: 2800, dtype: int64

In [10]:
# Largest Time value within each id.
train_f.groupby('id')['Time'].max()

id
0       0.001496
1       0.001496
2       0.001496
3       0.001496
4       0.001496
          ...   
2795    0.001496
2796    0.001496
2797    0.001496
2798    0.001496
2799    0.001496
Name: Time, Length: 2800, dtype: float64

In [11]:
# Summary statistics for the numeric columns of the training features.
train_f.describe()

Unnamed: 0,id,Time,S1,S2,S3,S4
count,1050000.0,1050000.0,1050000.0,1050000.0,1050000.0,1050000.0
mean,1399.5,0.000748,-405.0983,-405.0983,-1334.343,-1605.664
std,808.2907,0.0004330114,275317.4,275317.4,265535.1,302697.0
min,0.0,0.0,-5596468.0,-5596468.0,-2772952.0,-6069645.0
25%,699.75,0.000372,-74263.21,-74263.21,-78554.88,-78183.71
50%,1399.5,0.000748,0.0,0.0,0.0,0.0
75%,2099.25,0.001124,73911.42,73911.42,72958.36,76658.08
max,2799.0,0.001496,3865086.0,3865086.0,3655237.0,3687344.0


In [12]:
# Summary statistics for the numeric columns of the test features.
test_f.describe()

Unnamed: 0,id,Time,S1,S2,S3,S4
count,262500.0,262500.0,262500.0,262500.0,262500.0,262500.0
mean,3149.5,0.000748,-217.2298,-184.2608,-120.8247,-857.8727
std,202.072773,0.000433,230343.8,228562.8,228294.1,269135.2
min,2800.0,0.0,-3027980.0,-2783507.0,-2399706.0,-5163090.0
25%,2974.75,0.000372,-78738.56,-78963.56,-83235.76,-78882.64
50%,3149.5,0.000748,0.0,0.0,0.0,0.0
75%,3324.25,0.001124,76982.37,77196.2,81656.85,78455.08
max,3499.0,0.001496,3022443.0,2877832.0,2486714.0,4305746.0


In [13]:
# How many training rows take each distinct value of target M.
train_t['M'].value_counts()

175.0    400
150.0    400
75.0     400
125.0    400
25.0     400
100.0    400
50.0     400
Name: M, dtype: int64

In [14]:
# How many training rows take each distinct value of target X.
train_t['X'].value_counts()

-300.0    315
 300.0    315
-200.0    315
 100.0    315
-100.0    315
-400.0    315
 200.0    315
 400.0    315
 0.0      280
Name: X, dtype: int64

In [15]:
# How many training rows take each distinct value of target Y.
train_t['Y'].value_counts()

-300.0    315
 300.0    315
 100.0    315
 200.0    315
-100.0    315
-200.0    315
 0.0      315
-400.0    315
 400.0    280
Name: Y, dtype: int64

In [16]:
# How many training rows take each distinct value of target V.
train_t['V'].value_counts()

0.8    560
0.2    560
0.4    560
0.6    560
1.0    560
Name: V, dtype: int64

### Preprocessing

In [25]:
def preprocess(data, time_step=0.000004):
    """Pivot long-format readings into one wide row per ``id``.

    Each ``Time`` value is converted to an integer step number
    (``Time / time_step``), and the ``S1``..``S4`` columns are spread out so
    that every (signal, step) pair becomes its own column named
    ``"<signal>_<step>"`` (for example ``"S1_0"``, ``"S1_1"``, ...).

    The input frame is left untouched, so calling this function repeatedly on
    the same frame always returns the same result.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``id``, ``Time``, ``S1``, ``S2``, ``S3``, ``S4``.
    time_step : float, optional
        Spacing between consecutive ``Time`` values, in the same unit as
        ``Time``. Defaults to ``0.000004``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id``, one column per (signal, step). Step labels are
        strings, so columns within a signal are ordered lexicographically
        (``_0, _1, _10, _100, ...``). Duplicate (id, step) rows, if any, are
        combined with ``pivot_table``'s default aggregation (mean).
    """
    # Copy only the columns we need so the caller's frame is never modified;
    # the original in-place division made a second call produce wrong steps.
    frame = data[['id', 'Time', 'S1', 'S2', 'S3', 'S4']].copy()

    # convert time sec to order number. Round before the integer cast: float
    # division can land just below the exact integer, and a plain cast truncates.
    step_number = np.round(frame['Time'] / time_step).astype(np.int16)
    frame['Time'] = step_number.astype(str)

    # pivot
    data_pivot = frame.pivot_table(index='id', values=['S1', 'S2', 'S3', 'S4'], columns='Time')
    data_pivot.columns = ['_'.join(x) for x in data_pivot.columns.values]

    return data_pivot

In [26]:
# Reshape the long-format training features into one wide row per id.
trn = preprocess(train_f)

In [27]:
# Preview the wide training matrix (one column per signal/step pair).
trn.head()

Unnamed: 0_level_0,S1_0,S1_1,S1_10,S1_100,S1_101,S1_102,S1_103,S1_104,S1_105,S1_106,...,S4_90,S4_91,S4_92,S4_93,S4_94,S4_95,S4_96,S4_97,S4_98,S4_99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,-4.972607e-08,139454.8,168428.9,191703.7,208145.9,219978.2,224587.2,218900.5,...,-215.4438,-14557.57,-33901.75,-52578.66,-62459.04,-53790.86,-32473.27,-11799.29,9057.05,34479.76
1,0.0,0.0,0.0,8151.302,4888.677,-615.5879,-12015.93,-30952.12,-48913.93,-49925.75,...,1024173.0,1095499.0,1061209.0,909563.4,648399.0,339156.3,-42677.19,-428476.1,-812719.7,-1074859.0
2,0.0,0.0,-187.4429,-16723.0,-16074.19,-16368.09,-13730.92,-11915.05,-13439.89,-14359.6,...,79126.16,113238.9,139822.0,147123.5,132768.6,93479.66,28656.93,-41169.07,-110139.0,-161226.8
3,0.0,0.0,0.0,-73672.23,-82998.01,-90253.48,-89981.46,-77092.49,-52874.01,-24631.03,...,39165.71,-17841.29,-75468.21,-124241.6,-176415.8,-229249.6,-289117.7,-343540.6,-398224.6,-441401.9
4,0.0,0.0,-5.39542,-124438.1,-134954.0,-144245.1,-153940.1,-164849.0,-173207.0,-175876.5,...,5953.886,48055.03,93281.8,132627.0,171026.9,201571.8,226705.7,238129.6,238082.5,227932.5


In [36]:
# NOTE(review): `val_id` is not defined in any visible cell, so this cell raised
# NameError on a fresh kernel. A seeded 20% hold-out of ids is used here as a
# stand-in -- confirm the intended validation ids.
val_id = train_t['id'].sample(frac=0.2, random_state=83).values

# Build each membership mask once and reuse it for both halves of the split.
in_val_features = trn.index.isin(val_id)
in_val_targets = train_t['id'].isin(val_id)

train = trn.loc[~in_val_features]
valid = trn.loc[in_val_features]
train_y = train_t.loc[~in_val_targets]
valid_y = train_t.loc[in_val_targets]

In [45]:
# Reshape the long-format test features into one wide row per id.
test = preprocess(test_f)

In [58]:
import lightgbm as lgb
from sklearn.model_selection import KFold

In [96]:
# LightGBM training parameters, passed unchanged to every lgb.train call.
params = {
    'learning_rate': 0.05,
    'num_boost_round': 1000,
    # NOTE(review): with max_depth 4 a tree can hold at most 2**4 = 16 leaves,
    # so max_depth rather than num_leaves looks like the binding limit -- confirm.
    'num_leaves': 64,
    'objective': 'mse',
    'boosting': 'gbdt',
    'seed': 83,
    'max_depth': 4,
    'bagging_fraction': 0.7,
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
    # without it the 0.7 above was silently ignored. Resample every iteration.
    'bagging_freq': 1,
    'feature_fraction': 0.7,
    'early_stopping_round': 100,
    'lambda_l2': 0.00001,
    'verbosity': -1,
}

In [97]:
# Number of cross-validation folds.
nfold = 10
# Shuffled K-fold splitter with a fixed random_state.
fold = KFold(n_splits=nfold, shuffle=True, random_state=25)

In [None]:
targets = ['X', 'Y', 'M', 'V']

# Line the targets up with the rows of `trn` by id, so the positional fold
# indices pick matching feature/target rows even if the tables are ordered differently.
target_table = train_t.set_index('id').loc[trn.index]
assert len(target_table) == len(trn), "expected exactly one target row per id"

# Size the accumulators from the test matrix instead of hard-coding 700 rows.
predict = {target: np.zeros(len(test)) for target in targets}
# Best validation l2 score per fold, one list per target.
loss = {target: [] for target in targets}

for train_idx, val_idx in fold.split(trn):
    # The fold slices do not depend on the target, so take them once per fold.
    trainx_fold = trn.iloc[train_idx]
    validx_fold = trn.iloc[val_idx]
    trainy_fold = target_table.iloc[train_idx]
    validy_fold = target_table.iloc[val_idx]

    # One independent regressor per target column.
    for target in targets:
        train_dataset = lgb.Dataset(trainx_fold, trainy_fold[target])
        valid_dataset = lgb.Dataset(validx_fold, validy_fold[target])

        model = lgb.train(params, train_set=train_dataset, valid_sets=[valid_dataset])
        # Test predictions are summed across folds here and divided by nfold
        # when the submission is built.
        predict[target] += model.predict(test)
        loss[target].append(model.best_score['valid_0']['l2'])

In [100]:
# Load the sample submission file as the frame to fill in.
submission = pd.read_csv(os.path.join(root_dir, 'sample_submission.csv'))

# Predictions were summed over the folds; divide by the fold count to average them.
for column in ('X', 'Y', 'M', 'V'):
    submission[column] = predict[column] / nfold

output_path = os.path.join(root_dir, 'lgb_cv_20200622.csv')
submission.to_csv(output_path, index=False)