In [50]:
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings("ignore")

In [51]:
# Load the training set, the test set and the example submission.
# Paths are relative to the notebook's working directory.
train = pd.read_csv('data/first_round_training_data.csv')
test = pd.read_csv('data/first_round_testing_data.csv')
submit = pd.read_csv('data/submit_example.csv')

# Encode the textual quality grade as an integer class id.
dic = {'Excellent': 0, 'Good': 1, 'Pass': 2, 'Fail': 3}
train['label'] = train['Quality_label'].map(dic)

# Series.map() silently yields NaN for any value that is not a key of `dic`;
# fail here with a clear message instead of deep inside CV / training.
unmapped_labels = train.loc[train['label'].isna(), 'Quality_label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Quality_label contains values missing from `dic`: {}".format(list(unmapped_labels))
    )

# Model inputs: columns Parameter1 .. Parameter10 from both train and test.
feathers_p = ["Parameter"+str(i) for i in range(1, 11)]
X = train[feathers_p]
Y = train["label"]
test_x = test[feathers_p]

In [52]:
# Stratified K-fold cross-validation with LightGBM.
#   pred      - out-of-fold class probabilities for every training row
#   pred_test - test-set class probabilities averaged over the folds
N_SPLITS = 5   # number of CV folds; also the divisor for the test-set average
N_CLASSES = 4  # number of target classes (one probability column each)

# Hyper-parameters are the same for every fold, so build them once.
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': N_CLASSES,
    'metric': 'multi_logloss',
    'max_depth': 6,
    'num_leaves': 50,
    'num_threads': -1
}

pred = np.zeros((X.shape[0], N_CLASSES))
pred_test = np.zeros((test_x.shape[0], N_CLASSES))
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=2019, shuffle=True)
for i, (train_index, valid_index) in enumerate(skf.split(X, Y)):
    train_x, valid_x = X.iloc[train_index], X.iloc[valid_index]
    train_y, valid_y = Y.iloc[train_index], Y.iloc[valid_index]
    train_data = lgb.Dataset(train_x, label=train_y)
    valid_data = lgb.Dataset(valid_x, label=valid_y)
    # Train up to 5000 rounds, logging every 100 and stopping once the
    # validation metric has not improved for 100 rounds.
    lgbmodel = lgb.train(params, train_data, valid_sets=[valid_data], num_boost_round=5000, verbose_eval=100,
                         early_stopping_rounds=100)
    # Each training row is predicted exactly once, by the fold that held it out.
    pred[valid_index] = lgbmodel.predict(valid_x)
    # Divide by the actual fold count so the average stays correct if N_SPLITS changes.
    pred_test += lgbmodel.predict(test_x) / N_SPLITS
print('acc:', accuracy_score(Y, np.argmax(pred, axis=1)))

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.15943
[200]	valid_0's multi_logloss: 1.12009
[300]	valid_0's multi_logloss: 1.1108
[400]	valid_0's multi_logloss: 1.10877
Early stopping, best iteration is:
[390]	valid_0's multi_logloss: 1.10859
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.17409
[200]	valid_0's multi_logloss: 1.13803
[300]	valid_0's multi_logloss: 1.12828
[400]	valid_0's multi_logloss: 1.12573
Early stopping, best iteration is:
[395]	valid_0's multi_logloss: 1.1257
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.16293
[200]	valid_0's multi_logloss: 1.1261
[300]	valid_0's multi_logloss: 1.11597
[400]	valid_0's multi_logloss: 1.11417
[500]	valid_0's multi_logloss: 1.11225
Early stopping, best iteration is:
[470]	valid_0's multi_logloss: 1.11192
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_

In [53]:
# Output columns: every column of the example submission except 'Group'.
res_cols = [col for col in submit.columns if col != 'Group']
temp = pd.DataFrame(pred_test, columns=res_cols)
temp['Group'] = test['Group']
# Collapse the per-row predictions to one row per Group by taking the mean.
result = temp.groupby('Group').mean()

import datetime
# Tag the output file with the current month-day, e.g. "result-08-15.csv".
curtime = datetime.datetime.now().strftime('%m-%d')
result.to_csv("result-" + curtime + ".csv", index=True)