## Load data

In [15]:
import pandas as pd
import numpy as np

# Read the train and test tables from the CSV files next to the notebook.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [16]:
# Remove PRODUCT_ID and TIMESTAMP from both frames; train additionally loses
# Y_Quality. Y_Class stays in train_df at this point.
train_df = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'TIMESTAMP'])
test_df = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [17]:
from sklearn.preprocessing import LabelEncoder

qual_col = ['LINE','PRODUCT_CODE']

# Integer-encode each listed column with an encoder fitted on the train frame.
for col in qual_col:
    le = LabelEncoder().fit(train_df[col])
    train_df[col] = le.transform(train_df[col])

    # Labels present only in the test frame are appended to the fitted classes
    # so that transform() below does not raise on them.
    unseen = [label for label in np.unique(test_df[col]) if label not in le.classes_]
    for label in unseen:
        le.classes_ = np.append(le.classes_, label)

    test_df[col] = le.transform(test_df[col])

print('done')

done


## Split

In [18]:
def split_line(df, kind):
    """Split ``df`` by its ``LINE`` column and publish the pieces as globals.

    For every distinct non-null ``LINE`` value the matching rows are stored in
    the global ``{kind}_x_line_{line}`` and appended to the global list
    ``{kind}_x_line_set``. When ``df`` has a ``Y_Class`` column, that column is
    stored per line in ``{kind}_y_line_{line}`` / ``{kind}_y_line_set`` and
    removed from the x frames; otherwise ``{kind}_y_line_set`` is left empty.

    The lists are ordered by ascending ``LINE`` value rather than by row count,
    so position ``i`` refers to the same line for every frame that contains the
    same set of lines (callers pair the train and test lists by position).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``LINE`` column and, optionally, a ``Y_Class`` column.
        It is not modified.
    kind : str
        Prefix used for the generated global names (e.g. ``'train'``).

    Returns
    -------
    None
        Results are exposed only through the module-level globals.
    """
    namespace = globals()

    # Explicit check instead of a blanket try/except, so that genuine errors
    # are no longer swallowed.
    has_target = 'Y_Class' in df.columns

    line_values = sorted(df['LINE'].dropna().unique().tolist())

    x_line_set = []
    y_line_set = []
    for line in line_values:
        rows = df[df['LINE'] == line]
        if has_target:
            y_line = rows['Y_Class']
            namespace[f'{kind}_y_line_{line}'] = y_line
            y_line_set.append(y_line)
            # Non-mutating drop: avoids modifying a slice of ``df`` in place.
            rows = rows.drop(columns='Y_Class')
        namespace[f'{kind}_x_line_{line}'] = rows
        x_line_set.append(rows)

    namespace[f'{kind}_x_line_set'] = x_line_set
    namespace[f'{kind}_y_line_set'] = y_line_set

In [19]:
import warnings

# Install a blanket "ignore" filter covering every warning category.
warnings.simplefilter('ignore')

# Populate the per-LINE globals for the train frame and the test frame.
split_line(df=train_df, kind='train')
split_line(df=test_df, kind='test')

## Preprocessing

In [20]:
# drop_fill()

def drop_fill(x, threshold=0.3):
    """Mean-impute the sparsely missing columns of ``x`` in place.

    A column is filled with its own mean when it has at least one missing
    value and its missing fraction is strictly below ``threshold``. Columns at
    or above the threshold are left as they are (nothing is dropped), and
    columns without missing values are skipped entirely, so complete
    non-numeric columns never have a mean computed.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to impute. It is modified in place.
    threshold : float, default 0.3
        Exclusive upper bound on the missing fraction for a column to be
        filled.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``x``.
    """
    # Fraction of missing values per column. For a frame with no rows this is
    # NaN, which fails both comparisons below, so nothing is filled.
    null_ratio = x.isnull().mean()

    needs_fill = (null_ratio > 0) & (null_ratio < threshold)
    for col in null_ratio.index[needs_fill]:
        x[col] = x[col].fillna(x[col].mean())

    return x

## Model

In [21]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBRFClassifier, XGBClassifier

# Candidate classifiers. The two XGBoost models use library defaults; the
# LightGBM model overrides n_estimators and boost_from_average.
model_XGB = XGBClassifier()
model_XGBRF = XGBRFClassifier()
model_lgbm = LGBMClassifier(
    n_estimators=200,
    boost_from_average=False,
)

In [22]:
def fit_accuracy(model, x, y, test_size=0.15, random_state=42):
    """Fit ``model`` on a hold-out split of ``(x, y)`` and report its accuracy.

    The data is split with ``train_test_split``; the model is fitted on the
    training part and scored on the validation part. The accuracy is printed
    and also returned so callers can collect it.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(x, y)`` and ``predict(x)``. It is fitted in
        place.
    x, y
        Features and labels, passed straight to ``train_test_split``.
    test_size : float, default 0.15
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    The value returned by ``accuracy_score`` for the validation part.
    """
    x_t, x_v, y_t, y_v = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    model.fit(x_t, y_t)
    pred = model.predict(x_v)

    # accuracy_score's documented argument order is (y_true, y_pred).
    accuracy = accuracy_score(y_v, pred)
    print('accuracy:', accuracy, '\n')

    return accuracy

In [23]:
def valid(model, train_x_line_set, train_y_line_set):
    """Print a hold-out accuracy for every per-line training set.

    Each feature frame is passed through ``drop_fill`` and then, together with
    its labels, through ``fit_accuracy``. All supplied lines are processed; the
    number of lines is taken from the inputs instead of being fixed at six.

    Parameters
    ----------
    model : object
        Estimator handed to ``fit_accuracy``; the same instance is re-fitted
        for each line.
    train_x_line_set : sequence
        Per-line feature frames.
    train_y_line_set : sequence
        Per-line labels, in the same order as ``train_x_line_set``.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    if len(train_x_line_set) != len(train_y_line_set):
        raise ValueError(
            'train_x_line_set and train_y_line_set must have the same length'
        )

    line_pairs = zip(train_x_line_set, train_y_line_set)
    for number, (line_x, line_y) in enumerate(line_pairs, start=1):
        print(f'line {number}')
        x = drop_fill(line_x)
        y = line_y

        fit_accuracy(model, x, y)

## Submit

In [24]:
def test(model, train_x, train_y, test_x):
    
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    return pred

In [25]:
#split_line(test_df, 'test')

def test_submit(model, train_x_line_set, train_y_line_set, test_x_line_set):

    pred_df_list = []

    for idx, train_x in enumerate(train_x_line_set):
        train_y = train_y_line_set[idx]
        test_x = test_x_line_set[idx]
        
        data = test(model, train_x, train_y, test_x)
        pred_df = pd.DataFrame(data, index=test_x.index)
        
        pred_df_list.append(pred_df)
        pred = pd.concat(pred_df_list, axis=0).sort_index()
    
    return pred

def submit_(pred, sample_path='./sample_submission.csv',
            output_path='0220_LGBM_split.csv'):
    """Write ``pred`` into the ``Y_Class`` column of the sample submission.

    The sample file is read, its ``Y_Class`` column is replaced by ``pred`` and
    the result is written to ``output_path`` without the index column.

    Parameters
    ----------
    pred : pandas.DataFrame, pandas.Series or sequence
        Predictions. pandas objects are aligned on the row index of the sample
        file when assigned.
    sample_path : str, default './sample_submission.csv'
        CSV to use as the submission template.
    output_path : str, default '0220_LGBM_split.csv'
        Destination CSV.
    """
    submit_csv = pd.read_csv(sample_path)

    if isinstance(pred, pd.DataFrame) and pred.shape[1] == 1:
        # A one-column frame (as returned by test_submit) is reduced to a
        # Series so the assignment aligns on the row index only and the label
        # of the prediction column plays no role.
        pred = pred.iloc[:, 0]

    submit_csv['Y_Class'] = pred
    submit_csv.to_csv(output_path, index=False)


In [27]:
pred = test_submit(model_lgbm, train_x_line_set, train_y_line_set, test_x_line_set)
submit_(pred)