## Import

In [12]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [13]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Value handed unchanged to ``random.seed`` and ``np.random.seed``.

    Returns:
        None.

    Note:
        Per the Python documentation, PYTHONHASHSEED is read at interpreter
        start-up, so writing it here is only seen by processes launched
        afterwards; it does not alter hashing in the running kernel.
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


seed_everything(seed=37)  # fix the seed so reruns are reproducible

## Data Load

In [14]:
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [15]:
# Features: everything except the identifier, the timestamp and the two Y_*
# columns. Y_Class is the classification target; Y_Quality is only dropped here
# (presumably it is absent from test.csv -- confirm against the data).
train_x = train_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)
train_y = train_df['Y_Class']

# The test frame only has the identifier and timestamp removed.
test_x = test_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

## Data Pre-processing

In [16]:
# Replace every missing value with 0 in both feature frames, then show the
# training frame as a quick sanity check.
train_x, test_x = (frame.fillna(0) for frame in (train_x, test_x))
print(train_x)

        LINE PRODUCT_CODE   X_1   X_2  X_3   X_4   X_5  X_6   X_7   X_8  ...  \
0    T050304         A_31   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
1    T050307         A_31   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
2    T050304         A_31   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
3    T050307         A_31   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
4    T050304         A_31   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
..       ...          ...   ...   ...  ...   ...   ...  ...   ...   ...  ...   
593  T100306         T_31   2.0  95.0  0.0  45.0  10.0  0.0  50.0  10.0  ...   
594  T050304         A_31   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
595  T050304         A_31   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
596  T100304         O_31  40.0  94.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
597  T100306         O_31  21.0  87.0  0.0  45.0  10.0  0.0  61.0  10.0  ...   

     X_2866  X_2867  X_2868  X_2869  X_

In [17]:
# Convert the qualitative columns to integer codes.
qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    # The encoder is fitted on the training values only.
    le = LabelEncoder().fit(train_x[col])
    train_x[col] = le.transform(train_x[col])

    # Labels that occur only in the test frame are appended to the encoder's
    # class list so that transform() can map them. They receive codes beyond
    # those assigned to the training labels, i.e. values the model never saw
    # while fitting.
    unseen = [lab for lab in np.unique(test_x[col]) if lab not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test_x[col] = le.transform(test_x[col])
print('Done.')

Done.


## Classification Model Fit

In [18]:
# Build the random forest with a fixed random_state, then fit it on the
# training features and the Y_Class target.
RF = RandomForestClassifier(random_state=37)
RF.fit(train_x, train_y)
print('Done.')

# Show the encoded test frame that will be passed to the model.
print(test_x)


Done.
     LINE  PRODUCT_CODE  X_1   X_2  X_3   X_4   X_5  X_6   X_7   X_8  ...  \
0       5             2  2.0  94.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
1       4             2  2.0  93.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
2       4             2  2.0  95.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
3       0             0  0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
4       1             0  0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
..    ...           ...  ...   ...  ...   ...   ...  ...   ...   ...  ...   
305     5             2  2.0  91.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
306     4             2  2.0  96.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
307     5             2  2.0  91.0  0.0  45.0  10.0  0.0  50.0  10.0  ...   
308     5             2  2.0  95.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
309     5             2  2.0  87.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   

     X_2866  X_2867  X_2868  X_2869  X_2870  X_2871  X_2872  X_2873  

## Inference

In [8]:
# Predict a class for every row of the prepared test frame.
preds = RF.predict(X=test_x)
print('Done.')

Done.


## Submit

In [9]:
# Load the submission template that the predictions will be written into.
submit = pd.read_csv(filepath_or_buffer='./sample_submission.csv')

In [10]:
# Store the predicted classes in the template's Y_Class column.
submit = submit.assign(Y_Class=preds)

In [11]:
# Write the submission file without the DataFrame index column.
submit.to_csv(path_or_buf='./baseline_submission.csv', index=False)