In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the CSV files loaded
# later in the notebook are read from a path under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

def seed_everything(seed):
    """Apply one seed to every global source of randomness used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's global random state.

    Args:
        seed: Integer seed value.

    Returns:
        None.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the seed as their only argument, so seed them in one pass.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
# Fix the global RNG seeds before any stochastic step runs.
seed_everything(37)

# Load the train / test tables. The paths are relative, so they resolve against
# the current working directory of the runtime.
train_df = pd.read_csv('./drive/MyDrive/open/train.csv')
test_df = pd.read_csv('./drive/MyDrive/open/test.csv')

# X_2872, X_2873, X_2874, X_2875는 모든 행이 결측이라 필요 없음

## feature끼리 묶을 때 isnull sum 값이 같은 것끼리 묶어도 괜찮을듯?
## 변수 보면 product code에 따라 feature끼리 묶이는게 확실하게 보임


In [None]:
# (rows, columns) of the training table.
train_df.shape

(598, 2881)

In [None]:
# Number of missing values in each column of the training table.
a = train_df.isna().sum()
print(a)

PRODUCT_ID      0
Y_Class         0
Y_Quality       0
TIMESTAMP       0
LINE            0
             ... 
X_2871        499
X_2872        598
X_2873        598
X_2874        598
X_2875        598
Length: 2881, dtype: int64


# quality를 회귀하고 난 후 class를 분류하는건 어때?
## class별로 나뉘는 값이 크지 않음 0.0001 정도...
## 데이터 불균형

In [None]:
# Summary of Y_Quality within each Y_Class: mean, range and row count.
train_df.groupby('Y_Class')[['Y_Quality']].agg(['mean', 'min', 'max', 'count'])

Unnamed: 0_level_0,Y_Quality,Y_Quality,Y_Quality,Y_Quality
Unnamed: 0_level_1,mean,min,max,count
Y_Class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0.520837,0.500856,0.525067,88
1,0.530253,0.525086,0.534843,407
2,0.542031,0.534951,0.578841,103


# Product_code끼리 묶기
클래스 불균형이 심함
O_31은 특히 심함 (샘플이 6개뿐이고 Y_Class 0이 없음)

In [None]:
# Row count of each Y_Class value within each PRODUCT_CODE.
train_df.groupby('PRODUCT_CODE')['Y_Class'].value_counts()

PRODUCT_CODE  Y_Class
A_31          1          118
              2           71
              0           60
O_31          1            4
              2            2
T_31          1          285
              2           30
              0           28
Name: Y_Class, dtype: int64

# 각 product code마다 사용하지 않는 변수 구해둠
A_31은 759개 사용 안하고
T_31은 2198개 사용 안함
O_31은 2208개 사용 안함

In [None]:
# Shape of the A_31 subset, then the columns that are missing in every A_31 row.
print(train_df.loc[train_df['PRODUCT_CODE'] == 'A_31'].shape)
A = train_df.loc[train_df['PRODUCT_CODE'] == 'A_31'].isna().sum().to_frame('sum')
# A column is entirely missing when its null count equals the number of A_31 rows.
A[A['sum'] == (train_df['PRODUCT_CODE'] == 'A_31').sum()]

(249, 2881)


Unnamed: 0,sum
X_1,249
X_2,249
X_3,249
X_4,249
X_5,249
...,...
X_2844,249
X_2872,249
X_2873,249
X_2874,249


In [None]:
# Shape of the T_31 subset, then the columns that are missing in every T_31 row.
print(train_df.loc[train_df['PRODUCT_CODE'] == 'T_31'].shape)
T = train_df.loc[train_df['PRODUCT_CODE'] == 'T_31'].isna().sum().to_frame('sum')
# A column is entirely missing when its null count equals the number of T_31 rows.
T[T['sum'] == (train_df['PRODUCT_CODE'] == 'T_31').sum()]

(343, 2881)


Unnamed: 0,sum
X_128,343
X_129,343
X_130,343
X_131,343
X_132,343
...,...
X_2871,343
X_2872,343
X_2873,343
X_2874,343


In [None]:
# Shape of the O_31 subset, then the columns that are missing in every O_31 row.
# The result is stored in `O` (not `T`) so it no longer overwrites the T_31
# table built in the previous cell.
o31_rows = train_df[train_df['PRODUCT_CODE'] == 'O_31']
print(o31_rows.shape)
O = pd.DataFrame(o31_rows.isnull().sum(), columns=['sum'])
# A column is entirely missing when its null count equals the number of O_31 rows.
O[O['sum'] == len(o31_rows)]

(6, 2881)


Unnamed: 0,sum
X_128,6
X_129,6
X_130,6
X_131,6
X_132,6
...,...
X_2871,6
X_2872,6
X_2873,6
X_2874,6


# 분석

1. time cycle 즉 주기의 간격이 일정하지 않음
2. Product_ID와 LINE은 그룹핑 해야할거 같음
3. 변수가 너무 많음

1. timestamp는 사용 안함 -> 시계열 아닌 그냥 분류
2. Y_Quality를 회귀하고 분류 문제로 풀어도 되지만 Y_Quality에 따른 Y_Class 값이 0.0001 단위로 바뀌기에 좋은 생각인지는 모르겠음
3. 분류 class 불균형 존재
4. LINE과 PRODUCT_CODE끼리 grouping하는건 확실해보임 -> CatBoost 써보기
5. 데이터가 많지가 않음 -> 증강을 해봐야할까 -> 저번처럼 증강한다고 무조건 좋진 않음


In [None]:
# NOTE(review): no version is pinned here, so a fresh run may install a different
# PyCaret release than the one that produced the outputs recorded below.
!pip install pycaret

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

train_df = pd.read_csv('./drive/MyDrive/open/train.csv')
test_df = pd.read_csv('./drive/MyDrive/open/test.csv')

# Drop the row identifier, the timestamp and the continuous target. Y_Class is
# kept in the frame because the PyCaret setup() call later takes it as `target`.
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP','Y_Quality'])

# Y_Class is imbalanced (88 / 407 / 103 rows in the groupby output earlier), so
# stratify the split to keep the same class ratio in the train and validation parts.
x_train, x_val = train_test_split(
    train_x,
    test_size=0.2,
    shuffle=True,
    random_state=37,
    stratify=train_x['Y_Class'],
)

# Missing sensor readings are replaced with 0 in both parts.
x_train = x_train.fillna(0)
x_val = x_val.fillna(0)

qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    # Fit the encoder on the training part only.
    le = LabelEncoder()
    le = le.fit(x_train[col])
    x_train[col] = le.transform(x_train[col])

    # Labels that appear only in the validation part are appended to the
    # encoder's classes so transform() does not raise on them.
    for label in np.unique(x_val[col]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    x_val[col] = le.transform(x_val[col])

In [None]:
from pycaret.classification import *

In [None]:
# NOTE(review): the recorded run of this cell ended in a ValueError. setup() is
# only called in the following cell; presumably create_model() failed because the
# experiment had not been initialised yet — confirm and move this cell after setup().
xgboost = create_model('xgboost')
catboost = create_model('catboost')

INFO:logs:Initializing create_model()
INFO:logs:create_model(estimator=xgboost, fold=None, round=4, cross_validation=True, predict=True, fit_kwargs=None, groups=None, refit=True, verbose=True, system=True, metrics=None, experiment_custom_tags=None, add_to_model_list=True, probability_threshold=None, display=None, return_train_score=False, kwargs={})
INFO:logs:Checking exceptions


ValueError: ignored

In [None]:
# Initialise the PyCaret experiment on the training part with Y_Class as target.
# session_id is fixed to the seed used elsewhere in this notebook (37); without
# it PyCaret draws a random one (160 in the recorded run), so repeated runs
# would not be reproducible.
clf = setup(data=x_train, target='Y_Class', session_id=37)

Unnamed: 0,Description,Value
0,session_id,160
1,Target,Y_Class
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(478, 2878)"
5,Missing Values,False
6,Numeric Features,2655
7,Categorical Features,222
8,Ordinal Features,False
9,High Cardinality Features,False


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 1
INFO:logs:Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[],
                                      ml_usecase='classification',
                                      numerical_features=[], target='Y_Class',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric_stra...
                ('scaling', 'passthrough'), ('P_transform', 'passthrough'),
                ('binn', 'passthrough'), ('rem_outliers', 'passthrough'),
                ('cluste

In [None]:
# Rank the candidate models by F1 and keep the three best (returned as a list).
best_3 = compare_models(sort = 'F1', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7364,0.7957,0.501,0.6952,0.6843,0.3311,0.3864,0.196
rf,Random Forest Classifier,0.7307,0.7849,0.4859,0.6921,0.6711,0.2909,0.3438,0.219
lr,Logistic Regression,0.7004,0.7455,0.483,0.6744,0.6697,0.3025,0.3181,1.362
gbc,Gradient Boosting Classifier,0.7004,0.713,0.4853,0.6739,0.6647,0.2905,0.3161,1.806
knn,K Neighbors Classifier,0.6798,0.7524,0.5153,0.6633,0.664,0.3076,0.3154,0.037
lightgbm,Light Gradient Boosting Machine,0.6971,0.755,0.4849,0.6754,0.6632,0.2867,0.311,0.453
nb,Naive Bayes,0.6282,0.746,0.5505,0.6913,0.6463,0.3309,0.342,0.021
ridge,Ridge Classifier,0.6733,0.0,0.4449,0.6462,0.6396,0.2389,0.2616,0.028
ada,Ada Boost Classifier,0.6674,0.5699,0.4278,0.6226,0.6242,0.188,0.2015,0.24
dt,Decision Tree Classifier,0.6111,0.6113,0.4755,0.6155,0.6067,0.2195,0.2259,0.034


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=160, verbose=0,
                     warm_start=False), RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
  

In [None]:
# Combine the three selected models into one voting ensemble, scored with 10 folds.
blended = blend_models(estimator_list = best_3, fold=10)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7647,0.8883,0.5522,0.7306,0.7336,0.4505,0.4698
1,0.7059,0.8479,0.4821,0.674,0.6723,0.2902,0.3105
2,0.7353,0.8546,0.4855,0.6374,0.6799,0.3585,0.3864
3,0.7941,0.8352,0.63,0.7962,0.7723,0.5031,0.5384
4,0.7576,0.8984,0.5111,0.7561,0.7155,0.4081,0.4566
5,0.6667,0.7169,0.4657,0.6888,0.6416,0.216,0.2325
6,0.697,0.7232,0.4919,0.6554,0.6604,0.2857,0.306
7,0.7273,0.6715,0.5364,0.7262,0.7033,0.3571,0.3825
8,0.6667,0.6519,0.371,0.5384,0.5902,0.0547,0.0693
9,0.7879,0.7985,0.5333,0.8053,0.7428,0.4254,0.4908


INFO:logs:create_model_container: 15
INFO:logs:master_model_container: 15
INFO:logs:display_container: 3
INFO:logs:VotingClassifier(estimators=[('et',
                              ExtraTreesClassifier(bootstrap=False,
                                                   ccp_alpha=0.0,
                                                   class_weight=None,
                                                   criterion='gini',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,

In [None]:
# No `data` argument is passed, so the blended model is scored on the hold-out
# split PyCaret created inside setup(), not on x_val.
# NOTE(review): x_val is prepared earlier but not used in the cells visible here —
# confirm whether it was meant to be passed as `data=x_val`.
preds = predict_model(blended)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=VotingClassifier(estimators=[('et',
                              ExtraTreesClassifier(bootstrap=False,
                                                   ccp_alpha=0.0,
                                                   class_weight=None,
                                                   criterion='gini',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                         

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7153,0.7187,0.4908,0.6723,0.6612,0.329,0.3748
