In [174]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [175]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## 1️⃣ Load data

In [176]:
import pandas as pd
import numpy as np


train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

## 2️⃣ Preprocessing

### (1)_ drop

In [177]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'TIMESTAMP'])
train_y = train_df['Y_Class']
test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

### (2)_ LabelEncoder

In [178]:
from sklearn.preprocessing import LabelEncoder

# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


### (3)_ Split

In [179]:
train_x['count_null']=train_x.isnull().sum(axis=1)
test_x['count_null']=test_x.isnull().sum(axis=1)

In [180]:
# train
train_x_1 = train_x[train_x['PRODUCT_CODE'] == 0].drop('Y_Class', axis=1)
train_x_2 = train_x[train_x['PRODUCT_CODE'] != 0].drop('Y_Class', axis=1)

train_y_1 = train_x['Y_Class'][train_x['PRODUCT_CODE'] == 0]
train_y_2 = train_x['Y_Class'][train_x['PRODUCT_CODE'] != 0]

print('train_x_1 shape :', train_x_1.shape,
      '\ntrain_x_2 shape :', train_x_2.shape)

print('\ntrain_y_1 shape :', train_y_1.shape,
      '\ntrain_y_2 shape :', train_y_2.shape)



# test
test_x_1 = test_x[test_x['PRODUCT_CODE'] == 0]
test_x_2 = test_x[test_x['PRODUCT_CODE'] != 0]

print('\ntest_x_1 shape :', test_x_1.shape,
      '\ntest_x_2 shape :', test_x_2.shape)


print('done')

train_x_1 shape : (249, 2878) 
train_x_2 shape : (349, 2878)

train_y_1 shape : (249,) 
train_y_2 shape : (349,)

test_x_1 shape : (67, 2878) 
test_x_2 shape : (243, 2878)
done


### (4)_ null 처리

#### zero

In [181]:
# zero
train_x_1_zero = train_x_1.fillna(0)
train_x_2_zero = train_x_2.fillna(0)

# zero
test_x_1_zero = test_x_1.fillna(0)
test_x_2_zero = test_x_2.fillna(0)

#### drop
  
test-train 데이터와 동일한 column을 drop 시켜줘야 한다 🤔🤔🤔🤔🤔   

In [182]:
# 1
train_x_isnull = train_x_1.isnull().sum()
train_1_drop_col = train_x_isnull[train_x_isnull > 0].index
test_x_isnull = test_x_1.isnull().sum()
test_1_drop_col = test_x_isnull[test_x_isnull > 0].index


train_x_1_drop = train_x_1.drop(test_1_drop_col, axis=1).dropna(axis=1)
test_x_1_drop = test_x_1.drop(train_1_drop_col, axis=1).dropna(axis=1)

# 2
train_x_isnull = train_x_2.isnull().sum()
train_2_drop_col = train_x_isnull[train_x_isnull > 0].index
test_x_isnull = test_x_2.isnull().sum()
test_2_drop_col = test_x_isnull[test_x_isnull > 0].index

train_x_2_drop = train_x_2.drop(test_2_drop_col, axis=1).dropna(axis=1)
test_x_2_drop = test_x_2.drop(train_2_drop_col, axis=1).dropna(axis=1)


shape check

In [183]:
print('✅ 1')
print(train_x_1_zero.shape)
print(test_x_1_zero.shape)

print('\n✅ 2')
print(train_x_2_zero.shape)
print(test_x_2_zero.shape)

✅ 1
(249, 2878)
(67, 2878)

✅ 2
(349, 2878)
(243, 2878)


In [184]:
print('✅ 1')
print(train_x_1_drop.shape)
print(test_x_1_drop.shape)

print('\n✅ 2')
print(train_x_2_drop.shape)
print(test_x_2_drop.shape)

✅ 1
(249, 221)
(67, 221)

✅ 2
(349, 233)
(243, 233)


### (5)_standardscaler

In [185]:
from sklearn.preprocessing import StandardScaler

In [186]:
scaler_1_d = StandardScaler()
scaler_2_z = StandardScaler()
scaler_2_d = StandardScaler()
scaler_1_z = StandardScaler()


# 1
### zero
scaler = StandardScaler()

df = train_x_1_zero
scale = scaler.fit_transform(df)
train_x_1_zero = pd.DataFrame(data=scale, columns=df.columns, index=df.index)

df = test_x_1_zero
scale = scaler.transform(df)
test_x_1_zero = pd.DataFrame(data=scale, columns=df.columns, index=df.index)


### drop
df = train_x_1_drop
scale = scaler.fit_transform(df)
train_x_1_drop = pd.DataFrame(data=scale, columns=df.columns, index=df.index)

df = test_x_1_drop
scale = scaler.transform(df)
test_x_1_drop = pd.DataFrame(data=scale, columns=df.columns, index=df.index)


# 2
### zero
df = train_x_2_zero
scale = scaler.fit_transform(df)
train_x_2_zero = pd.DataFrame(data=scale, columns=df.columns, index=df.index)

df = test_x_2_zero
scale = scaler.transform(df)
test_x_2_zero = pd.DataFrame(data=scale, columns=df.columns, index=df.index)


### drop
df = train_x_2_drop
scale = scaler.fit_transform(df)
train_x_2_drop = pd.DataFrame(data=scale, columns=df.columns, index=df.index)

df = test_x_2_drop
scale = scaler.transform(df)
test_x_2_drop = pd.DataFrame(data=scale, columns=df.columns, index=df.index)

### (4)_ train / valid split

In [187]:
from sklearn.model_selection import train_test_split

# zero
z_train_x_1, z_valid_x_1, z_train_y_1, z_valid_y_1= train_test_split(train_x_1_zero, train_y_1, test_size=0.15)
z_train_x_2, z_valid_x_2, z_train_y_2, z_valid_y_2= train_test_split(train_x_2_zero, train_y_2, test_size=0.15)

# drop
d_train_x_1, d_valid_x_1, d_train_y_1, d_valid_y_1= train_test_split(train_x_2_drop, train_y_2, test_size=0.15)
d_train_x_2, d_valid_x_2, d_train_y_2, d_valid_y_2= train_test_split(train_x_1_drop, train_y_1, test_size=0.15)

## 3️⃣ Build model

### (1)_ define model

In [188]:
# model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn import svm

# metric
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

rfc = RandomForestClassifier(random_state=42)
xgbc = XGBClassifier(eval_metric='mlogloss')
svmc = svm.SVC()
gbc = GradientBoostingClassifier(random_state=42)

models = [rfc, xgbc, svmc, gbc]
model_names = ['RandomForestClassifier','XGBClassifier','SVC','GradientBoostingClassifier']

### (2)_ fit, predict

In [189]:
print('### model : 1 ###\n')

print('✅ zero\n')
for idx, i in enumerate(models):
    warnings.filterwarnings('ignore')
    model = i
    model.fit(z_train_x_1, z_train_y_1)
    pred = model.predict(z_valid_x_1)
    print('{} :'.format(model_names[idx]), accuracy_score(pred, z_valid_y_1))

print('\n✅ drop\n')
for idx, i in enumerate(models):
    warnings.filterwarnings('ignore')
    model = i
    model.fit(d_train_x_1, d_train_y_1)
    pred = model.predict(d_valid_x_1)
    print('{} :'.format(model_names[idx]), accuracy_score(pred, d_valid_y_1))


print('\n\n\n### model : 2 ###\n')

print('✅ zero\n')
for idx, i in enumerate(models):
    warnings.filterwarnings('ignore')
    model = i
    model.fit(z_train_x_2, z_train_y_2)
    pred = model.predict(z_valid_x_2)
    print('{} :'.format(model_names[idx]), accuracy_score(pred, z_valid_y_2))

print('\n✅ drop\n')
for idx, i in enumerate(models):
    warnings.filterwarnings('ignore')
    model = i
    model.fit(d_train_x_2, d_train_y_2)
    pred = model.predict(d_valid_x_2)
    print('{} :'.format(model_names[idx]), accuracy_score(pred, d_valid_y_2))

### model : 1 ###

✅ zero

RandomForestClassifier : 0.6842105263157895
XGBClassifier : 0.5789473684210527
SVC : 0.39473684210526316
GradientBoostingClassifier : 0.6842105263157895

✅ drop

RandomForestClassifier : 0.8679245283018868
XGBClassifier : 0.8679245283018868
SVC : 0.8490566037735849
GradientBoostingClassifier : 0.8301886792452831



### model : 2 ###

✅ zero

RandomForestClassifier : 0.8301886792452831
XGBClassifier : 0.8301886792452831
SVC : 0.8301886792452831
GradientBoostingClassifier : 0.8490566037735849

✅ drop

RandomForestClassifier : 0.5789473684210527
XGBClassifier : 0.5263157894736842
SVC : 0.3684210526315789
GradientBoostingClassifier : 0.5263157894736842


### (3)_ feature importance

In [62]:
model = rfc

# 1
model.fit(d_train_x_1, d_train_y_1)
pred1 = model.predict(d_valid_x_1)
print(accuracy_score(pred1, d_valid_y_1))

0.8301886792452831


In [66]:
model = gbc

# 2
model.fit(z_train_x_2, z_train_y_2)
pred2 = model.predict(z_valid_x_2)
print(accuracy_score(pred2, z_valid_y_2))

0.8867924528301887


In [63]:
import seaborn as sns
import matplotlib.pyplot as plt

importance_values = model.feature_importances_

# 정렬과 시각화를 쉽게 하기 위해 series 전환
ft_series = pd.Series(importance_values, index = d_train_x_1.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:100]

# 시각화
plt.figure(figsize=(8,6))
plt.title('Feature Importance Top 2850')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
# 2
model.fit(z_train_x_2, z_train_y_2)
pred2 = model.predict(z_valid_x_2)
print(accuracy_score(pred2, z_valid_y_2))

## 🎯 Submit

In [67]:
# ✅ zero

### 1
train_x_1_zero
test_x_1_zero
### 2
train_x_2_zero
test_x_2_zero

# ✅ drop

### 1
train_x_1_drop
test_x_1_drop
### 2
train_x_2_drop
test_x_2_drop

train_x1 = train_x_1_drop
test_x1 = test_x_1_drop

train_x2 = train_x_2_zero
test_x2 = test_x_2_zero

# rfc, xgbc, svmc, gbc
# 'RandomForestClassifier','XGBClassifier','SVC','GradientBoostingClassifier'



# 1
model = rfc
model.fit(train_x_1_drop, train_y_1)
pred1 = model.predict(test_x_1_drop)

# 2
model = gbc
model.fit(train_x2, train_y_2)
pred2 = model.predict(test_x2)

In [69]:
sub_pred_value = np.concatenate([pred1, pred2])

submit_csv = pd.read_csv('./sample_submission.csv')
submit_csv['Y_Class'] =sub_pred_value
submit_csv.to_csv('0208_submission.csv', index=False)