In [1]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split # 학습, 테스트set 구분
from sklearn.tree import export_graphviz # tree 시각화를 위해
import graphviz # tree 시각화
from sklearn.metrics import f1_score# 성능지표를 계산하기 위해 import
from sklearn.model_selection import cross_val_score, cross_validate # 교차검증

In [2]:
# Directory holding the competition CSVs; change this single constant to run
# the notebook against a different location.
DATA_DIR = '/content/drive/MyDrive/LGaimers'

train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [3]:
# Report (rows, columns) of the raw train and test tables, one per line.
for frame in (train_df, test_df):
  print(frame.shape)

(598, 2881)
(310, 2879)


## 사용 함수 정의

In [4]:
def remove_all_nan(dataframe):
  """Return a copy of ``dataframe`` without the columns that are entirely missing.

  Args:
    dataframe: input frame; it is not modified.

  Returns:
    A new DataFrame in which every column whose values are all null has been
    dropped. A frame with zero rows is returned with all of its columns.
  """
  # With no rows there is nothing to judge a column by, so keep every column.
  if len(dataframe) == 0:
    return dataframe.drop(columns=[])

  # One vectorised pass: number of nulls per column.
  null_counts = dataframe.isnull().sum()
  del_col = null_counts[null_counts == len(dataframe)].index.tolist()

  return dataframe.drop(columns=del_col)

In [5]:
'''값이 1개 존재하면 제거'''

def remove_one_value(dataframe):
  """Return ``dataframe`` minus every column holding exactly one distinct value.

  ``Series.nunique`` ignores missing values by default, so a column mixing a
  single value with NaNs is dropped too, while an all-NaN column (zero
  distinct values) is kept.
  """
  constant_columns = [
      name for name in dataframe.columns
      if dataframe[name].nunique() == 1
  ]
  return dataframe.drop(columns=constant_columns)

In [6]:
# iterativeimputer 함수 정의
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

def _impute_chunk(imputer, input_df, target_df, key_cols, chunk_cols):
  """Fit on key + chunk columns of input_df; return the imputed chunk columns of target_df."""
  subset = pd.concat([input_df[key_cols], input_df[chunk_cols]], axis=1)
  imputer.fit(subset)

  target_subset = pd.concat([target_df[key_cols], target_df[chunk_cols]], axis=1)
  impute_subset = pd.DataFrame(imputer.transform(target_subset), columns=target_subset.columns)
  # The key columns only act as predictors here; the caller keeps the target's
  # own (un-imputed) copy of them.
  return impute_subset.drop(key_cols, axis=1)


def iterativeimputer_subset(input_df,target_df,LINE_NUM):
  """Impute the feature columns of ``target_df`` chunk by chunk with IterativeImputer.

  The last ``LINE_NUM`` columns of ``input_df`` are treated as key columns. The
  remaining feature columns are processed in groups of up to 100: for each
  group the imputer is fitted on ``input_df`` (key + group columns) and then
  used to transform the same columns of ``target_df``.

  Args:
    input_df: frame the imputer is fitted on. Its column order defines the key
      columns (trailing ``LINE_NUM``) and the feature columns (everything before).
    target_df: frame to impute; it must contain every column of ``input_df``.
    LINE_NUM: number of trailing key columns. 0 means "no key columns".

  Returns:
    DataFrame with a fresh 0..n-1 index: the key columns of ``target_df`` exactly
    as passed in (not imputed), followed by the imputed feature columns in their
    original order. Each feature column appears exactly once.
  """
  imputer = IterativeImputer(random_state=1234)

  # Slice by an explicit count: ``columns[:-LINE_NUM]`` silently selects nothing
  # when LINE_NUM is 0.
  n_feature_cols = max(len(input_df.columns) - LINE_NUM, 0)
  cols = input_df.columns[:n_feature_cols]
  key_cols = input_df.columns[n_feature_cols:]

  total_subset = target_df[key_cols].reset_index(drop=True)
  interval = 100

  # Stepping through range() yields the full chunks and, only when one exists,
  # the final partial chunk. A feature count that is an exact multiple of
  # ``interval`` therefore no longer triggers a second pass over every column.
  for i, start in enumerate(range(0, len(cols), interval)):
    print(i,'번째 merge 진행중 ...')
    impute_subset = _impute_chunk(imputer, input_df, target_df, key_cols, cols[start : start + interval])

    print("기존 total : ", total_subset.shape, "기존 impute_subset : ", impute_subset.shape)
    total_subset = pd.concat([total_subset.reset_index(drop=True), impute_subset.reset_index(drop=True)], axis=1)
    print("병합 total : ", total_subset.shape)
    print('------------------------------------------------------')

  return total_subset

# PRODUCT CODE별 dataset 구축

In [7]:
# Partition the training rows by product code.
train_product_code = train_df['PRODUCT_CODE']
trainA_31 = train_df.loc[train_product_code == 'A_31']
trainT_31 = train_df.loc[train_product_code == 'T_31']
trainO_31 = train_df.loc[train_product_code == 'O_31']

In [8]:
# Partition the test rows by product code. Each slice is copied because new
# columns ('Y_quanlity', 'Y_Class') are assigned to these frames later in the
# notebook; writing to a filtered slice of test_df triggers
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
testA_31 = test_df[test_df['PRODUCT_CODE'] == 'A_31'].copy()
testT_31 = test_df[test_df['PRODUCT_CODE'] == 'T_31'].copy()
testO_31 = test_df[test_df['PRODUCT_CODE'] == 'O_31'].copy()

In [9]:
# Drop the features that are entirely NaN within each product-code subset.
trainA_31, trainT_31, trainO_31 = map(remove_all_nan, (trainA_31, trainT_31, trainO_31))

In [10]:
# Drop the features that hold a single distinct value within each product-code subset.
trainA_31, trainT_31, trainO_31 = [
    remove_one_value(frame) for frame in (trainA_31, trainT_31, trainO_31)
]

In [11]:
# Shapes of the A_31 / T_31 / O_31 training subsets after column pruning.
print(*(frame.shape for frame in (trainA_31, trainT_31, trainO_31)))

(249, 1870) (343, 554) (6, 503)


In [12]:
# Identifier, timestamp and the two target columns are not model inputs.
non_feature_cols = ['PRODUCT_ID','TIMESTAMP','Y_Class','Y_Quality']
trainA_31_x = trainA_31.drop(non_feature_cols, axis=1)
trainT_31_x = trainT_31.drop(non_feature_cols, axis=1)
trainO_31_x = trainO_31.drop(non_feature_cols, axis=1)

In [13]:
# One-hot encode the non-numeric columns; pandas appends the dummy columns
# after the untouched ones.
trainA_31_x, trainT_31_x, trainO_31_x = (
    pd.get_dummies(frame) for frame in (trainA_31_x, trainT_31_x, trainO_31_x)
)

In [14]:
# Re-attach both targets as the last two columns of each encoded feature frame.
target_cols = ['Y_Class', 'Y_Quality']
trainA_31_x = pd.concat([trainA_31_x, trainA_31[target_cols]], axis=1)
trainT_31_x = pd.concat([trainT_31_x, trainT_31[target_cols]], axis=1)
trainO_31_x = pd.concat([trainO_31_x, trainO_31[target_cols]], axis=1)

## PRODUCT_CODE 별 iterativeimputer 적용
- 이때 Y_Class와 Y_Quality는 key_cols에 포함시킴.
- test에 transform시에는 해당 column 제거함.

In [15]:
# Impute each training frame against itself. LINE_NUM is the number of trailing
# key columns; presumably the LINE dummies plus Y_Class and Y_Quality
# (6 for A_31, 4 for T_31 and O_31) — TODO confirm against the frame layout.
trainA_31_x = iterativeimputer_subset(input_df=trainA_31_x, target_df=trainA_31_x, LINE_NUM=6)
trainT_31_x = iterativeimputer_subset(input_df=trainT_31_x, target_df=trainT_31_x, LINE_NUM=4)
trainO_31_x = iterativeimputer_subset(input_df=trainO_31_x, target_df=trainO_31_x, LINE_NUM=4)

0 번째 merge 진행중 ...




기존 total :  (249, 6) 기존 impute_subset :  (249, 100)
병합 total :  (249, 106)
------------------------------------------------------
1 번째 merge 진행중 ...




기존 total :  (249, 106) 기존 impute_subset :  (249, 100)
병합 total :  (249, 206)
------------------------------------------------------
2 번째 merge 진행중 ...


  eigen_vals_ = S ** 2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
  eigen_vals_ = S ** 2
  gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))


ValueError: ignored

In [None]:
# Remove Y_Class / Y_Quality so the training frames only hold columns that can
# also be built for the test set before the imputer is applied to it.
target_cols = ['Y_Class','Y_Quality']
trainA_31_x = trainA_31_x.drop(target_cols, axis=1)
trainT_31_x = trainT_31_x.drop(target_cols, axis=1)
trainO_31_x = trainO_31_x.drop(target_cols, axis=1)

In [None]:
# Rotate the leading columns (4 for A_31, 2 for T_31 and O_31) to the end of
# each frame so they become the trailing key columns again.
new_col = [*trainA_31_x.columns[4:], *trainA_31_x.columns[:4]]
trainA_31_x = trainA_31_x.loc[:, new_col]

new_col = [*trainT_31_x.columns[2:], *trainT_31_x.columns[:2]]
trainT_31_x = trainT_31_x.loc[:, new_col]

new_col = [*trainO_31_x.columns[2:], *trainO_31_x.columns[:2]]
trainO_31_x = trainO_31_x.loc[:, new_col]

In [None]:
# Display the reordered A_31 training feature frame as the cell output.
trainA_31_x

In [None]:
# Build each test feature frame from the training feature columns plus LINE,
# one-hot encode LINE, then align the result to the training column layout.
# reindex adds an all-zero dummy for a LINE value that occurs in training but
# not in this test slice, and drops dummies for LINE values never seen in
# training, so the imputer and the model always receive the columns they were
# fitted on. When both sets contain the same LINE values it is a no-op.
testA_31_x = pd.concat([testA_31[trainA_31_x.columns[:-4]], testA_31['LINE']], axis=1)
testA_31_x = pd.get_dummies(testA_31_x).reindex(columns=trainA_31_x.columns, fill_value=0)

testT_31_x = pd.concat([testT_31[trainT_31_x.columns[:-2]], testT_31['LINE']], axis=1)
testT_31_x = pd.get_dummies(testT_31_x).reindex(columns=trainT_31_x.columns, fill_value=0)

testO_31_x = pd.concat([testO_31[trainO_31_x.columns[:-2]], testO_31['LINE']], axis=1)
testO_31_x = pd.get_dummies(testO_31_x).reindex(columns=trainO_31_x.columns, fill_value=0)

In [None]:
# Display the one-hot encoded A_31 test feature frame (not yet imputed) as the cell output.
testA_31_x

In [None]:
# Fit the imputer on the training features and apply it to the test features.
# LINE_NUM is the number of trailing key columns in each training frame.
testA_31_x = iterativeimputer_subset(input_df=trainA_31_x, target_df=testA_31_x, LINE_NUM=4)
testT_31_x = iterativeimputer_subset(input_df=trainT_31_x, target_df=testT_31_x, LINE_NUM=2)
testO_31_x = iterativeimputer_subset(input_df=trainO_31_x, target_df=testO_31_x, LINE_NUM=2)

# Disabled alternative kept for reference: constant fill instead of model-based imputation.
# testA_31_x = testA_31_x.fillna(-1)
# testT_31_x = testT_31_x.fillna(-1)
# testO_31_x = testO_31_x.fillna(-1)

Dataset:  
- trainA_31  
- trainT_31  
- trainO_31  

test 결측치
  - train에 iterativeimputer fit, test에 transform 적용

Dataset:
- trainA_31_x, testA_31_x
- trainT_31_x, testT_31_x 
- trainO_31_x, testO_31_x

In [None]:
# Classification targets (Y_Class), one Series per product code.
trainA_31_y_c, trainT_31_y_c, trainO_31_y_c = (
    frame['Y_Class'] for frame in (trainA_31, trainT_31, trainO_31)
)

# Regression targets (Y_Quality), one Series per product code.
trainA_31_y_r, trainT_31_y_r, trainO_31_y_r = (
    frame['Y_Quality'] for frame in (trainA_31, trainT_31, trainO_31)
)

## Model

In [None]:
# Install CatBoost into the directory named by the `my_path` variable.
# NOTE(review): `my_path` is not assigned anywhere in the visible notebook — confirm it is defined before this cell runs.
!pip install --target=$my_path catboost

In [None]:
# NOTE(review): wildcard import; the visible cells only use CatBoostRegressor from this package.
from catboost import *

In [None]:
# Regress Y_Quality for product A_31 and predict it for the A_31 test rows.
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.033,
    random_state=1234,
    verbose=500,
)
model.fit(trainA_31_x, trainA_31_y_r)
pred_a = model.predict(testA_31_x)

In [None]:
# Fit on the T_31 training data and predict Y_Quality for the T_31 test rows.
# The constructor below is disabled, so this cell reuses the `model` object
# created in an earlier cell, with whatever hyperparameters it was built with.
# NOTE(review): presumably fit() retrains from scratch rather than continuing
# from the previous product's fit — confirm against the CatBoost docs.
#model = CatBoostRegressor(random_state=110,verbose=500,iterations=500)
model.fit(trainT_31_x, trainT_31_y_r)
pred_t = model.predict(testT_31_x)

In [None]:
# Fit on the O_31 training data and predict Y_Quality for the O_31 test rows.
# The constructor below is disabled, so this cell reuses the `model` object
# created in an earlier cell, with whatever hyperparameters it was built with.
# NOTE(review): presumably fit() retrains from scratch rather than continuing
# from the previous product's fit — confirm against the CatBoost docs.
#model = CatBoostRegressor(random_state=110,verbose=500,iterations=500)
model.fit(trainO_31_x, trainO_31_y_r)
pred_o = model.predict(testO_31_x)

In [None]:
# Store the predicted quality on each test subset.
# NOTE(review): the column name 'Y_quanlity' is misspelt but is read back under
# the same spelling when the class thresholds are applied; rename both together.
for frame, predicted_quality in ((testA_31, pred_a), (testT_31, pred_t), (testO_31, pred_o)):
  frame['Y_quanlity'] = predicted_quality

In [None]:
# Every test row starts as class 1; other classes are assigned from the
# predicted quality afterwards.
for frame in (testA_31, testT_31, testO_31):
  frame['Y_Class'] = 1

In [None]:
# Map predicted quality to a class with the same two cut-offs for every
# product: below 0.52507 -> class 0, above 0.5349 -> class 2. Rows in between
# (bounds included) keep the class they already have.
for frame in (testA_31, testT_31, testO_31):
  frame.loc[frame['Y_quanlity'] < 0.52507, 'Y_Class'] = 0
  frame.loc[frame['Y_quanlity'] > 0.5349, 'Y_Class'] = 2

In [None]:
# Parse the sample submission once and hand each product code its own copy,
# instead of reading the same CSV from disk three times.
submita = pd.read_csv('/content/drive/MyDrive/LGaimers/sample_submission.csv')
submitt = submita.copy()
submito = submita.copy()

In [None]:
# Inner-join the submission IDs with each product's predicted class, keeping
# only the rows that belong to that product code.
submita = submita[['PRODUCT_ID']].merge(testA_31[['PRODUCT_ID','Y_Class']], on='PRODUCT_ID')
submitt = submitt[['PRODUCT_ID']].merge(testT_31[['PRODUCT_ID','Y_Class']], on='PRODUCT_ID')
submito = submito[['PRODUCT_ID']].merge(testO_31[['PRODUCT_ID','Y_Class']], on='PRODUCT_ID')

In [None]:
# Stack the three per-product results, order them by PRODUCT_ID and write the
# submission file without the index column.
submission = pd.concat([submita,submitt,submito]).sort_values(by='PRODUCT_ID')
submission.to_csv('iterativeimputer.csv',index=False)

In [None]:
# Read the submission back from the same relative path it was written to, so
# the cell does not depend on the working directory being /content.
result = pd.read_csv('iterativeimputer.csv')
cat2 = pd.read_csv('/content/캣2_5.csv')

In [None]:
# Class distribution of the submission loaded into `result`.
result['Y_Class'].value_counts()

In [None]:
from collections import Counter
# Count the rows where the two prediction files disagree (True) or agree (False).
# NOTE(review): the element-wise comparison needs both Series to carry identical
# index labels — confirm the two CSVs have the same number of rows.
Counter(result['Y_Class'] != cat2['Y_Class'])

In [None]:
# Class distribution of the predictions loaded into `cat2`.
cat2['Y_Class'].value_counts()