### 본 코드는 사이킷런(sklearn)으로 수행하였으며, K-fold(교차검증) 방식과 2가지 방식의 모델(Multi-layer perceptron과 LinearRegression)으로 수행하였습니다.

In [1]:
import pandas as pd
import numpy as np

# Multi-layer perceptron 모델 구성
from sklearn.neural_network import MLPRegressor

# LinearRegression 모델 구성
from sklearn.linear_model import LinearRegression

# K-fold
from sklearn.model_selection import KFold

from sklearn import preprocessing

### 데이터 불러오기
#### csv 파일 읽어들이기

In [2]:
train_df = pd.read_csv('./dataset/iris_train.csv')
train_df

Unnamed: 0,id,species,sepal length (cm),petal length (cm),sepal width (cm),petal width (cm)
0,0,setosa,4.4,1.4,2.9,0.2
1,1,versicolor,6.4,4.5,3.2,1.5
2,2,virginica,6.2,4.8,2.8,1.8
3,3,virginica,7.2,6.1,3.6,2.5
4,4,setosa,4.9,1.4,3.0,0.2
...,...,...,...,...,...,...
70,70,versicolor,6.5,4.6,2.8,1.5
71,71,versicolor,5.6,3.6,2.9,1.3
72,72,versicolor,6.2,4.5,2.2,1.5
73,73,versicolor,4.9,3.3,2.4,1.0


### 학습 데이터 전처리 & 학습 데이터/결과 데이터 분류

In [3]:
train_df['species'] = train_df['species'].map({'setosa':0, 'versicolor': 1, 'virginica': 2})

X, Y = train_df.iloc[:,1:4] , train_df.iloc[:, 4:]

### data normalization

In [4]:
# standarlization
X = preprocessing.StandardScaler().fit_transform(X)
# min-max
#X = preprocessing.MinMaxScaler().fit_transform(X)

In [5]:
def MAE(true, pred):
    score = np.mean(np.abs(true-pred))
    return score


model_arr = []
MAE_error_record = []

split_count = 5

kf = KFold(n_splits= split_count)
kf.get_n_splits(X)

for train_index, test_index in kf.split(X):

    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = Y.values[train_index], Y.values[test_index]
    
    # multi-layer perceptron model. 으로 해봤을때 성능 확인
    # created_model = MLPRegressor().fit(x_train, y_train)

    # LinearRegression 으로 확인
    created_model = LinearRegression().fit(x_train, y_train)
    
    pred = created_model.predict(x_test)
    
    MAE_error_record.append(MAE(y_test, pred))
        
    model_arr.append(created_model)

In [6]:
print(model_arr)
print(MAE_error_record)

[LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()]
[0.19804149075825211, 0.18687865136313317, 0.19352196334840516, 0.2173408938069675, 0.18507301564446058]


### K-fold의 분류 수 만큼, 모델을 생성 한 뒤   모델 중 가장 MSE Error 값이 낮은 모델을 선정

In [7]:
find_model_num = np.where(MAE_error_record == np.min(MAE_error_record))[0][0]
print(f"Find model Num :  {find_model_num} ")

final_model = model_arr[find_model_num]
print(MAE_error_record[find_model_num])

Find model Num :  4 
0.18507301564446058


### test 결과 값 추정

In [8]:
result_df = pd.read_csv('./dataset/iris_test.csv')
result_df['species'] = result_df['species'].map({'setosa':0, 'versicolor': 1, 'virginica': 2})

x_result = result_df.iloc[:,1:4]
x_result = preprocessing.StandardScaler().fit_transform(x_result)

y_result = final_model.predict(x_result)

In [9]:
submission = pd.read_csv('./dataset/sample_submission.csv')
submission['sepal width (cm)'] = y_result[:,0]
submission['petal width (cm)'] = y_result[:,1]

### submission 파일이 제대로 들어갔는지 확인

In [10]:
submission

Unnamed: 0,id,sepal width (cm),petal width (cm)
0,0,4.103581,-1.934637
1,1,5.149807,-2.205094
2,2,4.103524,-2.112664
3,3,3.057354,-1.664180
4,4,2.393118,2.465300
...,...,...,...
70,70,2.529242,0.996478
71,71,2.010900,-2.105831
72,72,2.131773,3.200515
73,73,2.005817,0.152558


In [11]:
submission.to_csv("result.csv", index=False)

## 추가, 본 코드에서는 MAE 값이 가장 작은 모델을 선정하였지만. 모든 모델의 predict 값을 낸 뒤 평균을 내는 방법도 있습니다.### 

In [12]:
total_result = np.zeros_like(y_result)

for each_model in model_arr:
    
    each_result = each_model.predict(x_result)
    
    total_result += each_result
    
total_result /= split_count

In [13]:
submission2 = pd.read_csv('./dataset/sample_submission.csv')
submission2['sepal width (cm)'] = y_result[:,0]
submission2['petal width (cm)'] = y_result[:,1]

submission2.to_csv("result2.csv", index=False)