In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### 1. 데이터 불러오기

In [None]:
# Data directory (environment-specific absolute path; looks like a Colab Google Drive mount — adjust when running elsewhere).
# Must keep the trailing "/" because the file name is appended by plain string concatenation.
path = "/content/drive/MyDrive/team_project1/data/"

In [None]:
# Load the preprocessed regression dataset and take a first look at it
csv_file = path + "Regression_data_preprocessing.csv"
df = pd.read_csv(csv_file)
print(df.shape)
df.head()

(4177, 11)


Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0


### 2. 데이터 나누기
- train : test = 0.8 : 0.2
- train : test = 0.7 : 0.3 (대안 — 아래 코드에서는 0.8 : 0.2 분할만 사용합니다)

In [None]:
# Separate the regression target ('Rings') from the remaining feature columns
y = df['Rings']
X = df.drop(columns='Rings')

X.shape, y.shape

((4177, 10), (4177,))

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every metric reported below, is reproducible after a kernel
# restart (without it each run draws a different random split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape

((3341, 10), (836, 10))

### 3. 학습 관련 함수 정의
- 손실 계산 함수 -> 베이스 모델 함수와 동일한 지표
- 정확도 계산 함수 -> 베이스 모델 함수와 동일한 지표
- 예측한 결과와 원래 결과를 비교해서 시각화

In [None]:
# Loss metric
def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Thin wrapper that delegates to scikit-learn's ``mean_squared_error``.
    """
    loss = mean_squared_error(y_true, y_pred)
    return loss

In [None]:
# Accuracy metric
def eval_accuracy(y, y_hat):
    """Return 1 minus the mean absolute relative error of the predictions.

    Parameters
    ----------
    y : array-like
        True target values. Every value must be non-zero, because each error
        is divided by its target.
    y_hat : array-like
        Predicted values, in the same order and shape as ``y``.

    Returns
    -------
    float
        ``1 - mean(|y_hat - y| / y)``; 1.0 means a perfect prediction.
    """
    # Work on plain arrays so the arithmetic is positional. If both inputs
    # were pandas Series with different indexes, pandas would align them by
    # label and the result would silently become NaN.
    y_true = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_hat, dtype=float)

    # Relative error per sample, averaged so the result is a single number.
    mdiff = np.mean(np.abs((y_pred - y_true) / y_true))

    # Accuracy is one minus the mean relative error.
    return 1 - mdiff

In [None]:
# test_count = y_test.value_counts().sort_values(ascending=False)
# test_count.plot(kind='bar')

### 4. 기본 선형 모델

In [None]:
# Define the baseline model: a plain linear regression with default settings
lr = LinearRegression()

In [None]:
# Train the baseline model on the training split
lr.fit(X_train, y_train)

In [None]:
# Predict on the training split first, then on the held-out test split
y_train_pred, y_test_pred = lr.predict(X_train), lr.predict(X_test)

In [None]:
# Report MSE loss and accuracy for both splits.
# The test MSE was previously printed under the label "Test - Accuracy".
print(
    f"Train - Loss = {mse(y_train, y_train_pred):.3f}, "
    f"Accuracy = {eval_accuracy(y_train, y_train_pred):.3f} / "
    f"Test - Loss = {mse(y_test, y_test_pred):.3f}, "
    f"Accuracy = {eval_accuracy(y_test, y_test_pred):.3f}"
)

Train - Loss = 4.666, Accuracy = 0.843 / Test - Accuracy = 5.048, Accuracy = 0.841


### 5. 릿지 선형 회귀 모델

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Define the Ridge model; its hyperparameters are set by the grid search below
ridge_model = Ridge()

In [None]:
# Hyperparameter grid for Ridge: one fixed iteration cap, many alpha candidates
ridge_alphas = [0.1, 1, 2, 3, 4, 10, 30, 100, 200, 300, 400, 800, 900, 1000]
ridge_parmas = {'max_iter': [3000], 'alpha': ridge_alphas}

In [None]:
# 5-fold cross-validated grid search over the Ridge grid; training-fold scores
# are recorded as well.
gridsearch_ridge_model = GridSearchCV(
    ridge_model,
    ridge_parmas,
    cv=5,
    return_train_score=True,
)

In [None]:
# Run the Ridge grid search on the training split only
gridsearch_ridge_model.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination the Ridge grid search selected
print(f'최적 하이퍼파라미터 : {gridsearch_ridge_model.best_params_}')

최적 하이퍼파라미터 : {'alpha': 0.1, 'max_iter': 3000}


In [None]:
# Predict on both splits with the best Ridge estimator from the grid search
best_ridge = gridsearch_ridge_model.best_estimator_
y_train_pred = best_ridge.predict(X_train)
y_test_pred = best_ridge.predict(X_test)

In [None]:
# Report MSE loss and accuracy for both splits.
# The test MSE was previously printed under the label "Test - Accuracy".
print(
    f"Train - Loss = {mse(y_train, y_train_pred):.3f}, "
    f"Accuracy = {eval_accuracy(y_train, y_train_pred):.3f} / "
    f"Test - Loss = {mse(y_test, y_test_pred):.3f}, "
    f"Accuracy = {eval_accuracy(y_test, y_test_pred):.3f}"
)

Train - Loss = 4.668, Accuracy = 0.843 / Test - Accuracy = 5.058, Accuracy = 0.841


### 6. 라쏘 회귀 모델

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Define the Lasso model; its hyperparameters are set by the grid search below
lasso_model = Lasso()

In [None]:
# Hyperparameter grid for Lasso: the alpha candidates are the reciprocals of
# the values listed here (so they run from 10 down to 0.001).
inverse_alphas = np.array([0.1, 1, 2, 3, 4, 10, 30, 100, 200, 300, 400, 800, 900, 1000])
lasso_parmas = {'max_iter': [3000], 'alpha': 1 / inverse_alphas}

In [None]:
# 5-fold cross-validated grid search over the Lasso grid; training-fold scores
# are recorded as well.
gridsearch_lasso_model = GridSearchCV(
    lasso_model,
    lasso_parmas,
    cv=5,
    return_train_score=True,
)

In [None]:
# Run the Lasso grid search on the training split only
gridsearch_lasso_model.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination the Lasso grid search selected
print(f'최적 하이퍼파라미터 : {gridsearch_lasso_model.best_params_}')

최적 하이퍼파라미터 : {'alpha': 0.001, 'max_iter': 3000}


In [None]:
# Predict with the best Lasso estimator from the grid search.
# This cell previously reused `gridsearch_ridge_model`, so the metrics reported
# for Lasso were really the Ridge metrics.
y_train_pred = gridsearch_lasso_model.best_estimator_.predict(X_train)
y_test_pred = gridsearch_lasso_model.best_estimator_.predict(X_test)

In [None]:
# Report MSE loss and accuracy for both splits.
# The test MSE was previously printed under the label "Test - Accuracy".
print(
    f"Train - Loss = {mse(y_train, y_train_pred):.3f}, "
    f"Accuracy = {eval_accuracy(y_train, y_train_pred):.3f} / "
    f"Test - Loss = {mse(y_test, y_test_pred):.3f}, "
    f"Accuracy = {eval_accuracy(y_test, y_test_pred):.3f}"
)

Train - Loss = 4.668, Accuracy = 0.843 / Test - Accuracy = 5.058, Accuracy = 0.841


### 7. 랜덤 포레스트 회귀 모델

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Create the random forest regressor; random_state and n_estimators are
# supplied through the grid search below
rf_model = RandomForestRegressor()

In [None]:
# Grid search: fixed seed, three candidate forest sizes, 5-fold cross-validation
rf_tree_counts = [100, 120, 140]
rf_parmas = {'random_state': [42], 'n_estimators': rf_tree_counts}

gridsearch_rf_model = GridSearchCV(rf_model, rf_parmas, cv=5)

In [None]:
# Run the random forest grid search on the training split and show the selected setting
gridsearch_rf_model.fit(X_train, y_train)
print(f'최적 하이퍼파라미터 : {gridsearch_rf_model.best_params_}')

최적 하이퍼파라미터 : {'n_estimators': 140, 'random_state': 42}


In [None]:
# Predict with the best forest from the grid search, then report MSE loss and
# accuracy on both splits
best_rf = gridsearch_rf_model.best_estimator_
y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)

print(f"Train - Loss = {mse(y_train, y_train_pred):.3f}, Accuracy = {eval_accuracy(y_train, y_train_pred):.3f} / Test - Loss = {mse(y_test, y_test_pred):.3f}, Accuracy = {eval_accuracy(y_test, y_test_pred):.3f}")

Train - Loss = 0.642, Accuracy = 0.945 / Test - Loss = 4.920, Accuracy = 0.846
