# Regression 종합실습 : Car seat sales
유아용 카시트 판매량을 예측해 봅시다.

* 카시트에 대해서 지역 매장별 판매량(Sales, 단위 : 1000개)을 예측하고자 합니다.

![](https://cdn.images.express.co.uk/img/dynamic/24/590x/child-car-seat-986556.jpg?r=1532946857754)

## 1.환경준비

### (1) Import

In [317]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

### (2) Data Loading

In [318]:
data_path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
data = pd.read_csv(data_path)

**변수설명**
* Sales - 각 지역 판매량(단위 : 1000개) <== Target
* CompPrice - 각 지역 경쟁사 가격
* Income - 각 지역 평균 소득수준(단위 : 1000달러)
* Advertising - 각 지역, 회사의 광고 예산(단위 : 1000달러)
* Population - 지역 인구수(단위 : 1000명)
* Price - 자사 지역별 판매가격
* ShelveLoc - 진열상태
* Age - 지역 인구의 평균 연령
* Education - 각 지역 교육수준 레벨
* Urban - 매장 도시 지역 여부
* US - 매장이 미국에 있는지 여부

## 2.데이터 이해

* 둘러보기

In [319]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [320]:
data['Education'].value_counts()
# Education takes values from 10 to 18.
# Treated as a number, 18 would count as 1.8 times 10.
# If 18 should not be read as 1.8 times 10, the column has to be converted to dummy variables.

17    49
12    49
10    48
11    48
16    47
13    43
14    40
18    40
15    36
Name: Education, dtype: int64

## 3.데이터 준비

In [321]:
target = 'Sales'

### (1) 데이터 정리

### (2) 데이터분할1 : x, y 나누기

In [322]:
# Separate the target series from the feature frame (every column except the target).
y = data[target]
x = data.drop(columns=[target])

### (3) NA 조치
- 결측치가 없으므로 생략

In [323]:
x.isna().sum()

CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [324]:
y.isna().sum()

0

### (4) 가변수화

In [325]:
# Categorical features to expand into indicator columns; drop_first=True
# omits the first level of each feature from the result.
columns = ['ShelveLoc', 'Education', 'Urban', 'US']
x = pd.get_dummies(data=x, drop_first=True, columns=columns)

In [326]:
x.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,ShelveLoc_Good,ShelveLoc_Medium,Education_11,Education_12,Education_13,Education_14,Education_15,Education_16,Education_17,Education_18,Urban_Yes,US_Yes
0,138,73,11,276,120,42,0,0,0,0,0,0,0,0,1,0,1,1
1,111,48,16,260,83,65,1,0,0,0,0,0,0,0,0,0,1,1
2,113,35,10,269,80,59,0,1,0,1,0,0,0,0,0,0,1,1
3,117,100,4,466,97,55,0,1,0,0,0,1,0,0,0,0,1,1
4,141,64,3,340,128,38,0,0,0,0,1,0,0,0,0,0,1,0


In [327]:
x.rename(columns={'Urban_Yes' : 'Urban', 'US_Yes': 'US'}, inplace=True)

In [328]:
x.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,ShelveLoc_Good,ShelveLoc_Medium,Education_11,Education_12,Education_13,Education_14,Education_15,Education_16,Education_17,Education_18,Urban,US
0,138,73,11,276,120,42,0,0,0,0,0,0,0,0,1,0,1,1
1,111,48,16,260,83,65,1,0,0,0,0,0,0,0,0,0,1,1
2,113,35,10,269,80,59,0,1,0,1,0,0,0,0,0,0,1,1
3,117,100,4,466,97,55,0,1,0,0,0,1,0,0,0,0,1,1
4,141,64,3,340,128,38,0,0,0,0,1,0,0,0,0,0,1,0


### (5) 데이터분할2 : train : validation 나누기

In [329]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .3, random_state = 2022)

### (6) Scaling
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [330]:
from sklearn.preprocessing import *

In [331]:
x_trains = []; x_vals = []
Scalers = []

In [332]:
Scalers.append(StandardScaler())
Scalers.append(MinMaxScaler())
Scalers.append(MaxAbsScaler())
Scalers.append(RobustScaler())

In [333]:
# Fit every scaler on the training split only, then apply the fitted scaler
# to the validation split; results are stored in the same order as Scalers.
for Scaler in Scalers:
    x_trains.append(Scaler.fit_transform(x_train))
    x_vals.append(Scaler.transform(x_val))

In [334]:
# Stand-alone min-max scaling: fit on the training split, then transform
# both splits with the same fitted scaler.
mmScaler = MinMaxScaler().fit(x_train)
x_train_s1 = mmScaler.transform(x_train)
x_val_s1 = mmScaler.transform(x_val)

## 4.모델링 : 선형회귀

* 변수를 조절하며 최소 2개 이상의 모델을 생성하고 예측하고 평가해 봅시다.

In [335]:
from sklearn.linear_model import LinearRegression

* 모델1

In [336]:
# Model 1 uses a single predictor: keep only that column in both splits.
feature = ['Income']
x_train_1 = x_train.loc[:, feature]
x_val_1 = x_val.loc[:, feature]

In [337]:
model1 = LinearRegression()

In [338]:
model1.fit(x_train_1, y_train)

LinearRegression()

In [339]:
Linear_pred1 = model1.predict(x_val_1)

* 모델2

In [340]:
model2 = LinearRegression()

In [341]:
model2.fit(x_train, y_train)

LinearRegression()

In [342]:
Linear_pred2 = model2.predict(x_val)

## 5.모델링 : KNN

* 하이퍼파라미터를 조절하며 모델을 최소 3가지 이상 생성하시오.

In [343]:
from sklearn.neighbors import KNeighborsRegressor

* 모델3

In [344]:
model3_pred = []

In [345]:
model3 = KNeighborsRegressor()

In [346]:
# Refit the estimator on each scaled training set and keep the matching
# validation predictions. Iterating the paired lists directly avoids the
# hard-coded count of scalers.
for x_train_scaled, x_val_scaled in zip(x_trains, x_vals):
    model3.fit(x_train_scaled, y_train)
    model3_pred.append(model3.predict(x_val_scaled))

* 모델4

In [347]:
# Number of training rows, taken from the frame itself rather than from the
# non-null count of one particular column.
count = len(x_train)
k_count = int(count ** 0.5)
k_count  # integer part of the square root of the number of training rows

16

In [348]:
model4_pred = []

In [349]:
model4 = KNeighborsRegressor(n_neighbors = 10, metric = 'euclidean')

In [350]:
# Refit the estimator on each scaled training set and keep the matching
# validation predictions. Iterating the paired lists directly avoids the
# hard-coded count of scalers.
for x_train_scaled, x_val_scaled in zip(x_trains, x_vals):
    model4.fit(x_train_scaled, y_train)
    model4_pred.append(model4.predict(x_val_scaled))

* 모델5

In [351]:
model5_pred = []

In [352]:
model5 = KNeighborsRegressor(n_neighbors=k_count, metric='manhattan')

In [353]:
# Refit the estimator on each scaled training set and keep the matching
# validation predictions. Iterating the paired lists directly avoids the
# hard-coded count of scalers.
for x_train_scaled, x_val_scaled in zip(x_trains, x_vals):
    model5.fit(x_train_scaled, y_train)
    model5_pred.append(model5.predict(x_val_scaled))

* 모델6

In [354]:
model6_pred = []

In [355]:
model6 = KNeighborsRegressor(n_neighbors=k_count, metric='euclidean')

In [356]:
# Refit the estimator on each scaled training set and keep the matching
# validation predictions. Iterating the paired lists directly avoids the
# hard-coded count of scalers.
for x_train_scaled, x_val_scaled in zip(x_trains, x_vals):
    model6.fit(x_train_scaled, y_train)
    model6_pred.append(model6.predict(x_val_scaled))

In [357]:
# KNN with k = k_count and Euclidean distance, trained and evaluated on the
# splits scaled by the stand-alone MinMaxScaler (x_train_s1 / x_val_s1).
model7 = KNeighborsRegressor(metric='euclidean', n_neighbors=k_count).fit(x_train_s1, y_train)
pred7 = model7.predict(x_val_s1)

## 6.성능비교

In [358]:
from sklearn.metrics import *

In [359]:
def CheckPerform(y_val, pred):
    """Print regression metrics for predictions against the true targets.

    Prints, in order: R^2, RMSE, MAE, MAPE and ``1 - MAPE`` (the last one is
    labelled as accuracy). Nothing is returned.

    Args:
        y_val: True target values.
        pred: Predicted values, in the same order as ``y_val``.
    """
    # Take the root of the MSE here instead of passing ``squared=False``,
    # which newer scikit-learn releases deprecate and later remove. For a
    # single-target ``y_val`` the value is the same.
    rmse = mean_squared_error(y_val, pred) ** 0.5
    # MAPE feeds two of the printed lines, so compute it once.
    mape = mean_absolute_percentage_error(y_val, pred)

    print('r2 score : ', r2_score(y_val, pred))
    print('RMSE : ', rmse)
    print('MAE : ', mean_absolute_error(y_val, pred))
    print('MAPE : ', mape)
    print('정확도 : ', 1 - mape)

In [360]:
# Validation performance of the linear regression that uses Income only
CheckPerform(y_val, Linear_pred1)

r2 score :  -0.07518678258789113
RMSE :  3.0583485669634953
MAE :  2.539223474675414
MAPE :  0.8748266596253752
정확도 :  0.1251733403746248


In [361]:
# Validation performance of the linear regression fitted on all features
CheckPerform(y_val, Linear_pred2)

r2 score :  0.8752579913475843
RMSE :  1.0417209287542315
MAE :  0.8291702856537125
MAPE :  0.21735487590453131
정확도 :  0.7826451240954687


In [362]:
# Order: StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
# k: 5, distance: default
for pred in model3_pred:
    CheckPerform(y_val, pred)
    print('-'*30)

r2 score :  0.34480152121915786
RMSE :  2.387434487757378
MAE :  1.9351166666666666
MAPE :  0.675900855334496
정확도 :  0.32409914466550405
------------------------------
r2 score :  0.2813522045353446
RMSE :  2.5003630469727125
MAE :  2.066683333333333
MAPE :  0.7604663220505302
정확도 :  0.23953367794946978
------------------------------
r2 score :  0.29282299125494116
RMSE :  2.480327847953438
MAE :  2.0375166666666664
MAPE :  0.7447268025234451
정확도 :  0.25527319747655486
------------------------------
r2 score :  0.5008673152701503
RMSE :  2.0837855775807004
MAE :  1.6861666666666664
MAPE :  0.5657064717675502
정확도 :  0.4342935282324498
------------------------------


In [363]:
# Order: StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
# k: 10, distance: Euclidean
for pred in model4_pred:
    CheckPerform(y_val, pred)
    print('-'*30)

r2 score :  0.31429056614456186
RMSE :  2.442390405251926
MAE :  2.0291916666666667
MAPE :  0.7473790295749876
정확도 :  0.2526209704250124
------------------------------
r2 score :  0.31288349738327814
RMSE :  2.444895001153765
MAE :  2.0193166666666666
MAPE :  0.807536849826176
정확도 :  0.19246315017382398
------------------------------
r2 score :  0.3050802120871551
RMSE :  2.4587386163098617
MAE :  2.0330666666666666
MAPE :  0.8084809537479398
정확도 :  0.19151904625206018
------------------------------
r2 score :  0.4586950290529235
RMSE :  2.1700314629055497
MAE :  1.7522333333333333
MAPE :  0.5998864066233637
정확도 :  0.4001135933766363
------------------------------


In [364]:
# Order: StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
# k: square root of the row count (16), distance: Manhattan
for pred in model5_pred:
    CheckPerform(y_val, pred)
    print('-'*30)

r2 score :  0.3309380921861659
RMSE :  2.4125602904275607
MAE :  1.9912395833333334
MAPE :  0.756210177382736
정확도 :  0.243789822617264
------------------------------
r2 score :  0.33733254085187914
RMSE :  2.40100378856601
MAE :  2.003645833333333
MAPE :  0.7564736872717485
정확도 :  0.24352631272825154
------------------------------
r2 score :  0.33992516461541633
RMSE :  2.396302335435264
MAE :  1.9818854166666666
MAPE :  0.7887207482760339
정확도 :  0.2112792517239661
------------------------------
r2 score :  0.4256546868368546
RMSE :  2.2352781027457373
MAE :  1.8267187500000002
MAPE :  0.6653468363115836
정확도 :  0.33465316368841636
------------------------------


In [365]:
# Order: StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
# k: square root of the row count (16), distance: Euclidean
for pred in model6_pred:
    CheckPerform(y_val, pred)
    print('-'*30)

r2 score :  0.3117326901080849
RMSE :  2.4469415433693276
MAE :  2.017385416666667
MAPE :  0.767287267935679
정확도 :  0.23271273206432097
------------------------------
r2 score :  0.2967098362723467
RMSE :  2.473502164084189
MAE :  2.027135416666667
MAPE :  0.804689866972101
정확도 :  0.195310133027899
------------------------------
r2 score :  0.3076678828116689
RMSE :  2.4541565479771594
MAE :  1.9969895833333333
MAPE :  0.8092456213872345
정확도 :  0.19075437861276545
------------------------------
r2 score :  0.44528768442059774
RMSE :  2.196741360325941
MAE :  1.7775520833333334
MAPE :  0.636818240823579
정확도 :  0.36318175917642104
------------------------------


In [366]:
# Scaler: MinMaxScaler
# k: square root of the row count, distance: Euclidean
CheckPerform(y_val, pred7)

r2 score :  0.2967098362723467
RMSE :  2.473502164084189
MAE :  2.027135416666667
MAPE :  0.804689866972101
정확도 :  0.195310133027899
