# 1. 환경준비

In [70]:
# 라이브러리
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as spst
import urllib.request 
import json 

# 한글폰트 지정(맑은고딕)
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

# 고해상도
%config InlineBackend.figure_format = 'retina'


# 2. 데이터 탐색

In [71]:
# Download the training data (it includes the `charges` target column).
path = 'https://bit.ly/InsuranceTrainFile'
data1 = pd.read_csv(filepath_or_buffer=path)

# Preview the last five rows.
data1.tail(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1233,50,male,30.97,3,no,northwest,10600.5483
1234,18,female,31.92,0,no,northeast,2205.9808
1235,18,female,36.85,0,no,southeast,1629.8335
1236,21,female,25.8,0,no,southwest,2007.945
1237,61,female,29.07,0,yes,northwest,29141.3603


In [72]:
# Download the evaluation data.
path = 'https://bit.ly/InsuranceTestFile'
data2 = pd.read_csv(filepath_or_buffer=path)

# Preview the last five rows -> there is no `charges` column here; it has to
# be predicted.
data2.tail(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region
95,28,female,37.62,1,no,southeast
96,54,female,30.8,3,no,southwest
97,55,male,38.28,0,no,southeast
98,56,male,19.95,0,yes,northeast
99,38,male,19.3,0,yes,southwest


# 3. 데이터 준비

In [73]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each column, which then acts as the baseline.
dumm_cols = ['sex', 'smoker', 'region']
data1 = pd.get_dummies(data=data1, columns=dumm_cols, drop_first=True)

# Preview the encoded frame.
data1.head(n=5)

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,41,31.6,0,6186.127,0,0,0,0,1
1,30,25.46,0,3645.0894,1,0,0,0,0
2,18,30.115,0,21344.8467,0,0,0,0,0
3,61,29.92,3,30942.1918,0,1,0,1,0
4,34,27.5,1,5003.853,0,0,0,0,1


In [74]:
# Separate the features (x) from the target column (y).
target = 'charges'
x = data1.drop(columns=[target])
y = data1[target]

# Preview the feature frame.
x.head(n=5)

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,41,31.6,0,0,0,0,0,1
1,30,25.46,0,1,0,0,0,0
2,18,30.115,0,0,0,0,0,0
3,61,29.92,3,0,1,0,1,0
4,34,27.5,1,0,0,0,0,1


In [75]:
# Hold out 20% of the rows as a validation set; the seed is fixed so the
# split is reproducible.
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(
    x, y,
    test_size=0.2,
    random_state=2022,
)

# Preview the last rows of the training features.
x_train.tail(n=5)

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
951,64,26.41,0,1,0,0,0,0
240,24,27.6,0,0,0,0,0,1
624,50,27.075,1,0,0,0,0,0
173,50,27.455,1,1,0,0,0,0
893,38,28.27,1,1,0,0,1,0


# 4. 모델링 

In [88]:
# 불러오기
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error ,r2_score

In [89]:
# Declare the baseline model: a decision-tree regressor with only the random
# seed set (pinned for reproducibility).
model_DT = DecisionTreeRegressor(random_state = 2022)

In [91]:
# Fit the baseline tree on the training split (x_train / y_train come from
# data1).
model_DT.fit(x_train, y_train)

In [92]:
# Predict charges for the held-out validation rows.
y_val_pred = model_DT.predict(x_val)

In [93]:
# Score the baseline tree on the validation split. Both metrics take the
# ground truth first and the prediction second; keywords make that explicit.
print('MAE:', mean_absolute_error(y_true=y_val, y_pred=y_val_pred))
print('R2:', r2_score(y_true=y_val, y_pred=y_val_pred))

MAE: 2815.9559071935487
R2: 0.7402200651887789


# 5. 일반화된 성능

In [94]:
# Estimate generalization performance with 10-fold cross-validation.
from sklearn.model_selection import cross_val_score

# No `scoring` argument is passed, so each fold is scored with the
# estimator's own default score.
cv_score = cross_val_score(estimator=model_DT, X=x_train, y=y_train, cv=10)
print(cv_score)
# Average of the ten fold scores; see the printed output for the value.
print('mean ', cv_score.mean())

[0.79377188 0.64221342 0.60534947 0.7125661  0.44144025 0.58561708
 0.79845772 0.62503319 0.72455466 0.62926463]
mean  0.6558268405235703


# 6. 성능 튜닝

In [104]:
# Tune the tree depth with an exhaustive grid search.
from sklearn.model_selection import GridSearchCV, cross_val_score

# Search space: max_depth from 1 through 49.
params = {'max_depth': range(1, 50)}

# Wrap the baseline tree; candidates are ranked by cross-validated R2.
model = GridSearchCV(
    estimator=model_DT,
    param_grid=params,
    scoring='r2',
    cv=5,  # written out explicitly; 5 is also the default
)

In [105]:
# Run the grid search on the training split: every max_depth candidate in
# `params` is evaluated with 5-fold cross-validation.
model.fit(x_train,y_train)


In [106]:
# Inspect the tuning result: the best max_depth found and its mean
# cross-validated score (scoring='r2').
print(model.best_params_)
print(model.best_score_)

{'max_depth': 4}
0.8288418138536089


In [108]:
# Check the tuned model on the held-out validation split and print its R2.
y_val_pred = model.predict(x_val)
print(r2_score(y_true=y_val, y_pred=y_val_pred))
0.8573316318890312


# 7. 최종 평가

In [82]:
# Preview the first rows of the evaluation data.
data2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [83]:
# One-hot encode the evaluation data so that it matches the training features.
dumm_cols = ['sex','smoker','region']

# The evaluation file is not guaranteed to contain every category seen in
# training (e.g. a region may be absent). Encoding it independently with
# drop_first=True would then drop a different baseline level and produce
# missing or misaligned columns. Instead, encode every level that is present
# and align to the training feature columns `x.columns`:
#   - dummies absent from this file are added as all-zero columns;
#   - columns the model was not trained on (each baseline level, or a category
#     never seen in training) are discarded, so such rows fall back to the
#     baseline.
data2 = pd.get_dummies(data2, columns = dumm_cols)
data2 = data2.reindex(columns = x.columns, fill_value = 0)

# Preview the encoded frame.
data2.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


In [84]:
# Predict charges for the evaluation set. x_test is just another name for the
# encoded data2 frame (no copy is made).
x_test = data2
y_pred = model.predict(x_test)

In [85]:
# Preview the first ten predictions.
y_pred[:10]

array([17382.41633111,  4165.34250886,  8127.86655566,  4289.39439462,
        4289.39439462,  4289.39439462, 10356.72521   ,  8127.86655566,
        8127.86655566, 13906.31553763])

In [111]:
# Build the submission table: reload the raw evaluation file and attach the
# tuned model's predictions as a `charges` column.
path = 'https://bit.ly/InsuranceTestFile'
final = pd.read_csv(path)

# Compute the predictions in this cell instead of assuming `y_pred_tuned`
# already exists. When the notebook is run top to bottom, nothing earlier
# defines that name, so reading it here raised NameError.
y_pred_tuned = model.predict(x_test)
final['charges'] = y_pred_tuned

# Show the resulting table.
final

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16441.877819
1,18,male,33.770,1,no,southeast,6036.364844
2,28,male,33.000,3,no,southeast,7713.831463
3,33,male,22.705,0,no,northwest,3388.496916
4,32,male,28.880,0,no,northwest,3388.496916
...,...,...,...,...,...,...,...
95,28,female,37.620,1,no,southeast,6036.364844
96,54,female,30.800,3,no,southwest,13008.483951
97,55,male,38.280,0,no,southeast,13008.483951
98,56,male,19.950,0,yes,northeast,22543.384700


In [112]:
# Save the submission table as an Excel workbook (relative path, so it lands
# in the current working directory).
final.to_excel('InsurancePred.xlsx')

In [110]:
# Predict charges for the evaluation set with the tuned (grid-searched) model
# and preview the first ten values.
y_pred_tuned=model.predict(x_test)
y_pred_tuned[:10]

array([16441.87781917,  6036.3648442 ,  7713.83146267,  3388.49691576,
        3388.49691576,  3388.49691576,  9334.14831629,  7713.83146267,
        7713.83146267, 14819.39120143])