In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import urllib.request as req
import time
import folium
import geopandas as gpd
import html5lib as html
# Render floats with five decimal places in pandas output.
pd.options.display.float_format = '{:.5f}'.format

In [3]:
# Fix Korean (Hangul) text rendering in matplotlib figures by selecting a
# Korean-capable font for the current operating system.
import platform

# font_manager: font lookup utilities
# rc: helper that updates matplotlib's runtime configuration
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

# Keep the minus sign renderable once the font is switched.
plt.rcParams['axes.unicode_minus'] = False

os_name = platform.system()
if os_name == 'Windows':
    # Malgun Gothic at its hard-coded install location.
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif os_name == 'Darwin':
    # macOS
    rc('font', family='AppleGothic')
else:
    # Any other platform: no font is configured, only a notice is printed.
    print("Unknown System")

### KNN

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate

# Load the table and keep an untouched copy in `df`; `data` becomes the
# model matrix.
df = pd.read_csv("../JejuData/jeju_data.csv")

df.reset_index(inplace=True,drop=True)
data=df.copy()

# One-hot encode the two categorical columns (district and business type)
# and append the indicator columns to `data`.
for column in ['읍면동명','업종명']:
    df_total2=pd.get_dummies(df[column],prefix=column)
    data=pd.concat([data,df_total2],axis=1)

# Drop every original column except the target, leaving target + dummies.
data.drop(columns=['년월', '시도명', '읍면동명', '업종명', '이용자수', '총 유동인구', '연도', '월', '권역', '분기'],inplace=True)

# Select features by name rather than by position: `iloc[:, 1:]` only works
# while the target happens to be the first column, and would silently leak
# the target into the features if the column order ever changed.
target_column = '이용금액'
train_input,test_input,train_target,test_target=train_test_split(
    data.drop(columns=[target_column]),
    data[[target_column]],
    random_state=42,
)

knr=KNeighborsRegressor(n_jobs=-1)
for n in [1,3,5,10]:
    # default: 5
    knr.n_neighbors = n
    knr.fit(train_input,train_target)
    # Score each split once; the scores are reused for printing and for the
    # comparison below instead of being recomputed.
    train_score = knr.score(train_input,train_target)
    test_score = knr.score(test_input,test_target)
    print("-"*20)
    print("K ==>",n)
    print("Train Score:",train_score)
    print("Test Score:",test_score)
    # Crude label: any train > test gap is reported as overfitting (과대적합),
    # otherwise underfitting (과소적합).
    if train_score > test_score:
        print('과대적합')
    else:
        print('과소적합')

--------------------
K ==> 1
Train Score: 0.9280247173236684
Test Score: 0.924394784517429
과대적합
--------------------
K ==> 3
Train Score: 0.9566957088661645
Test Score: 0.9501554345956461
과대적합
--------------------
K ==> 5
Train Score: 0.9575411689949189
Test Score: 0.950881211861843
과대적합
--------------------
K ==> 10
Train Score: 0.9631518131391507
Test Score: 0.9568463157291245
과대적합


In [11]:
# 3-fold cross-validation on the training split. `knr` still carries the
# last n_neighbors value assigned by the loop above (10).
scores = cross_validate(estimator=knr, X=train_input, y=train_target, cv=3, n_jobs=-1)
# Mean test score, then mean scoring time, across the folds.
for metric in ('test_score', 'score_time'):
    print(np.mean(scores[metric]))

0.962687694563335
0.9799889723459879


In [30]:
# Inspect the current contents of df.
df

Unnamed: 0,년월,시도명,읍면동명,업종명,이용자수,이용금액,총 유동인구,연도,월,권역,분기
0,2018-01,서귀포시,남원읍,관광 민예품 및 선물용품 소매업,147,5320500,15559582.61000,2018,1,서귀포시 동부권역,1/4분기
1,2018-01,서귀포시,남원읍,기타 외국식 음식점업,173,4998000,15559582.61000,2018,1,서귀포시 동부권역,1/4분기
2,2018-01,서귀포시,남원읍,비알콜 음료점업,432,5110800,15559582.61000,2018,1,서귀포시 동부권역,1/4분기
3,2018-01,서귀포시,남원읍,빵 및 과자류 소매업,1003,13434780,15559582.61000,2018,1,서귀포시 동부권역,1/4분기
4,2018-01,서귀포시,남원읍,서양식 음식점업,2660,48084140,15559582.61000,2018,1,서귀포시 동부권역,1/4분기
...,...,...,...,...,...,...,...,...,...,...,...
15581,2021-05,서귀포시,천지동,빵 및 과자류 소매업,7298,129770690,10227689.92300,2021,5,서귀포시 동권역,2/4분기
15582,2021-05,서귀포시,천지동,비알콜 음료점업,7527,83256320,10227689.92300,2021,5,서귀포시 동권역,2/4분기
15583,2021-05,서귀포시,천지동,기타 외국식 음식점업,965,33476100,10227689.92300,2021,5,서귀포시 동권역,2/4분기
15584,2021-05,서귀포시,천지동,관광 민예품 및 선물용품 소매업,1903,44802148,10227689.92300,2021,5,서귀포시 동권역,2/4분기


In [35]:
# Read the rainfall CSV and relabel its three columns as
# region (시도명), year-month (년월) and rainfall (강수량).
rain_csv = "../JejuData/rain/rain_jejudo.csv"
rain_columns = ['시도명','년월','강수량']
rain = pd.read_csv(rain_csv)
rain.columns = rain_columns

In [48]:
# Keep only the first 82 rows.
# NOTE(review): presumably 41 months (2018-01 to 2021-05) x 2 regions - confirm.
rain = rain.iloc[:82]

In [88]:
# Normalize the region labels in `rain` to the 시도명 values used by the sales
# table (제주시 / 서귀포시) so that the merge on ['시도명', '년월'] can match.
# Series.replace with a dict matches whole values only, so this cell is safe
# to re-run; the previous str.replace version would turn '제주시' into
# '제주시시' on a second run, which is likely why it was commented out.
# NOTE(review): assumes the raw CSV labels are '제주' / '서귀포' - confirm.
rain = rain.assign(**{'시도명': rain['시도명'].replace({'제주': '제주시', '서귀포': '서귀포시'})})

In [89]:
# Inspect the rainfall table.
rain

Unnamed: 0,시도명,년월,강수량
0,제주시,2018-01,58.00000
1,서귀포시,2018-01,63.70000
2,제주시,2018-02,86.60000
3,서귀포시,2018-02,10.50000
4,제주시,2018-03,118.00000
...,...,...,...
77,서귀포시,2021-03,156.30000
78,제주시,2021-04,58.80000
79,서귀포시,2021-04,182.00000
80,제주시,2021-05,100.70000


In [91]:
# Attach rainfall to every sales row by region and year-month. A left join
# keeps all rows of df; rows with no matching rainfall get NaN.
merge_keys = ['시도명','년월']
df_sales_rain = df.merge(rain, how='left', on=merge_keys)

In [93]:
# Disabled export of the merged sales + rainfall table.
# NOTE(review): a later cell reads ../JejuData/jeju_sales_rain.csv, so this
# write must have been run at least once; a fresh run needs it enabled.
# df_sales_rain.to_csv("../JejuData/jeju_sales_rain.csv",index=False)

In [97]:
# Row-level correlation matrix between total floating population and rainfall.
floating_population = df_sales_rain['총 유동인구']
rainfall = df_sales_rain['강수량']
np.corrcoef(floating_population, rainfall)

array([[ 1.        , -0.04066366],
       [-0.04066366,  1.        ]])

In [101]:
# Correlation between the monthly mean user count and the monthly mean rainfall.
# The two numeric columns are selected before aggregating: on pandas >= 2.0,
# groupby(...).mean() no longer drops the string columns (시도명, 업종명, ...)
# silently and raises TypeError instead. Grouping once also avoids computing
# the same aggregation twice.
monthly_mean = df_sales_rain.groupby('년월')[['이용자수','강수량']].mean()
np.corrcoef(monthly_mean['이용자수'], monthly_mean['강수량'])

array([[1.        , 0.29632289],
       [0.29632289, 1.        ]])

In [33]:
# Disabled export of df to jeju_rec_area_quarter.csv (kept for reference).
# df.to_csv("../JejuData/jeju_rec_area_quarter.csv",index=False)

In [29]:
# Summarize df: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15586 entries, 0 to 15585
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   년월      15586 non-null  object 
 1   시도명     15586 non-null  object 
 2   읍면동명    15586 non-null  object 
 3   업종명     15586 non-null  object 
 4   이용자수    15586 non-null  int64  
 5   이용금액    15586 non-null  int64  
 6   총 유동인구  15586 non-null  float64
 7   연도      15586 non-null  int64  
 8   월       15586 non-null  int64  
 9   권역      15586 non-null  object 
 10  분기      15586 non-null  object 
dtypes: float64(1), int64(4), object(6)
memory usage: 1.3+ MB


In [28]:
# List the distinct business-type (업종명) values present in df.
df['업종명'].unique()

array(['관광 민예품 및 선물용품 소매업', '기타 외국식 음식점업', '비알콜 음료점업', '빵 및 과자류 소매업',
       '서양식 음식점업', '스포츠 및 레크레이션 용품 임대업', '일식 음식점업', '중식 음식점업', '한식 음식점업',
       '기타 수상오락 서비스업'], dtype=object)

In [31]:
# Correlation matrix between amount spent (이용금액) and user count (이용자수).
amount_spent = df['이용금액']
user_count = df['이용자수']
np.corrcoef(amount_spent, user_count)

array([[1.        , 0.96113934],
       [0.96113934, 1.        ]])

---
### 강수량 추가

In [105]:
# Reload the merged sales + rainfall table from disk and rebind `df` to a
# version without the date columns.
jeju_sales_rain = pd.read_csv("../JejuData/jeju_sales_rain.csv")
date_columns = ['년월','연도','월']
df = jeju_sales_rain.drop(columns=date_columns)
df

Unnamed: 0,시도명,읍면동명,업종명,이용자수,이용금액,총 유동인구,권역,분기,강수량
0,서귀포시,남원읍,관광 민예품 및 선물용품 소매업,147,5320500,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
1,서귀포시,남원읍,기타 외국식 음식점업,173,4998000,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
2,서귀포시,남원읍,비알콜 음료점업,432,5110800,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
3,서귀포시,남원읍,빵 및 과자류 소매업,1003,13434780,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
4,서귀포시,남원읍,서양식 음식점업,2660,48084140,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
...,...,...,...,...,...,...,...,...,...
15581,서귀포시,천지동,빵 및 과자류 소매업,7298,129770690,10227689.92300,서귀포시 동권역,2/4분기,159.20000
15582,서귀포시,천지동,비알콜 음료점업,7527,83256320,10227689.92300,서귀포시 동권역,2/4분기,159.20000
15583,서귀포시,천지동,기타 외국식 음식점업,965,33476100,10227689.92300,서귀포시 동권역,2/4분기,159.20000
15584,서귀포시,천지동,관광 민예품 및 선물용품 소매업,1903,44802148,10227689.92300,서귀포시 동권역,2/4분기,159.20000


In [106]:
df.reset_index(inplace=True,drop=True)
data=df.copy()

# One-hot encode the two categorical columns (district and business type)
# and append the indicator columns to `data`.
for column in ['읍면동명','업종명']:
    df_total2=pd.get_dummies(df[column],prefix=column)
    data=pd.concat([data,df_total2],axis=1)

# Drop every original column except the target, leaving target + dummies.
# NOTE(review): this section is headed "강수량 추가" (add rainfall), yet 강수량
# is dropped here and never reaches the model - confirm whether that is
# intended. If it is added back, the features should be scaled, since KNN is
# distance-based.
data.drop(columns=['시도명', '읍면동명', '업종명', '이용금액', '총 유동인구', '권역', '분기', '강수량'],inplace=True)

# Select features by name rather than by position: `iloc[:, 1:]` only works
# while the target happens to be the first column, and would silently leak
# the target into the features if the column order ever changed.
target_column = '이용자수'
train_input,test_input,train_target,test_target=train_test_split(
    data.drop(columns=[target_column]),
    data[[target_column]],
    random_state=42,
)

knr=KNeighborsRegressor(n_jobs=-1)
for n in [1,3,5,10]:
    # default: 5
    knr.n_neighbors = n
    knr.fit(train_input,train_target)
    # Score each split once; the scores are reused for printing and for the
    # comparison below instead of being recomputed.
    train_score = knr.score(train_input,train_target)
    test_score = knr.score(test_input,test_target)
    print("-"*20)
    print("K ==>",n)
    print("Train Score:",train_score)
    print("Test Score:",test_score)
    # Crude label: any train > test gap is reported as overfitting (과대적합),
    # otherwise underfitting (과소적합).
    if train_score > test_score:
        print('과대적합')
    else:
        print('과소적합')

--------------------
K ==> 1
Train Score: 0.9384543487597684
Test Score: 0.9356336329651689
과대적합
--------------------
K ==> 3
Train Score: 0.9631624859677695
Test Score: 0.9606414993369266
과대적합
--------------------
K ==> 5
Train Score: 0.9649010676169724
Test Score: 0.9614536223813362
과대적합
--------------------
K ==> 10
Train Score: 0.9699479927206461
Test Score: 0.9673196300980944
과대적합


In [107]:
# 3-fold cross-validation on the training split. `knr` still carries the
# last n_neighbors value assigned by the loop above (10).
scores = cross_validate(estimator=knr, X=train_input, y=train_target, cv=3, n_jobs=-1)
# Mean test score, then mean scoring time, across the folds.
for metric in ('test_score', 'score_time'):
    print(np.mean(scores[metric]))

0.9694882353294664
1.0125380357106526


In [108]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest regressor. The seed is fixed so that bootstrap sampling and
# feature selection are reproducible across runs; 42 matches the
# random_state used for the train/test splits in this notebook.
rf = RandomForestRegressor(random_state=42)

In [109]:
# Inspect df (the 년월 / 연도 / 월 columns were dropped when it was rebound above).
df

Unnamed: 0,시도명,읍면동명,업종명,이용자수,이용금액,총 유동인구,권역,분기,강수량
0,서귀포시,남원읍,관광 민예품 및 선물용품 소매업,147,5320500,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
1,서귀포시,남원읍,기타 외국식 음식점업,173,4998000,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
2,서귀포시,남원읍,비알콜 음료점업,432,5110800,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
3,서귀포시,남원읍,빵 및 과자류 소매업,1003,13434780,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
4,서귀포시,남원읍,서양식 음식점업,2660,48084140,15559582.61000,서귀포시 동부권역,1/4분기,63.70000
...,...,...,...,...,...,...,...,...,...
15581,서귀포시,천지동,빵 및 과자류 소매업,7298,129770690,10227689.92300,서귀포시 동권역,2/4분기,159.20000
15582,서귀포시,천지동,비알콜 음료점업,7527,83256320,10227689.92300,서귀포시 동권역,2/4분기,159.20000
15583,서귀포시,천지동,기타 외국식 음식점업,965,33476100,10227689.92300,서귀포시 동권역,2/4분기,159.20000
15584,서귀포시,천지동,관광 민예품 및 선물용품 소매업,1903,44802148,10227689.92300,서귀포시 동권역,2/4분기,159.20000
