In [68]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

### 한글 폰트 설정
import matplotlib
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform

path = "C:/Windows/Fonts/malgun.ttf"
if platform.system() == "Windows":
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system()=="Darwin":
    rc('font', family='AppleGothic')
else:
    print("Unknown System")
    
matplotlib.rcParams['axes.unicode_minus'] = False

%matplotlib inline

In [85]:
train = pd.read_csv('../../../data/Parking_Demand/train.csv')
test = pd.read_csv('../../../data/Parking_Demand/test.csv')
age = pd.read_csv('../../../data/Parking_Demand/age_gender_info.csv')
sub = pd.read_csv('../../../data/Parking_Demand/sample_submission.csv')

train.shape, test.shape, age.shape, sub.shape

((2952, 15), (1022, 14), (16, 23), (150, 2))

In [86]:
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
       '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수'],
      dtype='object')

In [87]:
train.columns = ['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '10분내지하철수',
       '10분내버스정류장수', '단지내주차면수', '등록차량수']
test.columns = ['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '10분내지하철수',
       '10분내버스정류장수', '단지내주차면수']

In [88]:
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '10분내지하철수', '10분내버스정류장수', '단지내주차면수', '등록차량수'],
      dtype='object')

In [89]:
train.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            0
임대보증금         569
임대료           569
10분내지하철수      211
10분내버스정류장수      4
단지내주차면수         0
등록차량수           0
dtype: int64

In [90]:
test.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            2
임대보증금         180
임대료           180
10분내지하철수       42
10분내버스정류장수      0
단지내주차면수         0
dtype: int64

In [91]:
### 자격유형 isnull 확인
test.loc[test['자격유형'].isnull()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
196,C2411,962,아파트,경상남도,국민임대,46.9,240,25.0,,71950000,37470,0.0,2.0,840.0
258,C2253,1161,아파트,강원도,영구임대,26.37,745,0.0,,2249000,44770,0.0,2.0,173.0


In [92]:
grouped = test.groupby(['단지코드', '임대건물구분', '지역','공급유형'])
group1 = grouped.get_group( ('C2411', '아파트', '경상남도', '국민임대')  )
group1

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
193,C2411,962,아파트,경상남도,국민임대,39.43,56,25.0,A,11992000,100720,0.0,2.0,840.0
194,C2411,962,아파트,경상남도,국민임대,39.72,336,25.0,A,11992000,100720,0.0,2.0,840.0
195,C2411,962,아파트,경상남도,국민임대,39.82,179,25.0,A,11992000,100720,0.0,2.0,840.0
196,C2411,962,아파트,경상남도,국민임대,46.9,240,25.0,,71950000,37470,0.0,2.0,840.0
197,C2411,962,아파트,경상남도,국민임대,51.93,150,25.0,A,21586000,171480,0.0,2.0,840.0


In [93]:
test.loc[196,'자격유형'] = 'A'

In [94]:
group2 = grouped.get_group( ('C2253', '아파트', '강원도', '영구임대')  )
group2

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
258,C2253,1161,아파트,강원도,영구임대,26.37,745,0.0,,2249000,44770,0.0,2.0,173.0
259,C2253,1161,아파트,강원도,영구임대,31.32,239,0.0,C,3731000,83020,0.0,2.0,173.0
260,C2253,1161,아파트,강원도,영구임대,31.32,149,0.0,C,3731000,83020,0.0,2.0,173.0


In [95]:
test.loc[258,'자격유형'] = 'C'

In [96]:
print(train.자격유형.unique())
print(test.자격유형.unique())

['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O']
['H' 'A' 'E' 'C' 'D' 'G' 'I' 'J' 'K' 'L' 'M' 'N']


In [97]:
### 라벨인코딩
mapping = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 
           'F':6, 'G':7, 'H':8, 'I':9, 'J':10, 
           'K':11, 'L':12, 'M':13, 'N':14, 'O':15}
train['자격유형'] = train['자격유형'].map(mapping).astype(int)
test['자격유형'] = test['자격유형'].map(mapping).astype(int)

train.head(3), test.head(3)

(    단지코드  총세대수 임대건물구분    지역  공급유형   전용면적  전용면적별세대수   공가수  자격유형     임대보증금  \
 0  C2483   900    아파트  경상북도  국민임대  39.72       134  38.0     1  15667000   
 1  C2483   900    아파트  경상북도  국민임대  39.72        15  38.0     1  15667000   
 2  C2483   900    아파트  경상북도  국민임대  51.93       385  38.0     1  27304000   
 
       임대료  10분내지하철수  10분내버스정류장수  단지내주차면수   등록차량수  
 0  103680       0.0         3.0   1425.0  1015.0  
 1  103680       0.0         3.0   1425.0  1015.0  
 2  184330       0.0         3.0   1425.0  1015.0  ,
     단지코드  총세대수 임대건물구분   지역  공급유형   전용면적  전용면적별세대수   공가수  자격유형     임대보증금  \
 0  C1072   754    아파트  경기도  국민임대  39.79       116  14.0     8  22830000   
 1  C1072   754    아파트  경기도  국민임대  46.81        30  14.0     1  36048000   
 2  C1072   754    아파트  경기도  국민임대  46.90       112  14.0     8  36048000   
 
       임대료  10분내지하철수  10분내버스정류장수  단지내주차면수  
 0  189840       0.0         2.0    683.0  
 1  249930       0.0         2.0    683.0  
 2  249930       0.0         2.0    683.0  

In [98]:
test.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,8,22830000,189840,0.0,2.0,683.0
1,C1072,754,아파트,경기도,국민임대,46.81,30,14.0,1,36048000,249930,0.0,2.0,683.0
2,C1072,754,아파트,경기도,국민임대,46.9,112,14.0,8,36048000,249930,0.0,2.0,683.0
3,C1072,754,아파트,경기도,국민임대,46.9,120,14.0,8,36048000,249930,0.0,2.0,683.0
4,C1072,754,아파트,경기도,국민임대,51.46,60,14.0,8,43497000,296780,0.0,2.0,683.0


In [99]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2952 entries, 0 to 2951
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   단지코드        2952 non-null   object 
 1   총세대수        2952 non-null   int64  
 2   임대건물구분      2952 non-null   object 
 3   지역          2952 non-null   object 
 4   공급유형        2952 non-null   object 
 5   전용면적        2952 non-null   float64
 6   전용면적별세대수    2952 non-null   int64  
 7   공가수         2952 non-null   float64
 8   자격유형        2952 non-null   int32  
 9   임대보증금       2383 non-null   object 
 10  임대료         2383 non-null   object 
 11  10분내지하철수    2741 non-null   float64
 12  10분내버스정류장수  2948 non-null   float64
 13  단지내주차면수     2952 non-null   float64
 14  등록차량수       2952 non-null   float64
dtypes: float64(6), int32(1), int64(2), object(6)
memory usage: 334.5+ KB


In [100]:
### 데이터 나누기
from sklearn.model_selection import train_test_split

sel = ['총세대수', '자격유형', '전용면적별세대수']
X = train[sel]
y = train['등록차량수']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 3)

In [101]:
### 데이터 학습
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X_train, y_train)   # 학습
pred = model.predict(X_test)
pred

array([ 440.11634689,  365.14306878,  497.05008351,  679.71799083,
        757.86739573,  989.07012489,  378.40084159,  888.5511174 ,
        472.63620684,  496.92500993,  527.02276876, 1111.48059153,
        699.91298151,  482.57478607,  645.71917825,  751.64422844,
        451.92237289,  635.54884954,  627.58896149,  906.71140723,
        715.58634599,  463.29119714,  569.99235458,  917.89328648,
        571.13258697,  603.07695617,  496.54464091,  493.7898425 ,
        511.89054195,  714.50827989,  388.63511347,  440.26138221,
        718.00526736,  905.57544847,  610.42140242,  380.74862092,
        469.35666946,  272.30686718,  613.63404736,  786.7600223 ,
        633.43486387,  439.58369847,   82.93360107,  382.69861784,
        414.46753219,  577.81899232,  545.63283797,  612.19098145,
        387.69868372,  678.65789389,  446.5534041 ,  263.82747048,
        647.84550468,  398.65385218,  773.84880002,  473.83879076,
        747.38438309,  167.68022447,  626.42226429,  802.60860

In [102]:
dict_dat = {"실제값":y_test, "예측값":pred, "오차":y_test - pred}
dat = pd.DataFrame(dict_dat )
dat

Unnamed: 0,실제값,예측값,오차
250,108.0,440.116347,-332.116347
781,93.0,365.143069,-272.143069
692,359.0,497.050084,-138.050084
21,1064.0,679.717991,384.282009
2428,780.0,757.867396,22.132604
...,...,...,...
378,127.0,534.116934,-407.116934
870,548.0,684.888178,-136.888178
112,62.0,392.974058,-330.974058
1424,657.0,633.486120,23.513880


In [103]:
dat['오차절대값'] = abs(dat['오차'])
dat['오차제곱'] = dat['오차'] ** (2)
dat

Unnamed: 0,실제값,예측값,오차,오차절대값,오차제곱
250,108.0,440.116347,-332.116347,332.116347,110301.267871
781,93.0,365.143069,-272.143069,272.143069,74061.849887
692,359.0,497.050084,-138.050084,138.050084,19057.825556
21,1064.0,679.717991,384.282009,384.282009,147672.662570
2428,780.0,757.867396,22.132604,22.132604,489.852172
...,...,...,...,...,...
378,127.0,534.116934,-407.116934,407.116934,165744.198123
870,548.0,684.888178,-136.888178,136.888178,18738.373279
112,62.0,392.974058,-330.974058,330.974058,109543.827323
1424,657.0,633.486120,23.513880,23.513880,552.902561


In [104]:
### MAE
mae_val = dat['오차절대값'].sum()/dat.shape[0]
mae_val

281.25859586880335

In [105]:
### MSE
mse_val = dat['오차제곱'].sum()/dat.shape[0]
mse_val

141549.23015325118

In [106]:
### RMSE
rmse_val = mse_val ** 0.5
rmse_val

376.23028872387613

In [107]:
from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error(abs(y_test), abs(pred)))

0.806367541772341

In [108]:
# R^2의 값을 구하기- 결정계수 구하기
print("훈련 데이터 세트 점수 : {:.2f}".format(model.score(X_train, y_train)))
print("테스트 데이터 세트 점수 : {:.2f}".format(model.score(X_test, y_test)))

훈련 데이터 세트 점수 : 0.17
테스트 데이터 세트 점수 : 0.22


In [109]:
for i in range(1, 10, 1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(i/10), random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    pred[:5]
    
    mae = np.abs(y_test - pred).sum() / len(pred)
    mse = ((y_test - pred)**2).sum()/len(pred)
    rmse = (((y_test - pred)**2).sum()/len(pred))**0.5
    
    print("test_size : ",(i/10))
    print("MAE : {:.3f}".format(mae))
    print("MSE : {:.3f}".format(mse))
    print("RMSE : {:.3f}".format(rmse))
    print("")

test_size :  0.1
MAE : 276.831
MSE : 143942.376
RMSE : 379.397

test_size :  0.2
MAE : 287.799
MSE : 150160.884
RMSE : 387.506

test_size :  0.3
MAE : 287.457
MSE : 153944.636
RMSE : 392.358

test_size :  0.4
MAE : 287.850
MSE : 156616.034
RMSE : 395.747

test_size :  0.5
MAE : 286.886
MSE : 156244.768
RMSE : 395.278

test_size :  0.6
MAE : 284.607
MSE : 152625.235
RMSE : 390.673

test_size :  0.7
MAE : 283.592
MSE : 152192.148
RMSE : 390.118

test_size :  0.8
MAE : 286.082
MSE : 154891.521
RMSE : 393.563

test_size :  0.9
MAE : 286.719
MSE : 155721.977
RMSE : 394.616

