### 선형모델 및 Lasso, Ridge 모델 만들고 적용

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import pandas as pd

In [4]:
train = pd.read_csv('../../../data/Parking_Demand/train.csv')
test = pd.read_csv('../../../data/Parking_Demand/test.csv')
age = pd.read_csv('../../../data/Parking_Demand/age_gender_info.csv')
sub = pd.read_csv('../../../data/Parking_Demand/sample_submission.csv')

train.shape, test.shape, age.shape, sub.shape

((2952, 15), (1022, 14), (16, 23), (150, 2))

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2952 entries, 0 to 2951
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   단지코드                          2952 non-null   object 
 1   총세대수                          2952 non-null   int64  
 2   임대건물구분                        2952 non-null   object 
 3   지역                            2952 non-null   object 
 4   공급유형                          2952 non-null   object 
 5   전용면적                          2952 non-null   float64
 6   전용면적별세대수                      2952 non-null   int64  
 7   공가수                           2952 non-null   float64
 8   자격유형                          2952 non-null   object 
 9   임대보증금                         2383 non-null   object 
 10  임대료                           2383 non-null   object 
 11  도보 10분거리 내 지하철역 수(환승노선 수 반영)  2741 non-null   float64
 12  도보 10분거리 내 버스정류장 수            2948 non-null   float64
 13  단지내

In [8]:
train.columns, test.columns

(Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
        '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
        '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수'],
       dtype='object'),
 Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
        '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
        '도보 10분거리 내 버스정류장 수', '단지내주차면수'],
       dtype='object'))

In [7]:
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
3,C2483,900,아파트,경상북도,국민임대,51.93,15,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
4,C2483,900,아파트,경상북도,국민임대,51.93,41,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0


In [9]:
train.isnull().sum()

단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              0
임대보증금                           569
임대료                             569
도보 10분거리 내 지하철역 수(환승노선 수 반영)    211
도보 10분거리 내 버스정류장 수                4
단지내주차면수                           0
등록차량수                             0
dtype: int64

### 단순선형 회귀 모델

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [24]:
sel = ['총세대수', '전용면적', '전용면적별세대수', '공가수', '단지내주차면수']
X = train[sel]
y = train['등록차량수']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

### 모델 만들기

In [25]:
model = LinearRegression()   # 모델생성
model.fit(X_train, y_train)   # 모델훈련
pred = model.predict(X_test)  # 모델예측
pred

array([ 841.0858482 , 1101.49086731,  633.91113121,  813.1714711 ,
        733.61252467,  111.23046994,  116.67275226,  110.55216359,
        504.98966932,  604.14249559,  123.07240254,  738.14160519,
        870.80885865,  997.11349432,  579.70843781,  252.57004095,
       1295.15072997,  771.38570368,  165.11912932,  621.35531526,
        172.72461412,  522.49982315,  620.75751906,  549.94467544,
        112.47860212,  545.7430355 ,  807.25007368, 1248.25470105,
        463.60606079,  632.20433722,  696.17817958, 1278.17938249,
        772.67094014,  398.0992706 ,  179.25301161,  573.49244635,
        807.86710061,  570.23074076,  745.22788793, 1511.34871647,
        327.57701936,  368.19887101,  272.01586692,  987.58537997,
        324.82729113,  820.22620834, 1224.86833191,  399.25316576,
        680.68475754,  475.1162951 ,  486.19236372,  531.12825931,
        453.77712976,  559.22432692,  505.62873533,  773.58968629,
        452.51485277,  108.06369207,  729.53504879,  696.62041

In [26]:
model.coef_, model.intercept_

(array([-0.13567538,  0.51203118,  0.03799471, -6.26052196,  1.07038424]),
 88.98213601823062)

### 모델 평가하기

In [27]:
import numpy as np

In [28]:
# MAE, MSE, RMSE
mae_val = np.sum(abs(y_test - pred)) / len(pred) # 실제값(y_test) - 예측값(pred)
mae_val

148.90150162608143

In [29]:
np.mean(abs(y_test - pred))

148.90150162608134

### MSE

In [30]:
mse_val = np.sum((y_test - pred)**2) / len(pred)
print(mse_val)
mse_val = np.mean((y_test - pred)**2)
print(mse_val)

44421.44095785564
44421.44095785563


### RMSE

In [31]:
rmse_val = mse_val ** 0.5
print(rmse_val)

rmse_val = np.sqrt(mse_val)
print(rmse_val)

210.76394605780095
210.76394605780095


### Feature 늘리기

In [33]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

In [34]:
sel = ['총세대수', '전용면적', '전용면적별세대수', '공가수', '단지내주차면수']
X = train[sel]
y = train['등록차량수']

nor_X = MinMaxScaler().fit_transform(X)  # 입력 데이터 정규화
ex_X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(nor_X)  # 데이터 feature 추가 생성

X_train, X_test, y_train, y_test = train_test_split(ex_X, y, random_state=0)

In [35]:
X_train.shape # 늘어난 feature 확인

(2214, 20)

### LASSO 모델, Ridge 모델 적용

In [38]:
from sklearn.linear_model import Lasso, Ridge

In [81]:
model = Lasso(alpha=0.5)
model.fit(X_train, y_train)
pred = model.predict(X_test)
pred

array([ 858.58153629, 1110.3517762 ,  655.89775887,  779.60176139,
        738.65515619,  126.0532492 ,  131.49587514,   78.454714  ,
        515.48554727,  604.49735481,  137.77183723,  602.07521879,
        874.37597164, 1034.73433271,  582.05504433,  258.41643695,
       1193.72598899,  764.81959821,  128.40772834,  634.53348131,
        145.59888523,  540.82520287,  615.43568976,  572.28830394,
         34.07824302,  509.93231211,  752.02877101, 1274.29431229,
        452.66363951,  654.53073846,  703.42922666, 1335.16758859,
        778.3856594 ,  420.22327324,  149.20231774,  542.31168699,
        768.69661666,  576.82370818,  605.87573096, 1644.86975775,
        286.09116211,  369.73128181,  241.78299919, 1030.26960923,
        316.57737952,  795.97787591, 1252.98516099,  344.80422664,
        665.60594057,  438.70056476,  477.61752707,  537.66216777,
        451.6180334 ,  502.5543385 ,  502.10130593,  787.27145048,
        478.32783367,   77.08117031,  740.04122443,  690.17522

In [82]:
model.coef_

array([ -125.53400969,   161.31507536,    56.31023173,   213.86127178,
        1850.16279188,  -315.40173431,    -0.        ,     0.        ,
           0.        ,   429.93736159,    -0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,  -179.76418831, -1149.09270961,   109.6771883 ])

### 평가하기(MAE, MSE, RMSE)

In [83]:
# MAE
mae_val = np.mean(abs(y_test - pred)) # 실제값(y_test) - 예측값(pred)
mae_val

138.5498565028435

In [84]:
# MSE
mse_val = np.mean((y_test - pred)**2)
print(mse_val)

40528.59206086982


In [85]:
# RMSE
rmse_val = np.sqrt(mse_val)
print(rmse_val)

201.31714298804715


### Ridge

In [86]:
model = Ridge(alpha=0.5)
model.fit(X_train, y_train)
pred = model.predict(X_test)
pred

array([ 830.52649844, 1160.28681726,  655.60935929,  797.63820861,
        736.19468302,  118.77480638,  138.81825327,   66.36976689,
        551.0340421 ,  632.50131825,  149.37946729,  518.49539531,
        879.16382375, 1030.70543037,  547.67898523,  277.84259868,
       1252.92612701,  778.3994118 ,  151.62612814,  624.17212673,
        110.97345188,  548.67307884,  619.76563886,  566.83321112,
         35.65937489,  440.48541493,  758.03401969, 1277.82260495,
        474.63822522,  649.00625261,  691.91768598, 1335.20329215,
        784.79242392,  404.41916027,  130.86674238,  522.91406392,
        782.57812088,  512.99693472,  560.65562161, 1650.87871009,
        287.34919361,  379.44123808,  218.18747604,  985.12230094,
        289.41399754,  861.16609259, 1274.66071803,  318.34496441,
        656.36892092,  441.22618318,  484.96298222,  510.0643236 ,
        449.78514108,  482.49429409,  510.51539661,  792.22632936,
        484.52119817,   59.18771047,  724.2869338 ,  632.57720

In [87]:
model.coef_

array([  -94.02904438,   858.46165232,   111.39741667,   358.37817244,
        1586.04977877,  -419.79309832,  -185.23427377,   148.33855264,
         199.64732818,   764.36302532,  -845.74680407,   207.51207797,
         534.48327261,   986.91364666,  -264.01261219,   104.5545755 ,
         156.9148051 ,  -417.67688369, -1310.3758256 ,   104.872246  ])

In [88]:
# MAE
mae_val = np.mean(abs(y_test - pred)) # 실제값(y_test) - 예측값(pred)
mae_val

138.5141293794775

In [89]:
# MSE
mse_val = np.mean((y_test - pred)**2)
print(mse_val)

39858.71112542404


In [90]:
# RMSE
rmse_val = np.sqrt(mse_val)
print(rmse_val)

199.64646534668236
