### 선형 모델 및 Lasso, Ridge 모델 만들고 적용하기

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [2]:
import pandas as pd
train = pd.read_csv("train_df_errno.csv")
test = pd.read_csv("test_df.csv")
sub = pd.read_csv("sample_submission.csv")
age = pd.read_csv("age_gender_info.csv")
train.shape, test.shape, sub.shape, age.shape

((2896, 15), (1008, 14), (150, 2), (16, 23))

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2896 entries, 0 to 2895
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   단지코드        2896 non-null   object 
 1   총세대수        2896 non-null   int64  
 2   임대건물구분      2896 non-null   object 
 3   지역          2896 non-null   object 
 4   공급유형        2896 non-null   object 
 5   전용면적        2896 non-null   float64
 6   전용면적별세대수    2896 non-null   int64  
 7   공가수         2896 non-null   float64
 8   자격유형        2896 non-null   object 
 9   임대보증금       2327 non-null   object 
 10  임대료         2327 non-null   object 
 11  10분내지하철수    2685 non-null   float64
 12  10분내버스정류장수  2892 non-null   float64
 13  단지내주차면수     2896 non-null   float64
 14  등록차량수       2896 non-null   float64
dtypes: float64(6), int64(2), object(7)
memory usage: 339.5+ KB


In [4]:
train.columns, test.columns

(Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
        '자격유형', '임대보증금', '임대료', '10분내지하철수', '10분내버스정류장수', '단지내주차면수', '등록차량수'],
       dtype='object'),
 Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
        '자격유형', '임대보증금', '임대료', '10분내지하철수', '10분내버스정류장수', '단지내주차면수'],
       dtype='object'))

In [5]:
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,등록차량수
0,C2515,545,아파트,경상남도,국민임대,33.48,276,17.0,A,9216000,82940,0.0,3.0,624.0,205.0
1,C2515,545,아파트,경상남도,국민임대,39.6,60,17.0,A,12672000,107130,0.0,3.0,624.0,205.0
2,C2515,545,아파트,경상남도,국민임대,39.6,20,17.0,A,12672000,107130,0.0,3.0,624.0,205.0
3,C2515,545,아파트,경상남도,국민임대,46.9,38,17.0,A,18433000,149760,0.0,3.0,624.0,205.0
4,C2515,545,아파트,경상남도,국민임대,46.9,19,17.0,A,18433000,149760,0.0,3.0,624.0,205.0


### 단순 선형 회귀모델 만들기

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [16]:
#  처음에 단순선형 하면서 총세대수만 함
# Lasso 들어오면서 5개로 feature 늘림
sel= ['총세대수','전용면적','전용면적별세대수','공가수','단지내주차면수']
X = train[sel]
y = train['등록차량수']

X_train, X_test, y_train, y_test =train_test_split(X,y, random_state=0)
X_train.shape , X_test.shape, y_train.shape, y_test.shape

((2172, 5), (724, 5), (2172,), (724,))

### 모델만들기

In [8]:
model = LinearRegression()   # 모델 생성
model.fit(X_train, y_train)  # 모델 훈련
pred = model.predict(X_test) # 새로운 데이터로 예측

In [9]:
model.coef_, model.intercept_

(array([-0.15912372,  0.48962892,  0.0719852 , -6.98404745,  1.09596934]),
 100.64601235358572)

### 모델 평가하기

In [10]:
import numpy as np

In [11]:
#mae, mse, rmse
mae_val = np.sum( abs( y_test - pred) ) / len(pred)
mae_val

149.12933411602248

In [12]:
np.mean( abs(y_test-pred)  )

149.1293341160224

### MSE

In [13]:
mse_val= np.sum(   (y_test-pred)**2 ) /len(pred)
print(mse_val)
mse_val= np.mean( (y_test-pred)**2 )
print(mse_val)

42701.70133960169
42701.70133960169


### RMSE

In [14]:
rmse_val = mse_val **0.5
print(rmse_val)
rmse_val = np.sqrt(mse_val)
print(rmse_val)

206.64389983641348
206.64389983641348


### 피처수 늘리기

In [19]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

In [20]:
sel = ['총세대수', '전용면적', '전용면적별세대수', '공가수', '단지내주차면수']
X = train[sel]
y = train['등록차량수']
nor_X = MinMaxScaler().fit_transform(X)  # 입력 데이터 정규화
ex_X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(nor_X)  # 데이터 feature 추가 생성
X_train, X_test, y_train, y_test = train_test_split(ex_X, y, random_state=0)

In [21]:
X_train.shape

(2172, 20)

### LASSO 모델 적용하기

In [23]:
from sklearn.linear_model import Lasso,Ridge

In [27]:
model = Lasso(alpha=0.01)
model.fit(X_train,y_train)
pred = model.predict(X_test)
pred[0:10]

array([2034.15114024,  368.91598673,  824.17931433,  496.8976281 ,
        121.24109349,  742.33586207,  438.66678347,  291.25489882,
        627.84464153,  432.58897793])

In [26]:
model.coef_    #가중치 보기

array([  -10.1263555 ,  1190.20115145,  -187.42652318,   330.12348095,
        1604.45785474,  -533.83686613, -2076.54678837,  1329.63865233,
         293.55663998,  1077.94559216,  -452.95657097,  3604.89611386,
         439.68006605,  3542.38639166, -1267.2875676 ,   346.19657   ,
        -101.5081487 ,  -330.71549027, -1541.75256794,  -241.13484863])

### 평가하기(mae,mse,rmse)

In [28]:
mae_val = np.mean( abs( y_test - pred) )
print(mae_val)
mse_val = np.mean((y_test-pred)**2)
print(mse_val)
rmse_val =mse_val**0.5
print(rmse_val)

130.31797654776875
34266.26334355182
185.11148895612024


### Ridge모델 적용하기

In [29]:
model = Ridge(alpha=0.01)
model.fit(X_train,y_train)
pred = model.predict(X_test)
pred[0:10]

array([2017.26924206,  368.42107124,  821.85083007,  497.6633284 ,
        119.62607584,  743.81368402,  437.08916815,  290.93181935,
        632.97896544,  433.76306719])

In [30]:
mae_val = np.mean( abs( y_test - pred) )
print(mae_val)
mse_val = np.mean((y_test-pred)**2)
print(mse_val)
rmse_val =mse_val**0.5
print(rmse_val)

130.57735384908366
34387.41485979873
185.4384395420721


### Lasso 실험

In [35]:
model = Lasso(alpha=10)
model.fit(X_train,y_train)
pred = model.predict(X_test)
pred[0:10]

array([1407.60364822,  379.70850871,  666.46742453,  451.19282297,
        186.61869433,  751.09827934,  470.9126338 ,  297.54263025,
        614.7029211 ,  402.71495468])

In [36]:
model.coef_  

array([  -0.        ,    0.        ,    0.        ,   -0.        ,
       1466.66093048,   -0.        ,   -0.        ,    0.        ,
         -0.        ,    0.        ,   -0.        ,    0.        ,
         -0.        ,    0.        ,    0.        ,   -0.        ,
          0.        ,   -0.        ,   -0.        ,    0.        ])