In [264]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [265]:
df = pd.read_table('suumo_tokyo.csv',encoding='utf-16',engine='python')

In [266]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

# データ分割

In [267]:
tmpX = df.loc[:,['賃料','築年数','建物高さ','専有面積','階1','間取りDK','間取りK','間取りL','間取りS']]
tmpX = tmpX.dropna(how='any')

In [268]:
sc = StandardScaler()
sc.fit(tmpX.iloc[:,1:5])
sc_df = pd.DataFrame(sc.transform(tmpX.iloc[:,1:5]))

In [269]:
tmp = df.loc[:,['間取りDK','間取りK','間取りL','間取りS']]
tmp = tmp.reset_index(drop=True)
X = pd.concat([sc_df,tmp], axis=1)
X.columns = ['築年数','建物高さ','専有面積','階1','間取りDK','間取りK','間取りL','間取りS']
y = tmpX.iloc[:,[0]]

In [270]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.5)

ValueError: Found input variables with inconsistent numbers of samples: [13235, 12941]

---

# 正則化なしの線形回帰

# モデル作成

In [243]:
reg_lr = LinearRegression()

In [244]:
reg_lr.fit(X_train,y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [68]:
reg_lr.coef_

array([-1.19640066e+02,  4.61428830e+03,  7.63463771e+02, -1.42747726e+03,
        1.27074753e+03, -1.27074753e+03,  9.61771546e+03,  3.81793841e+05])

# MSE

- TrainデータでMSE、RMSE

In [69]:
train_pred = reg_lr.predict(X_train)
train_mse = mean_squared_error(y_true=y_train , y_pred=train_pred)
print(train_mse)

376911075.5524168


    RMSE

In [70]:
np.sqrt(train_mse)

19414.197782870575

- TestデータでMSE、RMSE

In [71]:
test_pred = reg_lr.predict(X_test)
test_mse = mean_squared_error(y_true=y_test , y_pred=test_pred)
print(test_mse)

9002101406.54041


    RMSE

In [72]:
np.sqrt(test_mse)

94879.40454355945

---

# L1正則化（Lasso）

# モデル作成

In [73]:
# alphaが正則化を強力にするパラメーター
#l1 = Lasso(alpha=1)
reg_l1 = Lasso(alpha=100)

In [74]:
reg_l1.fit(X_train,y_train)

Lasso(alpha=100, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [75]:
reg_l1.coef_

array([-1.29972406e+02,  4.66147061e+03,  8.00305658e+02, -7.79950082e+02,
        1.94184024e+03, -8.97189773e-12,  9.03837184e+03,  3.70684423e+05])

# MSE

- TrainデータでMSE、RMSE

In [76]:
train_pred = reg_l1.predict(X_train)
train_mse = mean_squared_error(y_true=y_train , y_pred=train_pred)
print(train_mse)

378196404.9337369


    RMSE

In [77]:
np.sqrt(train_mse)

19447.27242915409

- TestデータでMSE、RMSE

In [78]:
test_pred = reg_l1.predict(X_test)
test_mse = mean_squared_error(y_true=y_test , y_pred=test_pred)
print(test_mse)

8625094592.877106


    RMSE

In [79]:
np.sqrt(test_mse)

92871.3873745682

---

# L2正則化（Ridge）

# モデル作成

In [80]:
# alphaが正則化を強力にするパラメーター
#l1 = Ridge(alpha=1)
reg_l2 = Ridge(alpha=100)

In [81]:
reg_l2.fit(X_train,y_train)

Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [82]:
reg_l2.coef_

array([-333.88391713, 5778.79600189, 2055.17801966, 3250.41024114,
        311.96952159, -311.96952159,  487.70481366, 1560.79587803])

# MSE

- TrainデータでMSE、RMSE

In [83]:
train_pred = reg_l2.predict(X_train)
train_mse = mean_squared_error(y_true=y_train , y_pred=train_pred)
print(train_mse)

1848337735.2037718


    RMSE

In [84]:
np.sqrt(train_mse)

42992.298556878435

- TestデータでMSE、RMSE

In [85]:
test_pred = reg_l2.predict(X_test)
test_mse = mean_squared_error(y_true=y_test , y_pred=test_pred)
print(test_mse)

4361712722.214549


    RMSE

In [86]:
np.sqrt(test_mse)

66043.2640184792

---