In [74]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [75]:
df = pd.read_table('suumo_tokyo.csv',encoding='utf-16',engine='python')

In [76]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

# データ分割

In [77]:
tmpX = df.loc[:,['賃料','築年数','建物高さ','専有面積','階1','間取りDK','間取りK','間取りL','間取りS','住所']]
tmpX = tmpX.dropna(how='any')
tmpX = tmpX.reset_index(drop=True)

In [78]:
tmpX = tmpX.replace('(.*)足立(.*)','足立',regex=True)
tmpX = tmpX.replace('(.*)千代田(.*)','千代田',regex=True)
tmpX = tmpX.replace('(.*)中央(.*)','中央',regex=True)

In [79]:
le = []
enc_columns = np.array([])
for i in np.arange(0,1):
    le = np.append(le,LabelEncoder())
    le[i].fit(tmpX['住所','徒歩1'])
    enc_columns = np.append(enc_columns,le[i].classes_)
    tmpX['住所'] = le[i].transform(tmpX['住所'])

In [89]:
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(tmpX.loc[:,['住所']])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [91]:
enc_data = one_hot_encoder.transform(tmpX.loc[:,['住所']]).toarray() # numpyマトリックス型で返されるデータをnumpy.array型に変換
enc_df = pd.DataFrame(enc_data)
enc_df.head()
enc_df.columns = enc_columns
tmpX = pd.concat([tmpX.iloc[:,0:-1],enc_df,tmpX['住所']],axis=1)

データの標準化

In [97]:
sc = StandardScaler()
sc.fit(tmpX.iloc[:,1:5])
sc_df = pd.DataFrame(sc.transform(tmpX.iloc[:,1:5]))

In [100]:
tmp = tmpX.loc[:,['間取りDK','間取りK','間取りL','間取りS']]
tmp = tmp.reset_index(drop=True)
X = pd.concat([sc_df,tmp,enc_df], axis=1)
X.columns = ['築年数','建物高さ','専有面積','階1','間取りDK','間取りK','間取りL','間取りS','中央','千代田','足立']
y = tmpX.iloc[:,[0]]

In [102]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.5)

---

# 正則化なしの線形回帰

# モデル作成

In [103]:
reg_lr = LinearRegression()

In [104]:
reg_lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [105]:
reg_lr.coef_

array([[ -1968.46787382,   4757.81718817,  57811.90258628,
         13900.03032299, -39333.57977975,  -6196.65791826,
          8853.31453451,   5544.12394029,   4924.99346382,
         28580.97983212, -33505.97329594]])

# MSE

- TrainデータでMSE、RMSE

In [106]:
train_pred = reg_lr.predict(X_train)
train_mse = mean_squared_error(y_true=y_train , y_pred=train_pred)
print(train_mse)

1561549080.949987


    RMSE

In [107]:
np.sqrt(train_mse)

39516.440641206376

- TestデータでMSE、RMSE

In [108]:
test_pred = reg_lr.predict(X_test)
test_mse = mean_squared_error(y_true=y_test , y_pred=test_pred)
print(test_mse)

1590621925.7705007


    RMSE

In [109]:
np.sqrt(test_mse)

39882.60179289336

---

# L1正則化（Lasso）

# モデル作成

In [110]:
# alphaが正則化を強力にするパラメーター
#l1 = Lasso(alpha=1)
reg_l1 = Lasso(alpha=100)

In [111]:
reg_l1.fit(X_train,y_train)

Lasso(alpha=100, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [112]:
reg_l1.coef_

array([ -1883.36466239,   4829.75951287,  57621.39142591,  13987.56426878,
       -36164.1899272 ,  -4203.47062173,   7570.24912865,      0.        ,
            0.        ,  22992.10030103, -38483.3309775 ])

# MSE

- TrainデータでMSE、RMSE

In [113]:
train_pred = reg_l1.predict(X_train)
train_mse = mean_squared_error(y_true=y_train , y_pred=train_pred)
print(train_mse)

1562747930.265793


    RMSE

In [114]:
np.sqrt(train_mse)

39531.606725072445

- TestデータでMSE、RMSE

In [115]:
test_pred = reg_l1.predict(X_test)
test_mse = mean_squared_error(y_true=y_test , y_pred=test_pred)
print(test_mse)

1591329982.7063944


    RMSE

In [116]:
np.sqrt(test_mse)

39891.47756985688

---

# L2正則化（Ridge）

# モデル作成

In [80]:
# alphaが正則化を強力にするパラメーター
#l1 = Ridge(alpha=1)
reg_l2 = Ridge(alpha=100)

In [81]:
reg_l2.fit(X_train,y_train)

Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [82]:
reg_l2.coef_

array([-333.88391713, 5778.79600189, 2055.17801966, 3250.41024114,
        311.96952159, -311.96952159,  487.70481366, 1560.79587803])

# MSE

- TrainデータでMSE、RMSE

In [83]:
train_pred = reg_l2.predict(X_train)
train_mse = mean_squared_error(y_true=y_train , y_pred=train_pred)
print(train_mse)

1848337735.2037718


    RMSE

In [84]:
np.sqrt(train_mse)

42992.298556878435

- TestデータでMSE、RMSE

In [85]:
test_pred = reg_l2.predict(X_test)
test_mse = mean_squared_error(y_true=y_test , y_pred=test_pred)
print(test_mse)

4361712722.214549


    RMSE

In [86]:
np.sqrt(test_mse)

66043.2640184792

---