In [1]:
import pandas as pd
import numpy as np

In [2]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

In [3]:
x = pd.DataFrame(data, 
                columns=["CRIM", "ZN", "INDUS", "CHAS", "NOX",
                       "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIDO",
                       "B", "LSTAT"])
y = pd.DataFrame(target, columns=["PRICE"])

In [4]:
# linear regression, ridge, lasso, elasticnet 을 가지고
# 학습시킨 후, 4개의 모델을 비교하세요.
# 비교 내역 : score, coef_, intercept_

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

In [37]:
model_list = [LinearRegression(), Ridge(), Lasso(), ElasticNet()]

In [47]:
names = list()
scores = list()
weights = list()
bias = list()

In [48]:
for model in model_list:
    model.fit(x_train, y_train)
    
    names.append(model.__class__.__name__)
    score.append(model.score(x_test, y_test))
    weights.append(model.coef_)
    bias.append(model.intercept_)

In [49]:
models = pd.DataFrame(scores, index=names, columns=["score"])
model

In [50]:
print(len(weights))
print(weights[0])
print(weights[1])
print(weights[2])
print(weights[3])
# 결과에 [[]], [] 두 개가 있다. 
# 얘네 둘을 하나로 바꿔줘야지 데이터프레임 안에 나타날 것 

4
[[-9.85424717e-02  6.07841138e-02  5.91715401e-02  2.43955988e+00
  -2.14699650e+01  2.79581385e+00  3.57459778e-03 -1.51627218e+00
   3.07541745e-01 -1.12800166e-02 -1.00546640e+00  6.45018446e-03
  -5.68834539e-01]]
[[-8.99352520e-02  6.20345865e-02  1.21404325e-02  2.23426149e+00
  -1.12838152e+01  2.89618901e+00 -4.81458007e-03 -1.36998976e+00
   2.83653073e-01 -1.22828776e-02 -8.84229846e-01  7.09753443e-03
  -5.80033848e-01]]
[-0.05256765  0.05904289 -0.          0.         -0.          0.
  0.01964989 -0.5767539   0.23300934 -0.01230686 -0.72109227  0.00600289
 -0.79711475]
[-0.06892062  0.06229229 -0.          0.         -0.          0.60621493
  0.01506499 -0.69022698  0.26553588 -0.01387045 -0.71551352  0.00639852
 -0.75468341]


In [51]:
weights[0] = weights[0].flatten()
weights[1] = weights[1].flatten()

In [56]:
weight_list = list()
for i in range(len(weights)):
    temp_dict = dict()
    for j in range(len(weights[i])):
        temp_dict[x.columns[j]] = weights[i][j]
    weight_list.append(temp_dict)
weight_df = pd.DataFrame(weight_list, index=names)
weight_df    

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIDO,B,LSTAT
LinearRegression,-0.098542,0.060784,0.059172,2.43956,-21.469965,2.795814,0.003575,-1.516272,0.307542,-0.01128,-1.005466,0.00645,-0.568835
Ridge,-0.089935,0.062035,0.01214,2.234261,-11.283815,2.896189,-0.004815,-1.36999,0.283653,-0.012283,-0.88423,0.007098,-0.580034
Lasso,-0.052568,0.059043,-0.0,0.0,-0.0,0.0,0.01965,-0.576754,0.233009,-0.012307,-0.721092,0.006003,-0.797115
ElasticNet,-0.068921,0.062292,-0.0,0.0,-0.0,0.606215,0.015065,-0.690227,0.265536,-0.01387,-0.715514,0.006399,-0.754683


In [57]:
models = pd.concat([models, weight_df], axis=1)
models 
# 컬럼부터 가중치가 잡힌다.
# weight 와 weights를 어떻게 구별하나요? 사용할 때  
# => 그냥 변수들이어서 weight가 하나면 weight  여러개면  weights로 사용

Unnamed: 0,score,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIDO,B,LSTAT
LinearRegression,,-0.098542,0.060784,0.059172,2.43956,-21.469965,2.795814,0.003575,-1.516272,0.307542,-0.01128,-1.005466,0.00645,-0.568835
Ridge,,-0.089935,0.062035,0.01214,2.234261,-11.283815,2.896189,-0.004815,-1.36999,0.283653,-0.012283,-0.88423,0.007098,-0.580034
Lasso,,-0.052568,0.059043,-0.0,0.0,-0.0,0.0,0.01965,-0.576754,0.233009,-0.012307,-0.721092,0.006003,-0.797115
ElasticNet,,-0.068921,0.062292,-0.0,0.0,-0.0,0.606215,0.015065,-0.690227,0.265536,-0.01387,-0.715514,0.006399,-0.754683


In [59]:
models["bias"] = list(map(lambda x: x[0],bias))
models
# bias가 배열배열이어서 리스트로 바꿔주세요 한 다음에 추가한 것 (bias는 상수)
# 스코어 뒤에 있는것들 = 각각의 가중치
# 그렇다면 가중치가 하는 역할은? => y와 x의 관계를 만들 때 x가 y한테 얼마만큼의 영향을 가지느냐 를 나타낸 것
# x_train이 y_train한테 얼마나 영향을 끼쳤냐
# nox의 경우에는 다른애들이 -,0,+일 때 가장 많은 영향력을 주고 있다. -값이 크다.
# 근데 bias의 영향력도 큰건가?  음수인게 영향력이 큰건가?

Unnamed: 0,score,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIDO,B,LSTAT,bias
LinearRegression,,-0.098542,0.060784,0.059172,2.43956,-21.469965,2.795814,0.003575,-1.516272,0.307542,-0.01128,-1.005466,0.00645,-0.568835,46.396494
Ridge,,-0.089935,0.062035,0.01214,2.234261,-11.283815,2.896189,-0.004815,-1.36999,0.283653,-0.012283,-0.88423,0.007098,-0.580034,38.951619
Lasso,,-0.052568,0.059043,-0.0,0.0,-0.0,0.0,0.01965,-0.576754,0.233009,-0.012307,-0.721092,0.006003,-0.797115,46.990859
ElasticNet,,-0.068921,0.062292,-0.0,0.0,-0.0,0.606215,0.015065,-0.690227,0.265536,-0.01387,-0.715514,0.006399,-0.754683,43.504887
