# California 데이터 선형 회귀분석

## 필요한 라이브러리 불러오기

In [2]:
import multiprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use(['seaborn-whitegrid'])

In [10]:
from sklearn.datasets import fetch_california_housing
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, OrthogonalMatchingPursuit, ElasticNet

## California 데이터 분석하기

In [5]:
california = fetch_california_housing()

print(california.keys())
print(california.DESCR)

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


dict_keys(['data', 'target', 'feature_names', 'DESCR'])
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the 

## California 최적 파라미터 찾기

In [6]:
X_train, X_test, y_train, y_test = train_test_split(california.data, california.target, test_size=0.2)

In [9]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Lasso

In [76]:
estimator = Lasso()
TF = [True, False]
alpha = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10] 
tol = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10] 
param_grid = {'alpha':alpha,
              'copy_X':TF,
              'fit_intercept':TF,
              'normalize':TF,
              'positive':TF,
              'precompute':TF,
              'random_state':TF,
              'tol':tol,
              'warm_start':TF}

gs = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=10,
    verbose=True
)
result = gs.fit(X_train, y_train)

print("최적의 점수: {}".format(result.best_score_))
print("최적의 파라미터: {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_);

Fitting 10 folds for each of 10368 candidates, totalling 103680 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 646 tasks      | elapsed:    3.5s
[Parallel(n_jobs=2)]: Done 5446 tasks      | elapsed:   22.8s
[Parallel(n_jobs=2)]: Done 13446 tasks      | elapsed:   54.2s
[Parallel(n_jobs=2)]: Done 24646 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 39046 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done 56646 tasks      | elapsed:  3.5min
[Parallel(n_jobs=2)]: Done 77446 tasks      | elapsed:  4.8min
[Parallel(n_jobs=2)]: Done 101446 tasks      | elapsed:  6.2min
[Parallel(n_jobs=2)]: Done 103680 out of 103680 | elapsed:  6.3min finished


최적의 점수: 0.6019882919322056
최적의 파라미터: {'alpha': 0.005, 'copy_X': True, 'fit_intercept': True, 'normalize': False, 'positive': False, 'precompute': True, 'random_state': True, 'tol': 0.001, 'warm_start': True}
Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=True, random_state=True,
      selection='cyclic', tol=0.001, warm_start=True)


### Ridge

In [80]:
estimator = Ridge()
TF = [True, False]
alpha = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1] 
tol = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1] 
param_grid = {'alpha':alpha,
              'copy_X':TF,
              'fit_intercept':TF,
              'normalize':TF,
              'random_state':TF,
              'tol':tol,}

gs = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=10,
    verbose=True
)
result = gs.fit(X_train, y_train)

print("최적의 점수: {}".format(result.best_score_))
print("최적의 파라미터: {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_);

Fitting 10 folds for each of 784 candidates, totalling 7840 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 584 tasks      | elapsed:    2.9s
[Parallel(n_jobs=2)]: Done 5384 tasks      | elapsed:   18.4s


최적의 점수: 0.5997504518970558
최적의 파라미터: {'alpha': 0.001, 'copy_X': True, 'fit_intercept': True, 'normalize': True, 'random_state': True, 'tol': 0.001}
Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=True, random_state=True, solver='auto', tol=0.001)


[Parallel(n_jobs=2)]: Done 7840 out of 7840 | elapsed:   26.3s finished


### ElasticNet

In [83]:
estimator = ElasticNet()
TF = [True, False]
alpha = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1] 
tol = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1] 
param_grid = {'alpha':alpha,
              'copy_X':TF,
              'fit_intercept':TF,
              'normalize':TF,
              'random_state':TF,
              'tol':tol,}
              

gs = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=10,
    verbose=True
)
result = gs.fit(X_train, y_train)

print("최적의 점수: {}".format(result.best_score_))
print("최적의 파라미터: {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_);

Fitting 10 folds for each of 784 candidates, totalling 7840 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 956 tasks      | elapsed:    7.6s
[Parallel(n_jobs=2)]: Done 5756 tasks      | elapsed:   31.0s


최적의 점수: 0.6012345023418828
최적의 파라미터: {'alpha': 0.005, 'copy_X': True, 'fit_intercept': True, 'normalize': False, 'random_state': True, 'tol': 0.001}
ElasticNet(alpha=0.005, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=True, selection='cyclic', tol=0.001, warm_start=False)


[Parallel(n_jobs=2)]: Done 7840 out of 7840 | elapsed:   37.7s finished


### OrthogonalMatching

In [87]:
estimator = OrthogonalMatchingPursuit(n_nonzero_coefs=5)
TF = [True, False]
tol = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1] 
param_grid = {'fit_intercept':TF,
              'normalize':TF,
              'tol':tol}


gs = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=10,
    verbose=True
)
result = gs.fit(X_train, y_train)

print("최적의 점수: {}".format(result.best_score_))
print("최적의 파라미터: {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_);

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


최적의 점수: 0.5997047110291353
최적의 파라미터: {'fit_intercept': True, 'normalize': True, 'tol': 0.001}
OrthogonalMatchingPursuit(fit_intercept=True, n_nonzero_coefs=5, normalize=True,
                          precompute='auto', tol=0.001)


[Parallel(n_jobs=2)]: Done 270 tasks      | elapsed:    2.1s
[Parallel(n_jobs=2)]: Done 280 out of 280 | elapsed:    2.1s finished
dependence in the dictionary. The requested precision might not have been met.

  return_n_iter=True)


### Linear Regression

In [89]:
estimator = LinearRegression()
TF = [True, False]
param_grid = {'copy_X':TF,
              'fit_intercept':TF,
              'normalize':TF}

gs = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=10,
    verbose=True
)
result = gs.fit(X_train, y_train)

print("최적의 점수: {}".format(result.best_score_))
print("최적의 파라미터: {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_);

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


최적의 점수: 0.5997047110291354
최적의 파라미터: {'copy_X': True, 'fit_intercept': True, 'normalize': True}
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)


[Parallel(n_jobs=2)]: Done  80 out of  80 | elapsed:    1.7s finished


## 결과 : Lasso 가 가장 높게 나타남

Fitting 10 folds for each of 10368 candidates, totalling 103680 fits<br>
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.<br>
[Parallel(n_jobs=2)]: Done 646 tasks      | elapsed:    3.5s<br>
[Parallel(n_jobs=2)]: Done 5446 tasks      | elapsed:   22.8s<br>
[Parallel(n_jobs=2)]: Done 13446 tasks      | elapsed:   54.2s<br>
[Parallel(n_jobs=2)]: Done 24646 tasks      | elapsed:  1.6min<br>
[Parallel(n_jobs=2)]: Done 39046 tasks      | elapsed:  2.5min<br>
[Parallel(n_jobs=2)]: Done 56646 tasks      | elapsed:  3.5min<br>
[Parallel(n_jobs=2)]: Done 77446 tasks      | elapsed:  4.8min<br>
[Parallel(n_jobs=2)]: Done 101446 tasks      | elapsed:  6.2min<br>
[Parallel(n_jobs=2)]: Done 103680 out of 103680 | elapsed:  6.3min finished<br>
### **최적의 점수: 0.6019882919322056**<br>
최적의 파라미터: {'alpha': 0.005, 'copy_X': True, 'fit_intercept': True, <br>'normalize': False, 'positive': False, 'precompute': True, 'random_state': True, <br>'tol': 0.001, 'warm_start': True}<br>
Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,<br>
      normalize=False, positive=False, precompute=True, random_state=True,<br>
      selection='cyclic', tol=0.001, warm_start=True)