In [1]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH= os.path.join("datasets", "housing")
HOUSING_URL= DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


#カリフォルニア住宅データ
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path=os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz=tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()    
    
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
        csv_path = os.path.join(housing_path,"housing.csv")
        return pd.read_csv(csv_path)
    
    

    
    
housing = load_housing_data()
#%matplotlib inline
import matplotlib.pyplot as plt
#housing.hist(bins=50,figsize=(20,15))
#plt.show()

import numpy as np

def split_train_test(data,test_radio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data)*test_radio)
    test_indices=shuffled_indices[:test_set_size]
    train_indices=shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +" ,len(test_set), "test")

import hashlib

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] <256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids=data[id_column]
    in_test_set=ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]

housing_with_id=housing.reset_index()
housing_with_id["id"]=housing["longitude"]*1000 +housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id,0.2,"id")


from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state =42)

housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"] < 5,5.0, inplace=True)

#層化抽出法
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    

strat_test_set["income_cat"].value_counts()/len(strat_test_set)    

for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat",axis=1, inplace=True)
    
housing=strat_train_set.copy()

#housing.plot(kind="scatter",x="longitude", y="latitude")
#housing.plot(kind="scatter",x="longitude", y="latitude",alpha=0.1)
#housing.plot(kind="scatter",x="longitude", y="latitude",alpha=0.1,s=housing["population"]/100,label="population", figsize=(10,7),
#            c="median_house_value",cmap=plt.get_cmap("jet"), colorbar=True,
#            )

#plt.show()

corr_matrix=housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

from pandas.plotting import scatter_matrix

attributes=["median_house_value","median_income","total_rooms","housing_median_age"]
#scatter_matrix(housing[attributes],figsize=(12,8))

#housing.plot(kind="scatter",x="median_income",y="median_house_value",alpha=0.1)

housing["rooms_per_household"]=housing["total_rooms"]/housing["households"]
housing["bedrooms_per_rooms"]=housing["total_bedrooms"]/housing["total_rooms"]
housing["Population_per_household"]=housing["population"]/housing["households"]
#corr_matrix=housing.corr()
#corr_matrix["median_house_value"].sort_values(ascending=False)

housing = strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set["median_house_value"].copy()

#housing.dropna(subset=["total_bedrooms"])
#housing.drop("total_bedrooms",axis=1)
#median=housing["total_bedrooms"].median()
#housing["total_bedrooms"].fillna(median,inplace=True)

#欠損値補完
from sklearn.impute import SimpleImputer

imputer=SimpleImputer(strategy="median")

housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
imputer.statistics_
housing_num.median().values

X=imputer.transform(housing_num)
housing_tr=pd.DataFrame(X, columns=housing_num.columns)

housing_cat=housing["ocean_proximity"]
housing_cat.head(10)
housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]
print(housing_categories)

#カテゴリ1/0処理
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(categories='auto')
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
#print(housing_cat_1hot)
#housing_cat_1hot.toarray()
#from sklearn.preprocessing import CategoricalEncoder
#cat_encoder = CategoricalEncoder()
#housing_cat_reshaped = housing_cat.values.reshape(-1,1)
#housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshped)
#housing_cat_1hot

from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix=3,4,5,6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X,y=None):
        rooms_per_household = X[:, rooms_ix]/X[:, household_ix]
        population_per_household = X[:, population_ix]/X[:,household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room=X[:,bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]##c_列ベクトル結合
        else:
            return np.c_[X, rooms_per_household, population_per_household]
        
        
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

#パイプライン通す
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ("imputer",imputer),
    ("attribs_adder",CombinedAttributesAdder()),
    ("std_sclaer", StandardScaler()),
])
        
housing_num_tr = num_pipeline.fit_transform(housing_num)

#属性選択クラス
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values
    
num_attribs = list(housing_num)
cat_attribs=["ocean_proximity"]
    
num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attribs)),
    ("imputer", imputer),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])    


cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('encoder', OneHotEncoder(sparse=False,categories='auto'))##categories='auto',
])


full_pipeline = FeatureUnion(transformer_list=[
('num_pipeline',num_pipeline),
('cat_pipeline',cat_pipeline),
])



#try:
##    from sklearn.compose import ColumnTransformer
#except ImportError:
#    from future_encoders import ColumnTransformer
#num_attribs = list(housing_num)
#cat_attribs = ["ocean_proximity"]

#full_pipeline = ColumnTransformer([
#        ("num", num_pipeline, num_attribs),
#        ("cat", OneHotEncoder(), cat_attribs),
#    ])

housing_prepared=full_pipeline.fit_transform(housing)
print(type(housing_prepared[1]))
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = num_pipeline.fit_transform(some_data)
#print("Predictions:", lin_reg.predict(some_data_prepared))
#print("Labels:", list(some_labels))

#MSE
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse=mean_squared_error(housing_labels, housing_predictions)
lin_rmse=np.sqrt(lin_mse)
#lin_rmse
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
#tree_rmse

from sklearn.model_selection import cross_val_score
scores=cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:",scores.std())
    
#display_scores(tree_rmse_scores)    

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,scoring="neg_mean_squared_error",cv=10)
lin_rmse_scores=np.sqrt(-lin_scores)
#display_scores(lin_rmse_scores)

from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_reg_predictions = forest_reg.predict(housing_prepared)
forest_reg_mse = mean_squared_error(housing_labels, forest_reg_predictions)
forest_rmse = np.sqrt(forest_reg_mse)
forest_rmse
forest_reg_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,scoring="neg_mean_squared_error",cv=10)
forest_reg_rmse_scores=np.sqrt(-forest_reg_scores)
#display_scores(forest_reg_rmse_scores)
#from sklearn.externals import joblib
import joblib

from sklearn.model_selection import GridSearchCV
param_grid=[
    {"n_estimators":[3,10,30],"max_features":[2,4,6,8]},
    {"bootstrap":[False],"n_estimators":[3,10],"max_features":[2,3,4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring="neg_mean_squared_error",refit=True,
                        )

grid_search.fit(housing_prepared, housing_labels)

grid_search.best_params_
grid_search.best_estimator_

cvres= grid_search.cv_results_
#for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#    print(np.sqrt(-mean_score),params)
    
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

extra_attribs = ["rooms_per_hhold", "pop_per_hhold","bedrooms_per_room"]
attribs=num_attribs + extra_attribs
sorted(zip(feature_importances,attributes),reverse=True)

final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value",axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
    
final_predictions = final_model.predict(X_test_prepared)

final_mse=mean_squared_error(y_test,final_predictions)
final_rmse=np.sqrt(final_mse)


#SVM
from sklearn.svm import SVR #回帰の場合はSVCではなくてSVRを使う

param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]
svm_reg = SVR()   #モデルのパラメータはすべて辞書形式でまとめる　rbf_SVC=SVC(kernel="linear")　こうしない
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
grid_search.fit(housing_prepared, housing_labels)

cvres=grid_search.cv_results_
negative_mse=cvres["mean_test_score"]
#negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

grid_search.best_params_

##設問２
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal
param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }

svm_reg = SVR()
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                verbose=2, n_jobs=4, random_state=42)
rnd_search.fit(housing_prepared, housing_labels)


feature_importancesa = rnd_search.best_estimator_.feature_importances_
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr,k):
    return np,sort(np.argpartition(np.array(arr), -k)[-k:])

#重要な特徴量抽出
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def _init_(seld,feature_importances,k):
        self.feature_importances=feature_importances
        self.k=k
    def fit(self, X,y=None):
        self.feature_indices_=indices_of_top_k(self.feature_importances, self,k)
        return self
    def transform(self,X):
        return X[:, self.feature_indices]
    
k = 5

prepare_Select_And_predict_pipeline=Pipeline([
    ("preparation", full_pipeline),
    ("feature_selection",TopFeatureSelector(feature_importances,k)),
    ("svm_reg",SVR(**rnd_search.best_params_))  
])



#前処理のオプション
param_grid=[{
    "preparation__num__imputer__strategy":["mean","median",
                                       "most_frequent"],
    "feature_selection__k": list(range(1, len(feature_imporances)+1))
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline,param_grid,cv=5,
                                scoring="neg_mean_squared_error",verbose=2,n_jobs=4)

grid_search_prep.fit(housing, housing_labels)



16512 train + 4128 test
Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')


NameError: name 'FeatureUnion' is not defined

In [None]:
param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]
svm_reg = SVR()   #モデルのパラメータはすべて辞書形式でまとめる　rbf_SVC=SVC(kernel="linear")　こうしない

In [None]:

import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH= os.path.join("datasets", "housing")
HOUSING_URL= DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH):
    os.makedirs(HOUSING_PATH,exist_ok=True)
    tgz_path=os.path.join(housing_path,"housing.tgz")
    urllib.request.urlretrieve(housing_url,tgz_path)
    housing_tgz=tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    
    
fetch_housing_data()

import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path=os.path.join(housing_path,"housing.csv")
    return pd.read_csv(csv_path)
    
housing=load_housing_data()
housing.head()

housing.info()

housing["ocean_proximity"].value_counts()
housing.describe()
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50,figsize=(20,15))
import numpy as np

def split_train_test(data,test_ratio):
    shuffled_index=np.random.permutation(len(data))
    test_size=int(test_ratio*len(data))
    test_indices=shuffled_index[:test_size]
    train_indices=shuffled_index[test_size:]
    return data.iloc[train_indices],data.iloc[test_indices]

train_set,test_set=split_train_test(housing,0.2)
len(train_set),len(test_set)
from zlib import crc32

def test_set_check(identifier,test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff <test_ratio *2**32

def split_train_test_by_id(data,test_ratio,id_column):
    ids=data[id_column]
    in_test_set=ids.apply(lambda id_:test_set_check(id_,test_ratio))
    return data.loc[~in_test_set],data.loc[in_test_set]

housing_with_id=housing.reset_index()
train_set,test_set=split_train_test_by_id(housing_with_id,0.2,"index")

housing_with_id["id"]=housing["longitude"]*1000+housing["latitude"]
train_set,test_set=split_train_test_by_id(housing_with_id,0.2,"id")

from sklearn.model_selection import train_test_split
train_set,test_set=train_test_split(housing,test_size=0.2,random_state=42)

import numpy as np
import pandas as pd
housing["income_cat"]=pd.cut(housing["median_income"],bins=[0.,1.5,3,4.5,6.,np.inf],labels=[1,2,3,4,5])
housing["income_cat"].hist()

#層化抽出
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(housing,housing["income_cat"]):
    strat_train_set=housing.loc[train_index]
    strat_test_set=housing.loc[test_index]
    
#strat_test_set["income_cat"].value_counts(normalize=True)
strat_test_set["income_cat"].value_counts()/len(strat_test_set)

for set_ in (strat_train_set,strat_test_set): 
    set_.drop("income_cat",axis=1,inplace=True)#inplaceで元のオブジェクト交換
    
housing=strat_train_set.copy()

#housing.plot(kind="scatter",x="longitude",y="latitude",alpha=0.1)
#housing.plot(kind="scatter",x="longitude",y="latitude",alpha=0.4,s=housing["population"]/100,label="population",figsize=(10,7),c="median_house_value",cmap=plt.get_cmap("jet"),colorbar=True,)
#plt.legend()

corr_matrix=housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

from pandas.plotting import scatter_matrix
attribes=["median_house_value","median_income","total_rooms","housing_median_age"]
scatter_matrix(housing[attribes],figsize=(12,8))
#housing.plot(kind="scatter",x="median_income",y="median_house_value",alpha=0.1)

housing["rooms_per_household"]=housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"]=housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

corr_matrix=housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

housing=strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set["median_house_value"].copy()

housing.drop("total_bedrooms",axis=1)
housing.dropna(subset=["total_bedrooms"])
median=housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median,inplace=True)

#欠損地補完
from sklearn.impute import SimpleImputer

imputer=SimpleImputer(strategy="median")
housing_num=housing.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
imputer.statistics_
housing_num.median().values

X=imputer.transform(housing_num)

housing_tr=pd.DataFrame(X,columns=housing_num.columns,index=housing_num.index)
#インスタンス変数　https://uxmilk.jp/41600
#オブジェクト　すべての総称　クラス　設計書　インスタンス　クラスからインスタンス化されたオブジェクト
#コンストラクタ　クラスをインスタンス化する際に実行される特別なメソッド。

housing_cat=housing[["ocean_proximity"]]#[[二つで1次元が2次元表記
housing_cat.head(10)

from sklearn.preprocessing import OrdinalEncoder
encoder=OrdinalEncoder()
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]
encoder.categories_

#カテゴリー処理
from sklearn.preprocessing import OneHotEncoder
cat_encoder=OneHotEncoder()#sparse=False)
housing_cat_1hot=cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
cat_encoder.categories_

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix,bedrooms_ix,population_ix,households_ix=3,4,5,6

#属性変換
class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    def __init__(self,add_bedrooms_per_room=True):
        self.add_bedrooms_per_room=add_bedrooms_per_room
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        rooms_per_household = X[:,rooms_ix]/X[:,households_ix]
        population_per_households=X[:,population_ix]/X[:,households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room=X[:,bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X,rooms_per_household,population_per_households,bedrooms_per_room]
        else:
            return np.c_[X,rooms_per_household,population_per_households]
        
attr_adder=CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs=attr_adder.transform(housing.values)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline=Pipeline([
    ("imputer",SimpleImputer(strategy="median")),
    ("attr_adder",CombinedAttributesAdder()),
    ("std_scaler",StandardScaler())
])

housing_num_tr=num_pipeline.fit_transform(housing_num)

from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

import numpy as np

full_pipeline=ColumnTransformer([
    ("num",num_pipeline,num_attribs),
    ("cat",OneHotEncoder(),cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)
from sklearn.linear_model import LinearRegression

lin_reg=LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

some_data=housing.iloc[:5]
some_labels=housing_labels[:5]
some_data_prepared=full_pipeline.transform(some_data)
#print(some_data_prepared==housing_prepared[:5])
print("predictions :",lin_reg.predict(some_data_prepared))
print("labels :", list(some_labels))

from sklearn.metrics import mean_squared_error
housing_predictions=lin_reg.predict(housing_prepared)
lin_mse=mean_squared_error(housing_labels,housing_predictions)
lin_rmse=np.sqrt(lin_mse)
print("RMSE :",lin_rmse)

#決定木
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse=mean_squared_error(housing_labels,housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

#交差検証
from sklearn.model_selection import cross_val_score
scores=cross_val_score(tree_reg,housing_prepared,housing_labels,scoring = "neg_mean_squared_error",cv=10)
tree_rmse_scores=np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
display_scores(tree_rmse_scores) 
lin_scores = cross_val_score(lin_reg,housing_prepared,housing_labels,scoring = "neg_mean_squared_error",cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

#ランダムフォレスト
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg,housing_prepared,housing_labels,scoring = "neg_mean_squared_error",cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

#モデル保存
import joblib
joblib.dump(tree_rmse_scores, r"C:\Users\a.sakata\py3env\my_env\machine_learning\2sec_model\tree_reg.pkl")
joblib.dump(forest_rmse_scores, r"C:\Users\a.sakata\py3env\my_env\machine_learning\2sec_model\forest_reg.pkl")
joblib.dump(lin_rmse_scores, r"C:\Users\a.sakata\py3env\my_env\machine_learning\2sec_model\lin_reg.pkl")

my_model_loaded=joblib.load(r"C:\Users\a.sakata\py3env\my_env\machine_learning\2sec_model\lin_reg.pkl")

#グリッドサーチ、交差検証
from sklearn.model_selection import GridSearchCV
param_dict=[
    {"n_estimators":[3,10,30],"max_features":[2,4,6,8]},
    {"bootstrap":[False],"n_estimators":[3,10],"max_features":[2,3,4]},
]

forest_reg=RandomForestRegressor()
grid=GridSearchCV(forest_reg,param_dict,cv=5,scoring="neg_mean_squared_error",return_train_score=True)

grid.fit(housing_prepared,housing_labels)
grid.best_params_
grid.best_estimator_
joblib.dump(r"C:\Users\a.sakata\py3env\my_env\machine_learning\2sec_model\grid_forest_reg.pkl")
cvrs=grid.cv_results_
for mean_score,params in zip(cvrs["mean_test_score"],cvrs["params"]):
    print(np.sqrt(mean_score),params)
joblib.dump(grid,r"C:\Users\a.sakata\py3env\my_env\machine_learning\2sec_model\grid_forest_reg.pkl")   

feature_importances=grid.best_estimator_.feature_importances_
feature_importances

extra_attribs=["rooms_per_household","population_per_households","bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes=num_attribs+extra_attribs+cat_one_hot_attribs
sorted(zip(attributes,feature_importances),reverse = True)
