In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn import decomposition, datasets
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import set_config
from sklearn.neighbors import KNeighborsRegressor
import joblib

In [3]:
silicon_df = pd.read_csv('housing-train-data.csv')
#silicon_df = silicon_df.rename(columns={"Unnamed: 0": "id_zone"})
silicon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          16512 non-null  int64  
 1   longitude           16512 non-null  float64
 2   latitude            16512 non-null  float64
 3   housing_median_age  16512 non-null  float64
 4   total_rooms         16512 non-null  float64
 5   total_bedrooms      16336 non-null  float64
 6   population          16512 non-null  float64
 7   households          16512 non-null  float64
 8   median_income       16512 non-null  float64
 9   median_house_value  16512 non-null  float64
 10  ocean_proximity     16512 non-null  object 
dtypes: float64(9), int64(1), object(1)
memory usage: 1.4+ MB


In [4]:
num_cols = silicon_df.drop(['ocean_proximity','median_house_value'], axis=1).columns.tolist()
cat_cols = ['ocean_proximity']

num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale',MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse=False))
])

col_trans = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols)
], remainder='drop', n_jobs=-1)

In [5]:
KNN = neighbors.KNeighborsRegressor()
pipe = Pipeline(steps = [('col_trans', col_trans),
                           ("KNN", KNN)])
pipe

In [6]:
X = silicon_df.drop('median_house_value', axis=1)
y = silicon_df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.25, random_state=42)

In [7]:
n_neighbors = [2, 3, 5, 10]
algorithm = ["auto",  "ball_tree", "kd_tree", "brute"]

parameters = dict(KNN__n_neighbors=n_neighbors,
                      KNN__algorithm=algorithm)

In [9]:
regr = GridSearchCV(pipe, parameters, cv=5, scoring='r2')

regr.fit(X_train, y_train)



In [8]:
regr.best_params_

{'KNN__algorithm': 'auto', 'KNN__n_neighbors': 10}

In [10]:
best_regr = regr.best_estimator_
best_regr

In [11]:
y_pred = best_regr.predict(X_test)

best_regr.score(X_test, y_test)

0.7196075871419345

In [12]:
filename = 'modele_regr_Ryan.joblib'
joblib.dump(regr, filename)

['modele_regr_Ryan.joblib']

In [13]:
load_regr = joblib.load(filename)