In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib


In [3]:
 # Load the training data from the CSV file (path is relative to the working directory).
housing_df = pd.read_csv(filepath_or_buffer="housing-train-data.csv")
# Bare last expression so the notebook renders the loaded frame.
housing_df

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...,...
16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


In [4]:
# Working handle used by the rest of the notebook. This binds a second name to
# the same DataFrame object (no copy is made), so any in-place change made
# through `data` would also be visible through `housing_df`.
data = housing_df

In [5]:
# Remove duplicate rows (to be done before building the pipeline)
#data.drop_duplicates(inplace=True)

In [6]:
# --- Numeric branch -------------------------------------------------------
# Columns that are imputed and then standardized.
numeric_features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]
numeric_steps = [
    # Missing values are replaced by the column mean (strategy='mean').
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# --- Categorical branch ---------------------------------------------------
# Columns that are imputed and then one-hot encoded.
categorical_features = ['ocean_proximity']
categorical_steps = [
    # Missing values are replaced by the literal category 'missing'.
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # Categories not seen during fit are ignored at transform time
    # instead of raising (handle_unknown='ignore').
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)


In [7]:
# Build the full pipeline (preprocessing + model)
#drop_columns_transformer = DropColumnsTransformer(columns_to_drop=['Unnamed: 0', 'median_house_value'])
#pipeline = Pipeline(steps=[
 #   ('drop_columns', drop_columns_transformer),
  #  ('preprocessor', preprocessor),
    # Add your model here (for example, ('knn', KNNRegressor()))
#])


In [20]:
# Send each column group through its own preprocessing branch.
# Columns that belong to neither group are discarded: remainder='drop' is the
# ColumnTransformer default and is only spelled out here for the reader.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# K-nearest-neighbours regressor using 8 neighbours.
model = KNeighborsRegressor(n_neighbors=8)

# End-to-end estimator: preprocessing followed by the regressor.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])


In [21]:
# Separate the target column from the remaining columns, then hold out 20% of
# the rows for evaluation. The fixed random_state makes the split repeatable.
target_column = 'median_house_value'
y = data[target_column]
X = data.drop(columns=target_column)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Fit the whole pipeline (preprocessing and regressor) on the training split.
# Kept as the cell's final expression so the notebook displays the returned estimator.
pipeline.fit(X_train, y=y_train)

In [23]:
# Predict on the held-out split and compute the regression metrics.
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed just above
r2 = r2_score(y_test, y_pred)

# Report every metric as a "label value" line.
metrics_report = (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R² Score:", r2),
)
for label, value in metrics_report:
    print(label, value)

Mean Absolute Error: 40399.063237965485
Mean Squared Error: 3559441175.5134964
Root Mean Squared Error: 59661.05241707941
R² Score: 0.7304156256147942


In [12]:
# Persist the pipeline object to disk with joblib.
# NOTE(review): this cell's execution count (12) is lower than the counts of
# the fit/evaluation cells (20-23), so the file on disk may predate the last
# fit — re-run this cell after fitting to be sure.
filename = 'knn_regressor_pipeline.joblib'
# Final expression: the notebook displays the list of file names returned by dump.
joblib.dump(value=pipeline, filename=filename)

['knn_regressor_pipeline.joblib']

In [13]:
# Read the pipeline back from the file named by `filename`.
# joblib.load unpickles the file contents, so it should only be used on files
# that come from a trusted source.
loaded_pipeline = joblib.load(filename=filename)