In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
# Relative path to the raw listings CSV. Forward slashes are accepted on
# Windows, Linux and macOS alike, whereas a backslash-separated path is only
# resolved as a directory path on Windows.
file_name = "./data/properties.csv"
df = pd.read_csv(file_name)


In [None]:
# Coarse location feature: the first two characters of the zip code as a string.
df['postal_zone'] = df['zip_code'].astype(str).str[:2]

# Columns that are not used as model inputs (zip_code is replaced by postal_zone).
columns_to_drop = ["id", "zip_code", "locality","latitude","longitude","construction_year","nbr_frontages", "equipped_kitchen", "epc","fl_double_glazing", "state_building", "fl_open_fire"]
df = df.drop(columns=columns_to_drop)
# Disabled alternative: drop rows with missing area/energy values instead of imputing them.
# df = df.dropna(subset=["terrace_sqm", "garden_sqm","primary_energy_consumption_sqm","total_area_sqm"])

# Keep only listings priced at or below 1,200,000.
# .copy() makes the filtered frame independent of the original, so the later
# in-place .loc assignment on df does not raise SettingWithCopyWarning.
df = df[df["price"] <= 1200000].copy()

In [None]:
condition = "APARTMENT"

# For rows whose property_type equals `condition`, overwrite surface_land_sqm
# with the row's total_area_sqm. The row mask is computed once and reused on
# both sides of the assignment.
is_apartment = df['property_type'] == condition
df.loc[is_apartment, 'surface_land_sqm'] = df.loc[is_apartment, 'total_area_sqm']

In [None]:
# Target: the listing price. Features: every other remaining column.
y = df["price"]
X = df.drop(columns="price")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)


## Linear Regression Pipeline

### BluePrint Pipeline

A pipeline in scikit-learn is a sequence of data processing steps that are chained together. These steps typically include data preprocessing, feature extraction, and model fitting. Each step in the pipeline is represented by a tuple containing a name (string) and an estimator: every intermediate step must implement the fit and transform methods, while the final step only needs to implement fit.

An important aspect of pipelines in scikit-learn is that they apply each transformation uniformly to all columns of the dataset. However, not all columns should be handled the same way (for example, numerical and categorical columns). ColumnTransformer solves this by applying different transformations to different subsets of columns.

In [None]:
# Create pipelines for numerical transformations
# Numerical branch: standardise each column, then fill missing values with a
# KNN imputer.
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # n_neighbors is the number of neighbouring rows used to fill a missing
    # value; adjust as needed.
    ('imputer', KNNImputer(n_neighbors=5)),
    # Further numerical steps can be appended here.
])

In [None]:
# Create pipelines for categorical transformations
# Categorical branch: one-hot encode every categorical column.
# handle_unknown='ignore' encodes a category that was not present when the
# encoder was fitted as all zeros; the default ('error') would instead make
# transform/predict raise on the test split or on new data containing such a
# category.
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    #, ...
])

In [None]:
# Find the numerical columns
# Columns with an integer or float dtype are routed to the numerical branch.
numerical_columns = (
    X_train
    .select_dtypes(include=['int', 'float'])
    .columns
)

# Columns with object dtype are routed to the categorical branch.
categorical_columns = (
    X_train
    .select_dtypes(include=['object'])
    .columns
)

The ColumnTransformer lets me specify exactly which columns of the dataset each transformation should be applied to, so I do not have to subset the dataset manually.
A ColumnTransformer is set up much like a Pipeline; however, each tuple takes an additional element, namely the list of columns the transformation should be applied to.
Note also that a Pipeline can itself be a step inside another Pipeline or a ColumnTransformer.
Below you can see how I pass the previously designed Pipelines to the ColumnTransformer.

In [None]:
# Combine numerical and categorical pipelines using ColumnTransformer
# Route each column group to its own sub-pipeline.
# Every entry has the form (name, transformer or pipeline, columns).
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, numerical_columns),
    ('categorical', categorical_pipeline, categorical_columns),
    # Further (name, transformer, columns) entries can be appended here.
])

In [None]:
# Create the final pipeline
# End-to-end estimator: preprocessing followed by a linear regression model.
regression_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # A feature-selection step could be inserted here.
    ('model', LinearRegression()),
])

### Building the Pipeline and the model

The .fit() method in scikit-learn is used to train the machine learning model on the provided training data. 

In [None]:
# Fit the preprocessing steps and the regression model on the training split only.
regression_pipeline.fit(X_train, y=y_train)

### Saving Pipeline

In [None]:
# Serialise the fitted pipeline (preprocessing + model) to a single pickle file.
save_path = r'.\model.pkl'
with open(save_path, mode='wb') as f:
    pickle.dump(regression_pipeline, f)


### Using saved model to predict

### Opening Pipeline from Pickle File

In [None]:
# Restore the pipeline from the pickle file.
# Unpickling can execute arbitrary code, so only load pickle files you trust.
path_to_file = r'.\model.pkl'
with open(path_to_file, mode='rb') as f:
    regression_model_pickle = pickle.load(f)

In [None]:
# Predict prices for the held-out rows with the reloaded pipeline; the raw
# features are passed in and the pipeline applies its preprocessing step first.
y_pred = regression_model_pickle.predict(X_test)
print(y_pred)

In [None]:
# Print the reloaded pipeline's score on the held-out split.
print(regression_model_pickle.score(X_test, y=y_test))

Printing the name of features after preprocessing.

In [None]:
# Look up the fitted preprocessing step by its step name, then list the
# feature names it produces.
preprocessing_step = regression_model_pickle['preprocessor']
feature_names_after_preprocessing = preprocessing_step.get_feature_names_out()
print(feature_names_after_preprocessing)

In [None]:
# Print the column names of the raw (un-preprocessed) test features.
print(X_test.columns)

# Q&A?

LinkedIn: https://www.linkedin.com/in/jens-dedeyne/