In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from pathlib import Path
import logging
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
# Configure the root logger at INFO level so the progress messages emitted by
# the logging.info calls in the following cells are displayed.
logging.basicConfig(level=logging.INFO)

## Data Split

Split des données en ensemble d'entraînement et de test.

Notre variable cible est silica_concentrate et se trouve dans la dernière colonne du dataset.

À l'issue de ce script, on obtiendra 4 datasets (X_test, X_train, y_test, y_train) que vous pouvez stocker dans data/processed.

In [4]:
# Raw CSV file read by the data-split step.
INPUT_DATA_PATH = Path('data/raw_data/raw.csv')
# Folder that receives the train/test arrays written by the data-split step.
OUTPUT_FOLDER = Path("data/processed_data")

In [5]:
# Load the raw CSV into a DataFrame. The path is logged before the read so a
# failing read can be traced back to the file that was requested.
logging.info("Reading data from '%s'", INPUT_DATA_PATH)
df = pd.read_csv(INPUT_DATA_PATH)

INFO:root:Reading data from 'data/raw_data/raw.csv'


In [9]:
# Build the feature matrix X from the eight selected process columns and the
# target vector y from the 'silica_concentrate' column, both as NumPy arrays.
logging.info("Getting X & y datasets")

feature_columns = [
    'ave_flot_air_flow',
    'ave_flot_level',
    'iron_feed',
    'starch_flow',
    'amina_flow',
    'ore_pulp_flow',
    'ore_pulp_pH',
    'ore_pulp_density',
]

X = df[feature_columns].to_numpy()
y = df['silica_concentrate'].to_numpy()

INFO:root:Getting X & y datasets


In [10]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible from one run to the next.
logging.info("Splitting into a train & test set")
split_arrays = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_arrays

INFO:root:Splitting into a train & test set


In [19]:
# Log the dimensions of the four subsets as a quick sanity check on the split.
subset_shapes = [subset.shape for subset in (X_train, X_test, y_train, y_test)]
logging.info("Shapes: X_train=%s X_test=%s y_train=%s y_test=%s", *subset_shapes)

INFO:root:Shapes: X_train=(1453, 8) X_test=(364, 8) y_train=(1453,) y_test=(364,)


In [21]:
# Persist the four subsets as .npy files (np.save appends the extension).
# np.save does not create missing directories, so make sure the output folder
# exists first; otherwise the save fails with FileNotFoundError on a fresh
# checkout where data/processed_data has not been created yet.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

logging.info("Saving the 4 subsets into : '%s'", OUTPUT_FOLDER)
for subset_name, subset in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    np.save(OUTPUT_FOLDER / subset_name, subset)

INFO:root:Saving the 4 subsets into : 'data/processed_data'


## Data Normalization

Normalisation des données. 

Comme vous pouvez le noter, les données sont à des échelles très variées, donc une normalisation est nécessaire.

 Vous pouvez utiliser des fonctions pré-existantes pour la construction de ce script. 
 
 En sortie, ce script créera deux nouveaux datasets : (X_train_scaled, X_test_scaled) que vous sauvegarderez également dans data/processed.

In [109]:
# The normalization step reads the split arrays from, and writes the scaled
# arrays to, the same folder.
INPUT_FOLDER = Path("data/processed_data")
OUTPUT_FOLDER = Path("data/processed_data")

In [112]:
# Reload the four subsets from disk so this step depends only on the saved
# files, then log their shapes.
logging.info("Loading the 4 subsets from : '%s'", INPUT_FOLDER)

X_train, X_test, y_train, y_test = (
    np.load(INPUT_FOLDER / f"{subset_name}.npy")
    for subset_name in ("X_train", "X_test", "y_train", "y_test")
)

logging.info(
    "Shapes: X_train=%s X_test=%s y_train=%s y_test=%s",
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape,
)

INFO:root:Loading the 4 subsets from : 'data/processed_data'
INFO:root:Shapes: X_train=(1453, 8) X_test=(364, 8) y_train=(1453,) y_test=(364,)


In [113]:
# Standardize the features (zero mean, unit variance per column).
#
# The scaler is fitted on the training set ONLY. Fitting it on train + test
# stacked together would let test-set statistics (mean / std) leak into the
# preprocessing, which makes the later evaluation on X_test optimistic.
# The statistics learned from X_train are then applied to both subsets.
logging.info("Fitting Standard Scaler & transforming X_train & X_test ")

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

INFO:root:Fitting Standard Scaler & transforming X_train & X_test 


In [114]:
# Write both scaled feature matrices to the output folder as .npy files
# (np.save appends the extension).
logging.info("Saving X_train_scaled & X_test_scaled into : '%s'", OUTPUT_FOLDER)
scaled_subsets = {
    "X_train_scaled": X_train_scaled,
    "X_test_scaled": X_test_scaled,
}
for file_stem, scaled_features in scaled_subsets.items():
    np.save(OUTPUT_FOLDER / file_stem, scaled_features)

INFO:root:Saving X_train_scaled & X_test_scaled into : 'data/processed_data'


## Grid Search

GridSearch des meilleurs paramètres à utiliser pour la modélisation.

Vous déciderez du modèle de régression à implémenter et des paramètres à tester.

À l'issue de ce script vous aurez les meilleurs paramètres sous forme de fichier .pkl que vous sauvegarderez dans le dossier models.

In [115]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [128]:
# Folder the grid-search step loads its input arrays from.
INPUT_FOLDER = Path("data/processed_data")

# Wider search space, currently disabled (kept for reference):
# PARAM_GRID = {
#     'n_estimators': [100, 200, 300, 400],
#     'max_depth': [None, 10, 20, 30, 40],
# }
# Active grid: a single candidate, so the search only cross-validates
# n_estimators=300 rather than comparing alternatives.
PARAM_GRID = {
    'n_estimators': [300],
}

# Pickle file that receives the best hyper-parameters found by the search.
OUTPUT_PARAMETERS_FILE = Path("models/best_parameters.pkl")

In [118]:
# Load the scaled feature matrices and the targets saved by the previous
# steps, then log their shapes.
logging.info("Loading the 4 subsets from : '%s'", INPUT_FOLDER)

X_train_scaled, X_test_scaled, y_train, y_test = (
    np.load(INPUT_FOLDER / f"{subset_name}.npy")
    for subset_name in ("X_train_scaled", "X_test_scaled", "y_train", "y_test")
)

logging.info(
    "Shapes: X_train_scaled=%s X_test_scaled=%s y_train=%s y_test=%s",
    X_train_scaled.shape,
    X_test_scaled.shape,
    y_train.shape,
    y_test.shape,
)

INFO:root:Loading the 4 subsets from : 'data/processed_data'
INFO:root:Shapes: X_train_scaled=(1453, 8) X_test_scaled=(364, 8) y_train=(1453,) y_test=(364,)


In [106]:
# Cross-validated search over PARAM_GRID for a random forest regressor.
# Candidates are ranked by negative mean squared error (higher is better).
logging.info("RandomForestRegression model initialisation")
rf_reg = RandomForestRegressor(random_state=42)

logging.info("Grid Searching over hyper-parameters")
search_options = {
    "scoring": 'neg_mean_squared_error',
    "verbose": 2,
}
grid_search = GridSearchCV(rf_reg, PARAM_GRID, **search_options)

# Run the search
grid_search.fit(X_train_scaled, y_train)

INFO:root:RandomForestRegression model initialisation
INFO:root:Grid Searching over hyper-parameters


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................n_estimators=300; total time=   4.6s
[CV] END ...................................n_estimators=300; total time=   4.4s
[CV] END ...................................n_estimators=300; total time=   4.6s
[CV] END ...................................n_estimators=300; total time=   4.7s
[CV] END ...................................n_estimators=300; total time=   5.2s


In [None]:
# Persist the best hyper-parameters found by the grid search as a pickle.
# The log message reports OUTPUT_PARAMETERS_FILE, i.e. the very path that is
# written below (no other output-file name is defined in this notebook).
# The parent folder is created if needed because open(..., "wb") does not
# create missing directories.
OUTPUT_PARAMETERS_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.info(
    "Best parameters: %s., Saving them to '%s'",
    grid_search.best_params_,
    OUTPUT_PARAMETERS_FILE,
)
with open(OUTPUT_PARAMETERS_FILE, "wb") as f:
    pickle.dump(grid_search.best_params_, f)

INFO:root:Best parameters: {'n_estimators': 300}., Saving them to 'models/best_parameters.pkl'


## Training

Entraînement du modèle. En utilisant les paramètres retrouvés à travers le GridSearch, on entraînera le modèle en sauvegardant le modèle entraîné dans le dossier models.

In [168]:
# Pickle file holding the hyper-parameters to train with.
INPUT_PARAMETERS_FILE = Path("models/best_parameters.pkl")
# Folder containing the processed training arrays.
INPUT_DATA_FOLDER = Path("data/processed_data")
# Destination of the serialized, fitted regressor.
OUTPUT_MODEL_FILE = Path("models/regressor_model.pkl")

In [172]:
# Load the scaled training features and the training targets.
# Both files are read from INPUT_DATA_FOLDER, the folder this step declares
# and reports in its log message, so the cell does not depend on a path
# variable left over from an earlier section of the notebook.
logging.info("Loading X_train_scaled & y_train from '%s'", INPUT_DATA_FOLDER)
X_train_scaled = np.load(INPUT_DATA_FOLDER / "X_train_scaled.npy")
y_train = np.load(INPUT_DATA_FOLDER / "y_train.npy")

INFO:root:Loading X_train_scaled & y_train from 'data/processed_data'


In [150]:
# Read back the hyper-parameters stored in the parameters pickle and log them.
best_params = pickle.loads(INPUT_PARAMETERS_FILE.read_bytes())

logging.info("Loaded from '%s' the parameters: %s", INPUT_PARAMETERS_FILE, best_params)

INFO:root:Loaded from 'models/best_parameters.pkl' the parameters: {'n_estimators': 300}


In [151]:
import joblib

In [152]:
# Fit a random forest regressor configured with the loaded hyper-parameters;
# the fixed seed keeps training reproducible.
logging.info("Training Random Forest Regressor with best parameters")

rf_reg = RandomForestRegressor(random_state=42, **best_params)

rf_reg.fit(X=X_train_scaled, y=y_train)

INFO:root:Training Random Forest Regressor with best parameters


In [161]:
# Serialize the fitted regressor to OUTPUT_MODEL_FILE with joblib.
# The trailing ';' suppresses the display of joblib.dump's return value.
logging.info("Saving regressor to '%s'", OUTPUT_MODEL_FILE)
joblib.dump(rf_reg, OUTPUT_MODEL_FILE);

INFO:root:Saving regressor to 'models/regressor_model.pkl'


## Evaluation

Évaluation du modèle. Finalement, en utilisant le modèle entraîné, on évaluera ses performances et on fera des prédictions avec ce modèle, de sorte qu'à la fin de ce script on aura un nouveau dataset dans data qui contiendra les prédictions ainsi qu'un fichier scores.json dans le dossier metrics qui récupérera les métriques d'évaluation de notre modèle (i.e. mse, r2, etc.).

In [193]:
# Folder containing the processed test arrays.
INPUT_DATA_FOLDER = Path("data/processed_data")
# Serialized regressor to evaluate.
INPUT_MODEL_FILE = Path("models/regressor_model.pkl")

# NOTE(review): presumably the destination for the predictions dataset
# described in this section; no visible cell references it -- confirm.
OUTPUT_DATA_FOLDER = Path("data/processed_data")
# JSON file that receives the evaluation metrics.
OUTPUT_METRIC_FILE = Path("metrics/scores.json")

In [176]:
# Load the scaled test features and the test targets.
# Both files are read from INPUT_DATA_FOLDER, the folder this step declares
# and reports in its log message, so features and targets are guaranteed to
# come from the same location.
logging.info("Loading X_test_scaled & y_test from '%s'", INPUT_DATA_FOLDER)

X_test_scaled = np.load(INPUT_DATA_FOLDER / "X_test_scaled.npy")
y_test = np.load(INPUT_DATA_FOLDER / "y_test.npy")

INFO:root:Loading X_test_scaled & y_test from 'data/processed_data'


In [177]:
# Deserialize the regressor stored at INPUT_MODEL_FILE with joblib.
logging.info("Loading regressor from '%s'", INPUT_MODEL_FILE)
rf_reg = joblib.load(INPUT_MODEL_FILE)

INFO:root:Loading regressor from 'models/regressor_model.pkl'


In [178]:
# Predict the target for every row of the scaled test set.
logging.info("Predicting on X_test_scaled")
y_test_pred = rf_reg.predict(X_test_scaled)

# Persist the predictions: the evaluation step is expected to leave a
# predictions dataset in the data folder, and OUTPUT_DATA_FOLDER is declared
# for that purpose. np.save appends the .npy extension; the folder is created
# first because np.save does not create missing directories.
OUTPUT_DATA_FOLDER.mkdir(parents=True, exist_ok=True)
logging.info("Saving predictions into : '%s'", OUTPUT_DATA_FOLDER)
np.save(OUTPUT_DATA_FOLDER / "y_test_pred", y_test_pred)

INFO:root:Predicting on X_test_scaled


In [179]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [187]:
# Score the predictions against the held-out targets:
# mean absolute error, mean squared error and coefficient of determination.
logging.info("Calculating various metrics")
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

# Lazy %-style arguments, like every other logging call in this notebook;
# the emitted message is the same as with eager string formatting.
logging.info("MAE=%.3f, MSE=%.3f, R²=%.3f", mae, mse, r2)

INFO:root:Calculating various metrics
INFO:root:MAE=0.680, MSE=0.779, R²=0.222


In [188]:
import json

In [195]:
# Write the evaluation metrics to OUTPUT_METRIC_FILE as indented JSON.
# - The parent folder is created if needed: write_text does not create
#   missing directories.
# - Values are cast to plain Python floats so serialization does not depend
#   on the exact scalar type returned by the metric functions.
# - The encoding is explicit so the file does not depend on the platform
#   default.
# - The trailing ';' hides the character count returned by write_text.
OUTPUT_METRIC_FILE.parent.mkdir(parents=True, exist_ok=True)

scores = {
    "mae": float(mae),
    "mse": float(mse),
    "r2": float(r2),
}
OUTPUT_METRIC_FILE.write_text(json.dumps(scores, indent=4), encoding="utf-8");

94