# Prédiction

## Encodage

- Utilisation du OneHotEncoding
- Scaling de nos données continues

## Sélection


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Never truncate cell contents when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)

# Load the dataset from cleaned.csv and list the available columns.
data = pd.read_csv("cleaned.csv")
data.columns

# Itération 1


In [None]:
def get_df(data, drop=None):
    """Return a new frame holding ``data`` without the ``drop`` columns.

    Args:
        data: Source DataFrame; it is left untouched.
        drop: Column label(s) to remove. When omitted, the secondary and
            tertiary property-use columns and ``ENERGYSTARScore`` are removed.

    Returns:
        A new DataFrame without the requested columns.
    """
    if drop is None:
        # Built on every call so the default can never be shared or mutated
        # between calls (it used to be a mutable default argument).
        drop = [
            "SecondLargestPropertyUseType",
            "SecondLargestPropertyUseTypeGFA",
            "ThirdLargestPropertyUseType",
            "ThirdLargestPropertyUseTypeGFA",
            "ENERGYSTARScore",
        ]
    # DataFrame.drop already returns a new object, no explicit copy needed.
    return data.drop(columns=drop)


# Working frame for iteration 1, built with get_df's default drop list.
df = get_df(data)


In [None]:
# Number of columns per dtype.
df.dtypes.value_counts()


In [None]:
# Preview of the object-dtype columns.
df.select_dtypes("object").head()


In [None]:
# Preview of the numeric columns.
df.select_dtypes(np.number).head()


In [None]:
from sklearn.preprocessing import OneHotEncoder


def encoding(df, cols):
    """One-hot encode every column of ``cols`` and replace it by its indicators.

    Each column gets its own ``OneHotEncoder``. The indicator columns are
    named ``<column>_<category>`` so that two encoded columns sharing a
    category value cannot produce the same column name.

    Args:
        df: Input DataFrame (not modified).
        cols: Iterable of column names to encode.

    Returns:
        A ``(encoded_df, ohes)`` tuple where ``ohes`` is the list of fitted
        encoders, in the order of ``cols``.
    """
    ohes = []
    for col in cols:
        values = df[col].to_numpy().reshape(-1, 1)
        ohe = OneHotEncoder()
        # Densify by hand rather than through the constructor: the keyword
        # doing it was renamed from `sparse` to `sparse_output` across
        # scikit-learn releases, while the default sparse output is stable.
        t = ohe.fit_transform(values).toarray()
        ohes.append(ohe)
        # Reuse df's index so concat aligns rows even on a non-default index.
        df_t = pd.DataFrame(
            t,
            columns=ohe.get_feature_names_out([col]),
            index=df.index,
        )
        df = pd.concat([df.drop(columns=col), df_t], axis=1)
    # Return every encoder, not just the last one fitted.
    return df, ohes


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


def preprocessing(df, targets, encode_cols, test_size=0.3, random_state=None):
    """Encode, split and scale the data once per target.

    Args:
        df: Input DataFrame containing features and targets.
        targets: Target column names; all of them are removed from the features.
        encode_cols: Columns to one-hot encode.
        test_size: Fraction of rows kept for the test split.
        random_state: Seed forwarded to ``train_test_split`` (``None`` keeps
            the split random, as before).

    Returns:
        A list with one ``[X_train, X_test, y_train, y_test]`` entry per
        target, in the order of ``targets``.
    """
    # The columns to scale must be picked BEFORE encoding: afterwards the
    # encoded columns are gone from the frame, so excluding them by name would
    # exclude nothing and the 0/1 indicators would be standardised too.
    excluded = list(encode_cols) + list(targets)
    scaled_cols = df.columns[~df.columns.isin(excluded)]

    df, _ = encoding(df, encode_cols)
    X = df.drop(columns=targets)

    target_splits = []
    for target in targets:
        y = df[target].values.ravel()

        print(f"Target {target} with {X.shape[1]} features ({y.shape[0]} lines)")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        # Own the split frames before writing the scaled values into them.
        X_train, X_test = X_train.copy(), X_test.copy()

        if len(scaled_cols):
            # Fit on the training part only, then apply to the test part.
            sc = StandardScaler()
            X_train[scaled_cols] = sc.fit_transform(X_train[scaled_cols])
            X_test[scaled_cols] = sc.transform(X_test[scaled_cols])

        target_splits.append([X_train, X_test, y_train, y_test])

    return target_splits

In [None]:
# Regression targets; preprocessing() returns one split per target, in this order.
targets = ["TotalGHGEmissions", "SiteEnergyUse(kBtu)"]
# Columns that are one-hot encoded instead of being scaled.
ohe_cols = [
    "BuildingType",
    "PrimaryPropertyType",
    "Neighborhood",
    "LargestPropertyUseType",
    "YearBuiltRange",
    "ZipCode",
    "CouncilDistrictCode",
    "NumberofBuildings",
    "NumberofFloors",
]

emissionTTS, energyTTS = preprocessing(df, targets, ohe_cols)

# Each split is [X_train, X_test, y_train, y_test]; print the shape of every part.
for v in emissionTTS:
    print(v.shape)

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

# Trivial estimator evaluated first, before any real model.
model = DummyRegressor()

def evaluation(model, tts, verbose=0):
    """Cross-validate ``model`` on the training split, then score it on the test split.

    The model is fitted on the full training split as a side effect.

    Args:
        model: Scikit-learn style regressor.
        tts: ``[X_train, X_test, y_train, y_test]``.
        verbose: When truthy, print the validation and test metrics.

    Returns:
        ``(val_r2, test_r2)``: the mean 5-fold cross-validated R2 on the
        training split and the R2 on the test split.
    """
    X_train, X_test, y_train, y_test = tts

    r2 = cross_val_score(model, X_train, y_train, cv=5, scoring="r2").mean()

    if verbose:
        print(" "*7 + " Validation set metrics")
        print(f"{'R2': <6} : {r2:.4f}")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    test_r2 = r2_score(y_test, y_pred)
    if verbose:
        # Take the square root explicitly: the `squared=False` keyword of
        # mean_squared_error no longer exists in recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        # NOTE: the value printed under "MAE" is the *median* absolute error.
        medae = median_absolute_error(y_test, y_pred)
        print(" "*7 + " Test set metrics")
        print(f"{'RMSE': <6} : {rmse:.2f}")
        print(f"{'MAE': <6} : {medae:.2f}")
        print(f"{'R2': <6} : {test_r2:.4f}")
    return r2, test_r2

# Metrics of the dummy model on the emissions split; the trailing semicolon
# keeps the notebook from echoing the returned tuple.
evaluation(model, emissionTTS, verbose=1);

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    BaggingRegressor,
)
from sklearn.linear_model import RidgeCV, LassoCV, Perceptron
from tqdm import tqdm

# Seed passed as random_state to estimators created in the cells below.
random_state = 1

def compare_models(tts):
    """Evaluate a fixed panel of regressors on one train/test split.

    Args:
        tts: ``[X_train, X_test, y_train, y_test]``.

    Returns:
        DataFrame with one row per model and the columns ``name``,
        ``val_r2`` (mean cross-validated R2) and ``test_r2``.
    """
    models = [
        SVR(),
        RidgeCV(alphas=np.arange(0.1, 10, 0.05)),
        RandomForestRegressor(random_state=random_state),
        GradientBoostingRegressor(random_state=random_state),
        DecisionTreeRegressor(random_state=random_state),
        KNeighborsRegressor(),
        # Seeded like the other stochastic models so runs are comparable.
        BaggingRegressor(n_estimators=100, random_state=random_state),
        LassoCV(tol=0.01, random_state=random_state),
    ]

    rows = []
    for model in tqdm(models):
        val_r2, test_r2 = evaluation(model, tts)
        rows.append(
            {"name": type(model).__name__, "val_r2": val_r2, "test_r2": test_r2}
        )

    # Building the frame in one go gives the score columns a numeric dtype.
    return pd.DataFrame(rows, columns=["name", "val_r2", "test_r2"])

# Model comparison for the emissions target.
emission_metrics = compare_models(emissionTTS)
emission_metrics

In [None]:
# Model comparison for the energy target.
energy_metrics = compare_models(energyTTS)
energy_metrics

In [None]:
def show_model_perfs(target_vals):
    """Plot validation and test R2 of every model, one point series per target.

    Args:
        target_vals: Sequence of metric frames as returned by
            ``compare_models`` (columns ``name``, ``val_r2``, ``test_r2``),
            in the order of the module-level ``targets`` used for the legend.
    """
    _, axs = plt.subplots(1, 2, figsize=(16, 4))
    for ax, v_type in zip(axs, ["val_r2", "test_r2"]):
        for metrics in target_vals:
            sns.scatterplot(data=metrics, x="name", y=v_type, ax=ax)
        # Rotate the labels the axis already carries instead of overwriting
        # them with names taken from an unrelated module-level frame.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
        ax.set_title(f"Models {v_type} on targets")
        ax.legend(targets)
    plt.show()

# Compare both targets side by side; the order matches `targets`.
show_model_perfs([emission_metrics, energy_metrics])

In [None]:
# Display the emissions comparison table again.
emission_metrics

In [None]:
# Row of the model with the highest mean of validation and test R2.
# Only the two score columns are averaged: a row-wise mean over the whole
# frame would include the string "name" column, which raises TypeError on
# pandas >= 2.0.
emission_metrics.iloc[np.argmax(emission_metrics[["val_r2", "test_r2"]].mean(axis=1))]

In [None]:
# Model kept for each target; zip() below pairs them with `targets` by position.
target_models = [
    RandomForestRegressor(random_state=random_state),
    GradientBoostingRegressor(random_state=random_state)
]

# (target name, train/test split, model) triples.
target_tts_model = list(zip(targets, [emissionTTS, energyTTS], target_models))

for target, tts, model in target_tts_model:
    print("="*5 + f"> {target}")
    # Print the cross-validation and test-set metrics of the kept model.
    evaluation(model, tts, verbose=True);

## Grid Search


In [None]:
from sklearn.model_selection import GridSearchCV

def search_cv(model, tts, params):
    """Run a 5-fold grid search for ``model`` on the training part of ``tts``.

    Three scorers are tracked (R2, negated RMSE, negated median absolute
    error); the final estimator is refitted with the best R2 parameters.

    Args:
        model: Estimator to tune.
        tts: ``[X_train, X_test, y_train, y_test]``; only the training part is used.
        params: Parameter grid handed to ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    X_train, _X_test, y_train, _y_test = tts

    scorers = ["r2", "neg_root_mean_squared_error", "neg_median_absolute_error"]
    grid = GridSearchCV(
        model,
        param_grid=params,
        scoring=scorers,
        refit="r2",
        cv=5,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    return grid

# One parameter grid per entry of target_tts_model, in the same order
# (RandomForestRegressor first, GradientBoostingRegressor second).
param_grids = [
    {
        "criterion": ["squared_error", "absolute_error"],
        "n_estimators": range(50, 200, 50),
    },
    {
        "loss": ["squared_error", "absolute_error", "huber", "quantile"],
        # Spelled out on purpose: np.arange(0.05, 0.2, 0.05) produces these
        # same four values, but only because rounding lets the stop value
        # through, and it reports 0.15 as 0.15000000000000002.
        "learning_rate": [0.05, 0.1, 0.15, 0.2],
        "n_estimators": range(50, 200, 50),
    },
]

# Fitted grid searches, one per target, in the order of target_tts_model.
cvs = []

for (_, tts, model), params in zip(target_tts_model, param_grids):
    cvs.append(search_cv(model, tts, params))

In [None]:
def df_from_cv(cv_res) -> pd.DataFrame:
    """Summarise ``GridSearchCV.cv_results_`` as a compact, ranked table.

    Keeps the parameters plus the mean score and rank of each of the three
    scorers, renames them to short labels, turns the negated error scores
    back into positive errors and sorts the rows by R2 rank (best first).
    The "MAE" column holds the median absolute error.
    """
    short_names = {
        "params": "params",
        "mean_test_r2": "r2",
        "rank_test_r2": "rank_r2",
        "mean_test_neg_root_mean_squared_error": "RMSE",
        "rank_test_neg_root_mean_squared_error": "rank_RMSE",
        "mean_test_neg_median_absolute_error": "MAE",
        "rank_test_neg_median_absolute_error": "rank_MAE",
    }
    table = pd.DataFrame(cv_res)[list(short_names)].rename(columns=short_names)
    # The scorers are "neg_*": flip the sign to get plain error values.
    for error_col in ("RMSE", "MAE"):
        table[error_col] = -table[error_col]
    return table.sort_values(by="rank_r2")


In [None]:
# Best parameters and cross-validated R2 of every grid search.
for (target, _, model), cv in zip(target_tts_model, cvs):
    name = type(model).__name__
    print("=" * 10 + f" {target} - {name} " + "=" * 10)
    # df_from_cv sorts by R2 rank, so the first row is the best candidate;
    # taking it directly avoids comparing against a max() that is unreliable
    # when some candidates scored NaN.
    res = df_from_cv(cv.cv_results_).iloc[[0]]
    for c in ["params", "r2"]:
        print(f"{c:<20}", res[c].values[0])

In [None]:
# Refitted best estimator of each grid search, in the order of `cvs`.
best_models = [cv.best_estimator_ for cv in cvs]

for (target, tts, _), best_model in zip(target_tts_model, best_models):
    header = f"{target} - {type(best_model).__name__}"
    print("="*10, header)
    # Cross-validation and test metrics of the tuned model.
    evaluation(best_model, tts, verbose=1)

In [None]:
def errors(model, tts, mesure, scale=1000):
    """Print the mean absolute error of ``model`` on the test split.

    Args:
        model: Fitted estimator exposing ``predict``.
        tts: ``[X_train, X_test, y_train, y_test]``; only the test part is used.
        mesure: Unit label appended to the printed message.
        scale: Factor applied to the error to convert the target's unit into
            the unit named by ``mesure``. Defaults to 1000, the factor that
            used to be hard-coded for every target.
    """
    _, X_test, __, y_test = tts
    y_pred = model.predict(X_test)
    mean_abs_err = np.abs(y_pred - y_test).mean() * scale

    print(f"Model has errors around +-{mean_abs_err:.0f} {mesure}")


# (unit label, conversion factor) per target, in the order of target_tts_model.
# The energy target is already in kBtu, so it must not be multiplied by 1000.
# NOTE(review): the 1000 factor assumes TotalGHGEmissions is stored in metric
# tons - confirm against the dataset documentation.
report_units = [("kg of CO2 / year", 1000), ("kBtu / year", 1)]

for (target, tts, _), best_model, (mesure, scale) in zip(target_tts_model, best_models, report_units):
    print("="*5, target)
    errors(best_model, tts, mesure, scale)

Make a pipeline to do the same process again later

In [None]:
def model_selection_pipeline(df, targets, ohe_cols):
    """Preprocess the data, compare the model panel and pick the best model per target.

    Args:
        df: Input DataFrame containing features and targets.
        targets: Target column names.
        ohe_cols: Columns to one-hot encode.

    Returns:
        ``(best, ttss)`` where ``best`` has one row per target (the model with
        the highest mean of validation and test R2, plus a ``target`` column)
        and ``ttss`` is the list of train/test splits, one per target.
    """
    ttss = preprocessing(df, targets, ohe_cols)

    metrics_per_target = []
    for target, tts in zip(targets, ttss):
        print(f"Training for target {target}")
        metrics_per_target.append(compare_models(tts))

    show_model_perfs(metrics_per_target)

    # Average the score columns only: including the string "name" column in a
    # row-wise mean raises TypeError on pandas >= 2.0.
    score_cols = ["val_r2", "test_r2"]
    best_rows = [
        metrics.iloc[np.argmax(metrics[score_cols].mean(axis=1))]
        for metrics in metrics_per_target
    ]

    best = pd.DataFrame(best_rows).reset_index(drop=True)
    best["target"] = targets
    return best, ttss

# Run the whole selection through the pipeline with an explicit encode list;
# [0] keeps only the best-model summary and discards the splits.
model_selection_pipeline(df, targets, [
    "BuildingType",
    "PrimaryPropertyType",
    "Neighborhood",
    "LargestPropertyUseType",
    "YearBuiltRange",
    "ZipCode",
    "CouncilDistrictCode",
    "NumberofBuildings",
    "NumberofFloors",
])[0]

In [None]:
def search_cv_results(model, tts, params, target):
    """Grid-search ``model``, print the best parameters and R2, return the best estimator.

    Args:
        model: Estimator to tune.
        tts: ``[X_train, X_test, y_train, y_test]``.
        params: Parameter grid for the search.
        target: Target name, used only in the printed header.

    Returns:
        The refitted best estimator of the grid search.
    """
    cv = search_cv(model, tts, params)
    name = type(model).__name__
    print("=" * 10 + f" {target} - {name} " + "=" * 10)
    # df_from_cv sorts by R2 rank, so the first row is the best candidate;
    # taking it directly avoids comparing against a max() that is unreliable
    # when some candidates scored NaN.
    best = df_from_cv(cv.cv_results_).iloc[[0]]
    for c in ["params", "r2"]:
        print(f"{c:<20}", best[c].values[0])
    return cv.best_estimator_

# Itération 2

## Feature selection


In [None]:
# One horizontal bar chart of feature importances per target; the feature
# names are read from the emissions training frame for every chart.
feature_names = emissionTTS[0].columns
for idx, target in enumerate(targets):
    importances = best_models[idx].feature_importances_
    plt.figure(figsize=(10, 30))
    plt.barh(feature_names, importances)
    plt.title(f"Feature importance for {target}")
    plt.show()

Certaines variables n'apportent pas beaucoup à notre modèle. On peut ainsi effectuer une réduction de dimension, ce qui aura pour effet d'aider le modèle à apprendre sur des données plus cohérentes et également de réduire le temps d'entraînement.

In [None]:
# NOTE(review): this first list is overwritten by the next assignment and is
# never used - confirm which of the two drop lists is the intended one.
drop = ["Neighborhood", "CouncilDistrictCode", "ZipCode", "PrimaryPropertyType", "BuildingType", "Latitude", "Longitude", "GHGEmissionsIntensity","SecondLargestPropertyUseType", "SecondLargestPropertyUseTypeGFA", "ThirdLargestPropertyUseType", "ThirdLargestPropertyUseTypeGFA", "ENERGYSTARScore"]
# Effective drop list: only ENERGYSTARScore is removed from the data.
drop = ["ENERGYSTARScore"]
df = get_df(data, drop=drop)
df.head()

In [None]:
# Add the second/third use-type columns to the encode list; np.concatenate
# returns an ndarray.
# NOTE(review): running this cell again appends the two names a second time.
ohe_cols = np.concatenate([ohe_cols, ["SecondLargestPropertyUseType", "ThirdLargestPropertyUseType"]])

In [None]:
# Ensure ohe_cols is an ndarray so it can be filtered with a boolean mask.
ohe_cols = np.array(ohe_cols)

In [None]:
# Display the current list of columns to encode.
ohe_cols

In [None]:
# Run the selection pipeline on the new frame, encoding only the columns that
# were not dropped. `dfs` is the best-model summary (one row per target) and
# `ttss` holds the matching train/test splits.
dfs, ttss = model_selection_pipeline(df, targets, ohe_cols[~np.isin(ohe_cols, drop)])
dfs

GradientBoosting seems to be the way to go for both targets!

In [None]:
# Base estimator tuned for both targets.
model = GradientBoostingRegressor(random_state=0)
# Grid explored for each target: 3 losses x 2 tree counts (100, 200) x 2 criteria.
params = {
    "loss": ['squared_error', 'absolute_error', 'huber'],
    "n_estimators": range(100, 300, 100),
    "criterion": ['friedman_mse', 'squared_error']
}

# Best estimator per target, in the order of `targets`.
best_models = []
for tts, target in zip(ttss, targets):
    best_models.append(search_cv_results(model, tts, params, target))

In [None]:
# One feature-importance chart per target. Each model is paired with its own
# split, so the feature names no longer come from whatever a previous cell's
# loop happened to leave in a variable.
for target, split, fitted in zip(targets, ttss, best_models):
    plt.figure(figsize=(10, 30))
    # split[0] is X_train, whose columns are the features the model was fitted on.
    plt.barh(split[0].columns, fitted.feature_importances_)
    plt.title(f"Feature importance for {target}")
    plt.show()