In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [10]:
melbourne_file_path = 'data/melbourne-housing-snapshot/melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)

# simple DecisionTreeRegressor model creation

In [11]:
# drops missing values
melbourne_data = melbourne_data.dropna(axis=0)

# target variable y
y = melbourne_data.Price

# features selected for judgement
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]

# validation

In [12]:
X, y = X.values, y.values
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 42, shuffle=True, test_size=0.10)

In [13]:
def training_crossval_models(X,Y):
    best_models = []
    moyenne = 0
    liste_predictions = []
    liste_indexes = []
    for i in tqdm(range(100)):
        y_preds_class = None
        
        kf = KFold(n_splits=6, shuffle=True, random_state=np.random.randint(9,999))
        for j, (train_index, test_index) in enumerate(kf.split(X)):
            X_train, X_val = X[train_index], X[test_index]
            Y_train, Y_val = Y[train_index], Y[test_index]
            model = RandomForestRegressor(random_state=42)
            model.fit(X_train, Y_train)
            predictions = model.predict(X_val)
            liste_predictions.append(predictions)
            liste_indexes.append(test_index)
    return liste_predictions, liste_indexes


def correction_data(X, Y, liste_predictions, liste_indexes, topk=10):
    dico = {}
    for i in range(len(liste_predictions)):
        liste_prediction = liste_predictions[i]
        liste_index = liste_indexes[i]
        for j in range(len(liste_index)):
            if liste_index[j] not in dico:
                dico[liste_index[j]] = [liste_prediction[j]]
            else:
                dico[liste_index[j]] += [liste_prediction[j]]

    entropy = {}
    values_mean = {}
    for i in dico:
        liste = []
        for value_prediction in dico[i]:
            liste.append((value_prediction-Y[i])**2)
        liste = np.array(liste)
        entropy_ = liste
        entropy[i] = np.log(np.mean(entropy_))- 2*np.std(dico[i])
        values_mean[i] = np.mean(dico[i])
    liste = list(entropy.items())
    keys, values = [], []
    for i,j in liste:
        keys.append(i)
        values.append(j)
        
    indexes_sorted = np.argsort(values)[::-1][:topk]
    keys = np.array(keys)[indexes_sorted]
    values = np.array(values)[indexes_sorted]
    for i,key_index in enumerate(keys):
        Y[key_index] = values_mean[key_index]
        
    return Y

In [None]:
for i in range(150):
    model = RandomForestRegressor(random_state=42)
    model.fit(train_X, train_y)
    val_predictions = model.predict(val_X)
    val_mae = np.sqrt(mean_absolute_error(val_y, val_predictions))
    print("Validation {} MAE for best value of max_leaf_nodes: {:,.5f}".format(i, val_mae))
    
    liste_predictions, liste_indexes = training_crossval_models(train_X, train_y)
    train_y = correction_data(train_X,train_y, liste_predictions, liste_indexes)

# log(x) -2*std HAUT POTENTIEL (sans overflow)

Validation 0 MAE for best value of max_leaf_nodes: 418.03501


 23%|██▎       | 23/100 [03:18<11:50,  9.22s/it]

In [None]:
# refaire la formule basé sur l'AIC et la BIC
# https://fr.wikipedia.org/wiki/Critère_d'information_d'Akaike

In [None]:
log(x)-std est la transformation sans overflow de x/exp(std) 
plus std est petit et plus x est grand plus il doit etre corrigé
TESTER log(x)-std**2 (parce que x au carré)
trouver la formule avec brice

In [None]:
for i in range(150):
    model = RandomForestRegressor(random_state=42)
    model.fit(train_X, train_y)
    val_predictions = model.predict(val_X)
    val_mae = np.sqrt(mean_absolute_error(val_y,val_predictions))
    print("Validation {} MAE for best value of max_leaf_nodes: {:,.5f}".format(i, val_mae))
    
    liste_predictions, liste_indexes = training_crossval_models(train_X, train_y)
    train_y = correction_data(train_X,train_y, liste_predictions, liste_indexes)
    
# x/exp(std) NON

In [None]:
for i in range(150):
    model = RandomForestRegressor(random_state=42)
    model.fit(train_X, train_y)
    val_predictions = model.predict(val_X)
    val_mae = np.sqrt(mean_absolute_error(val_y,val_predictions))
    print("Validation {} MAE for best value of max_leaf_nodes: {:,.5f}".format(i, val_mae))
    
    liste_predictions, liste_indexes = training_crossval_models(train_X, train_y)
    train_y = correction_data(train_X,train_y, liste_predictions, liste_indexes)
    
# x*exp(x)/std NON

In [None]:
for i in range(150):
    model = RandomForestRegressor(random_state=42)
    model.fit(train_X, train_y)
    val_predictions = model.predict(val_X)
    val_mae = np.sqrt(mean_absolute_error(val_y,val_predictions))
    print("Validation {} MAE for best value of max_leaf_nodes: {:,.5f}".format(i, val_mae))
    
    liste_predictions, liste_indexes = training_crossval_models(train_X, train_y)
    train_y = correction_data(train_X,train_y, liste_predictions, liste_indexes)

# exp(-x)/x/std NON

In [None]:
# laisser tourner 15 epoques pour voir ce que ça donne avec ça
# remettre std(y) car cest ce que lon cherche donc np.mean(MAE)/np.std(ŷ)
