## IMPORTING PACKAGES 

In [42]:
# Importing packages 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

## USING READ PICKLE 

In [2]:
# Load the tree DataFrame from a local pickle.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df_trees = pd.read_pickle('./df_trees.pkl')

## CREATING DATA SPLIT 

In [3]:
# Target column vs. feature matrix (every remaining column)
# NOTE(review): 'ANNEEDEPLANTATION' is presumably the planting year - confirm against the data dictionary
y = df_trees['ANNEEDEPLANTATION']
x = df_trees.drop(columns='ANNEEDEPLANTATION')

# Train/test split with the default proportions and a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
)

## PIPELINE 

In [4]:
# Column selectors resolved at fit time from the dtypes of the frame being fitted:
# numeric columns go to the numeric branch, object-dtype columns to the categorical branch.
# NOTE(review): columns stored as `category` or `string` dtype would match neither selector - confirm df_trees has none.
num_var = make_column_selector(dtype_include=np.number)
cat_var = make_column_selector(dtype_include=object)

In [5]:
# Numeric branch: impute with the most frequent value, then standardise
num_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    StandardScaler(),
)

# Categorical branch: same imputation, then one-hot encode
# (categories not seen during fit are ignored at transform time)
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

In [6]:
# Combined preprocessing step: route numeric columns through num_pipeline
# and object-dtype columns through cat_pipeline.
preprocess = make_column_transformer((num_pipeline, num_var),
                                    (cat_pipeline, cat_var))

In [8]:
# lin_model.named_steps['columntransformer'].transformers_[1][1].named_steps['onehotencoder'].categories_


## TESTING NAIF_MODEL (naive mean baseline)

In [9]:
# Naive baseline: a single constant prediction equal to the mean of the training target
naif_model = y_train.mean()

In [21]:
# computing MSE of naif model 
def compute_MSE(y, y_predict):
    """Return the mean squared error between observations and predictions.

    Parameters
    ----------
    y : array-like of shape (n,)
        Observed target values (list, NumPy array or pandas Series).
    y_predict : scalar or array-like of shape (n,)
        Either one constant prediction applied to every observation (as for
        the naive mean baseline) or one prediction per observation.

    Returns
    -------
    float
        Mean of the squared differences ``(y - y_predict) ** 2``.

    Raises
    ------
    ValueError
        If ``y`` is empty, or if ``y_predict`` is not a scalar and its shape
        differs from the shape of ``y``.
    """
    # np.asarray drops any pandas index, so values are compared by position.
    y_true = np.asarray(y, dtype=float)
    if y_true.size == 0:
        raise ValueError('compute_MSE requires at least one observation')

    y_hat = np.asarray(y_predict, dtype=float)
    # A 0-d array is a scalar prediction and broadcasts against every observation;
    # anything else must line up element by element.
    if y_hat.ndim != 0 and y_hat.shape != y_true.shape:
        raise ValueError(
            f'y_predict has shape {y_hat.shape}, expected a scalar or shape {y_true.shape}'
        )

    return float(np.mean((y_true - y_hat) ** 2))

# Score the constant mean prediction on the test split; this is the baseline every model is compared to
MSE_naif_model = compute_MSE(y_test, naif_model)
print(f'mean MSE = {MSE_naif_model}')

mean MSE = 321.4670713373224


## TESTING LIN_MODEL

In [43]:
# Cross-validating lin_model on the training split (overfitting check)

# Creating
lin_model = make_pipeline(preprocess, LinearRegression())

# One 5-fold pass scoring both metrics, so each fold is fitted once instead of twice
lin_cv_results = cross_validate(lin_model, x_train, y_train, cv=5,
                                scoring=('neg_mean_squared_error', 'r2'))
lin_cv_scores_train = lin_cv_results['test_neg_mean_squared_error']
lin_cv_scores_R2_train = lin_cv_results['test_r2']

# The MSE scorer is negated by scikit-learn, hence the sign flip before reporting
print(f'naif model MSE = {MSE_naif_model}')
print('lin_model train mean MSE:',np.mean(-lin_cv_scores_train))
print('lin_model train std MSE:',np.std(-lin_cv_scores_train))
print('lin_model train mean R²:',np.mean(lin_cv_scores_R2_train))
print('lin_model train std R²:',np.std(lin_cv_scores_R2_train))

naif model MSE = 321.4670713373224
lin_model train mean MSE: 162.58526634313242
lin_model train std MSE: 4.693593014502271
lin_model train mean R²: 0.4899604137256414
lin_model train std R²: 0.009223693810323792


In [41]:
# Fit lin_model on the training split, then measure it on train and test

# Fitting
lin_model.fit(x_train, y_train)

# Train-split metrics
y_pred_train_lin = lin_model.predict(x_train)
lin_mse_train = mean_squared_error(y_train, y_pred_train_lin)
lin_r2_train = lin_model.score(x_train, y_train)

# Test-split metrics
y_pred_test_lin = lin_model.predict(x_test)
lin_mse_test = mean_squared_error(y_test, y_pred_test_lin)
lin_r2_test = lin_model.score(x_test, y_test)

# Report next to the naive baseline
print(
    f'naif model MSE = {MSE_naif_model}',
    f'lin_model train MSE : {lin_mse_train}',
    f'lin_model train R² : {lin_r2_train}',
    f'lin_model test MSE : {lin_mse_test}',
    f'lin_model test R² : {lin_r2_test}',
    sep='\n',
)

naif model MSE = 321.4670713373224
lin_model train MSE : 156.89878787773398
lin_model train R² : 0.5077390743965187
lin_model test MSE : 163.84672887055598
lin_model test R² : 0.49031431942549275


## TESTING SVR_MODEL

In [38]:
# Creating SVR_model and cross-validating it on the training split (overfitting check)

# Creating
SVR_model = make_pipeline(preprocess, SVR())

# One 5-fold pass scoring both metrics, so each (slow) SVR fold is fitted once instead of twice
SVR_cv_results = cross_validate(SVR_model, x_train, y_train, cv=5,
                                scoring=('neg_mean_squared_error', 'r2'))
SVR_cv_scores_train = SVR_cv_results['test_neg_mean_squared_error']
SVR_cv_scores_R2_train = SVR_cv_results['test_r2']

# The MSE scorer is negated by scikit-learn, hence the sign flip before reporting
print(f'naif model MSE = {MSE_naif_model}')
print('SVR_model train mean MSE:',np.mean(-SVR_cv_scores_train))
print('SVR_model train std MSE:',np.std(-SVR_cv_scores_train))
print('SVR_model train mean R²:',np.mean(SVR_cv_scores_R2_train))
print('SVR_model train std R²:',np.std(SVR_cv_scores_R2_train))

naif model MSE = 321.4670713373224
SVR_model train mean MSE: 155.5955785609023
SVR_model train std MSE: 5.625580288666737
SVR_model train mean R²: 0.5118950437257619
SVR_model train std R²: 0.013632678461715661


In [39]:
# Fit SVR_model on the training split, then measure it on train and test

# Fitting
SVR_model.fit(x_train, y_train)

# Train-split metrics
y_pred_train_SVR = SVR_model.predict(x_train)
SVR_mse_train = mean_squared_error(y_train, y_pred_train_SVR)
SVR_r2_train = SVR_model.score(x_train, y_train)

# Test-split metrics
y_pred_test_SVR = SVR_model.predict(x_test)
SVR_mse_test = mean_squared_error(y_test, y_pred_test_SVR)
SVR_r2_test = SVR_model.score(x_test, y_test)

# Report next to the naive baseline
print(
    f'naif model MSE = {MSE_naif_model}',
    f'SVR_model train MSE : {SVR_mse_train}',
    f'SVR_model train R² : {SVR_r2_train}',
    f'SVR_model test MSE : {SVR_mse_test}',
    f'SVR_model test R² : {SVR_r2_test}',
    sep='\n',
)

naif model MSE = 321.4670713373224
SVR_model train MSE : 145.97638236194607
SVR_model train R² : 0.5420074936860813
SVR_model test MSE : 153.445155778634
SVR_model test R² : 0.5226709791949418


## TESTING TREE_MODEL

In [36]:
# Creating tree_model and cross-validating it on the training split (overfitting check)

# Creating (fixed seed so tie-breaking between equally good splits is reproducible)
tree_model = make_pipeline(preprocess, tree.DecisionTreeRegressor(random_state=0))

# One 5-fold pass scoring both metrics, so MSE and R² describe the same fitted trees
tree_cv_results = cross_validate(tree_model, x_train, y_train, cv=5,
                                 scoring=('neg_mean_squared_error', 'r2'))
tree_cv_scores_train = tree_cv_results['test_neg_mean_squared_error']
tree_cv_scores_R2_train = tree_cv_results['test_r2']

# The MSE scorer is negated by scikit-learn, hence the sign flip before reporting
print(f'naif model MSE = {MSE_naif_model}')
print('tree_model train mean MSE:',np.mean(-tree_cv_scores_train))
print('tree_model train std MSE:',np.std(-tree_cv_scores_train))
print('tree_model train mean R²:',np.mean(tree_cv_scores_R2_train))
print('tree_model train std R²:',np.std(tree_cv_scores_R2_train))

naif model MSE = 321.4670713373224
tree_model train mean MSE: 76.62608424456337
tree_model train std MSE: 5.526464114823786
tree_model train mean R²: 0.7575177242332058
tree_model train std R²: 0.015454159856259241


In [37]:
# Fit tree_model on the training split, then measure it on train and test

# Fitting
tree_model.fit(x_train, y_train)

# Train-split metrics
y_pred_train_tree = tree_model.predict(x_train)
tree_mse_train = mean_squared_error(y_train, y_pred_train_tree)
tree_r2_train = tree_model.score(x_train, y_train)

# Test-split metrics
y_pred_test_tree = tree_model.predict(x_test)
tree_mse_test = mean_squared_error(y_test, y_pred_test_tree)
tree_r2_test = tree_model.score(x_test, y_test)

# Report next to the naive baseline
print(
    f'naif model MSE = {MSE_naif_model}',
    f'tree_model train MSE : {tree_mse_train}',
    f'tree_model train R² : {tree_r2_train}',
    f'tree_model test MSE : {tree_mse_test}',
    f'tree_model test R² : {tree_r2_test}',
    sep='\n',
)

naif model MSE = 321.4670713373224
tree_model train MSE : 0.0
tree_model train R² : 1.0
tree_model test MSE : 73.25247851949769
tree_model test R² : 0.7721300899606238


## TESTING FOREST_MODEL

In [32]:
# Creating forest_model and cross-validating it on the training split (overfitting check)

# Creating (fixed seed so bootstrap samples and feature draws are reproducible)
forest_model = make_pipeline(preprocess,
                             RandomForestRegressor(n_estimators=10, random_state=0))

# One 5-fold pass scoring both metrics, so MSE and R² describe the same fitted forests
forest_cv_results = cross_validate(forest_model, x_train, y_train, cv=5,
                                   scoring=('neg_mean_squared_error', 'r2'))
forest_cv_scores_train = forest_cv_results['test_neg_mean_squared_error']
forest_cv_scores_R2_train = forest_cv_results['test_r2']

# The MSE scorer is negated by scikit-learn, hence the sign flip before reporting
print(f'naif model MSE = {MSE_naif_model}')
print('forest_model train mean MSE:',np.mean(-forest_cv_scores_train))
print('forest_model train std MSE:',np.std(-forest_cv_scores_train))
print('forest_model train mean R²:',np.mean(forest_cv_scores_R2_train))
print('forest_model train std R²:',np.std(forest_cv_scores_R2_train))

naif model MSE = 321.4670713373224
forest_model train mean MSE: 52.51664284142011
forest_model train std MSE: 4.880023098675987
forest_model train mean R²: 0.8340979899405413
forest_model train std R²: 0.01389320100372562


In [35]:
# Fit forest_model on the training split, then measure it on train and test

# Fitting
forest_model.fit(x_train, y_train)

# Train-split metrics
y_pred_train_forest = forest_model.predict(x_train)
forest_mse_train = mean_squared_error(y_train, y_pred_train_forest)
forest_r2_train = forest_model.score(x_train, y_train)

# Test-split metrics
y_pred_test_forest = forest_model.predict(x_test)
forest_mse_test = mean_squared_error(y_test, y_pred_test_forest)
forest_r2_test = forest_model.score(x_test, y_test)

# Report next to the naive baseline
print(
    f'naif model MSE = {MSE_naif_model}',
    f'forest_model train MSE : {forest_mse_train}',
    f'forest_model train R² : {forest_r2_train}',
    f'forest_model test MSE : {forest_mse_test}',
    f'forest_model test R² : {forest_r2_test}',
    sep='\n',
)

naif model MSE = 321.4670713373224
forest_model train MSE : 8.6451764861411
forest_model train R² : 0.9728762558548918
forest_model test MSE : 47.146270984798406
forest_model test R² : 0.8533398904019537


In [45]:
# Verifying the pipeline step names; they are the prefixes used for the grid-search parameter keys
forest_model.named_steps.keys()

dict_keys(['columntransformer', 'randomforestregressor'])

## GRIDSEARCH RANDOM FOREST  

In [47]:
# Base pipeline to tune; fixed seed so the search is reproducible
model = make_pipeline(preprocess, RandomForestRegressor(random_state=0))

# Hyper-parameter grid (keys are '<pipeline step name>__<parameter name>')
params = {
    'randomforestregressor__n_estimators': [75, 100, 125],
    'randomforestregressor__min_samples_split': [2, 3, 4],
    'randomforestregressor__min_samples_leaf': [1, 2, 3, 4],
}

# 5-fold grid search over every combination, using all available cores;
# no `scoring` is passed, so the pipeline's own score method ranks candidates
grid = GridSearchCV(model, param_grid=params, cv=5, n_jobs=-1)

# Model fitting
grid.fit(x_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f6153826f40>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('simpleimputer...
                     

In [50]:
# Model testing: score the grid search on the held-out test split
# (no `scoring` was passed to GridSearchCV, so this is the regressor's default score, i.e. R²)
grid.score(x_test, y_test)

0.8685407677601558

In [52]:
# Display the pipeline configured with the best parameter combination found by the search
grid.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f614b10c6d0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                       