In [1]:
# Importing packages 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

## USING READ PICKLE 

In [2]:
df_trees = pd.read_pickle('./df_trees.pkl')

## PREPARING DATA 

In [3]:
# Creating y and x 
y = df_trees['ANNEEDEPLANTATION']
x = df_trees.drop('ANNEEDEPLANTATION', axis=1)

# Creating training and testing dataset 
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0) 

## PIPELINE 

In [4]:
# Creating list of numerical and categorical values 
num_var = make_column_selector(dtype_include=np.number)
cat_var = make_column_selector(dtype_include=object)

In [5]:
# Creating mun et cat pipelines 
num_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), 
                             StandardScaler())

cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),
                            OneHotEncoder(handle_unknown='ignore'))

In [6]:
# Creating pipeline combining num et cat var
preprocess = make_column_transformer((num_pipeline, num_var),
                                    (cat_pipeline, cat_var))

In [7]:
# Creating model pipeline 
lin_model = make_pipeline(preprocess, LinearRegression())
# lin_model.fit(x, y)

In [8]:
# lin_model.named_steps['columntransformer'].transformers_[1][1].named_steps['onehotencoder'].categories_


## TESTING NAIF_MODELE

In [9]:
# Creating naif model 
naif_model = y_train.mean()

In [20]:
# computing MSE of naif model 
def compute_MSE(y, y_predict):
    mse = 0
    n = len(y)
    for i in (y):
        mse_calcule = (i - y_predict)**2
        mse = mse_calcule + mse
    return mse / n 

# print(f'naif_model MSE =
MSE_naif_model = compute_MSE(y_test, naif_model)
print(f'mean MSE = {MSE_naif_model}')

136.65283405203198
215.7919959081103
640.6031711603994
640.6031711603994
266.0206567286344
44.75423095856813
387.6905990015742
93.89339281464645
215.7919959081103
215.7919959081103
136.65283405203198
93.89339281464645
312.93115776418864
7.235348483797036
69.05842167817654
859.0842886856283
75.51367219595367
454.1220536351705
640.6031711603994
204.7800979660199
234.40037734732712
86.67870105948377
1575.2850113754296
215.7919959081103
21.99478972118258
114.27311343333922
32.37451033987536
93.89339281464645
93.89339281464645
372.88149487255606
609.589202095038
372.88149487255606
28.197583534254857
39.81786291556209
514.8297608576524
745.8437299230138
2.855627865104264
609.589202095038
1575.2850113754296
32.37451033987536
21.99478972118258
745.8437299230138
543.3626123977849
127.91925982209823
215.7919959081103
387.6905990015742
86.67870105948377
215.7919959081103
312.93115776418864
187.41227528941752
312.93115776418864
234.40037734732712
387.6905990015742
246.17171652680307
215.7919959081

## TESTING LIN_MODELE

In [11]:
# Trainning lin_model 
lin_model.fit(x_train, y_train)



Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f6153826f40>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                       

In [13]:
# Testing lin_model 
cv_scores = cross_val_score(lin_model, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_scores_R2 = cross_val_score(lin_model, x_train, y_train, cv=5, scoring='accuracy')

print(f'naif model MSE = {MSE_naif_model}')
print('mean MSE:',np.mean(-cv_scores))
print('std MSE:',np.std(-cv_scores))
print('mean R²:',np.mean(cv_scores_R2))
print('std R²:',np.std(cv_scores_R2))
# print(f' train = {lin_model.score(x_train, y_train)}')
# print(f' test = {lin_model.score(x_test, y_test)}')

Traceback (most recent call last):
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 87, in __call__
    score = scorer._score(cached_call, estimator,
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 202, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true,

naif model MSE = 321.4670713373224
mean MSE: 162.58526634313242
std MSE: 4.693593014502271
mean R²: nan
std R²: nan


Traceback (most recent call last):
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 87, in __call__
    score = scorer._score(cached_call, estimator,
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/fernando.arroyo@Digital-Grenoble.local/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 202, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true,

## TESTING SVR_MODEL

In [14]:
# Creating SVR_model pipeline and tranning 
SVR_model = make_pipeline(preprocess, SVR())
SVR_model.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f6153826f40>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                       

In [15]:
# Testing SVR_model 
# SVR_model.score(x_test, y_test)
print(f' train = {SVR_model.score(x_train, y_train)}')
print(f' test = {SVR_model.score(x_test, y_test)}')

 train = 0.5420074936860813
 test = 0.5226709791949418


## TESTING TREE_MODEL

In [16]:
# Creating tree_model and tranning 
tree_model = make_pipeline(preprocess, tree.DecisionTreeRegressor())
tree_model.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f6153826f40>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                       

In [17]:
# Testing tree_model 
# tree_model.score(x_test, y_test)
print(f' train = {tree_model.score(x_train, y_train)}')
print(f' test = {tree_model.score(x_test, y_test)}')

 train = 1.0
 test = 0.7668091298756123


## TESTING FOREST_MODEL

In [18]:
# Creating forest_model and tranning 
forest_model = make_pipeline(preprocess, RandomForestRegressor(n_estimators=10))
forest_model.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f6153826f40>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                       

In [19]:
# Testing forest_model
print(f' train = {forest_model.score(x_train, y_train)}')
print(f' test = {forest_model.score(x_test, y_test)}')

 train = 0.9735978040326154
 test = 0.8531457205911235
