In [29]:
# Importing packages 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

## USING READ PICKLE 

In [2]:
df_trees = pd.read_pickle('./df_trees.pkl')

## PREPARING DATA 

In [3]:
# Creating y and x 
y = df_trees['ANNEEDEPLANTATION']
x = df_trees.drop('ANNEEDEPLANTATION', axis=1)

# Creating training and testing dataset 
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0) 

## PIPELINE 

In [4]:
# Creating list of numerical and categorical values 
num_var = make_column_selector(dtype_include=np.number)
cat_var = make_column_selector(dtype_include=object)

In [5]:
# Creating mun et cat pipelines 
num_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), 
                             StandardScaler())

cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),
                            OneHotEncoder(handle_unknown='ignore'))

In [6]:
# Creating pipeline combining num et cat var
preprocess = make_column_transformer((num_pipeline, num_var),
                                    (cat_pipeline, cat_var))

In [7]:
# Creating model pipeline 
lin_model = make_pipeline(preprocess, LinearRegression())
# lin_model.fit(x, y)

In [8]:
# lin_model.named_steps['columntransformer'].transformers_[1][1].named_steps['onehotencoder'].categories_

## TESTING LIN_MODELE

In [9]:
# Trainning lin_model 
lin_model.fit(x_train, y_train)



Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc14434a370>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                       

In [36]:
# Testing lin_model 
# lin_model.score(x_test, y_test)
print(f' train = {lin_model.score(x_train, y_train)}')
print(f' test = {lin_model.score(x_test, y_test)}')

 train = 0.5077390743965187
 test = 0.49031431942549275


## TESTING SVR_MODEL

In [25]:
# Creating SVR_model pipeline and tranning 
SVR_model = make_pipeline(preprocess, SVC())
SVR_model.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc14434a370>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                       

In [35]:
# Testing SVR_model 
# SVR_model.score(x_test, y_test)
print(f' train = {SVR_model.score(x_train, y_train)}')
print(f' test = {SVR_model.score(x_test, y_test)}')

 train = 0.6919314326003614
 test = 0.6457369464639788


## TESTING TREE_MODEL

In [27]:
# Creating tree_model and tranning 
tree_model = make_pipeline(preprocess, tree.DecisionTreeRegressor())
tree_model.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc14434a370>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                       

In [34]:
# Testing tree_model 
# tree_model.score(x_test, y_test)
print(f' train = {tree_model.score(x_train, y_train)}')
print(f' test = {tree_model.score(x_test, y_test)}')

 train = 1.0
 test = 0.7811670868530733


## TESTING FOREST_MODEL

In [30]:
# Creating forest_model and tranning 
forest_model = make_pipeline(preprocess, RandomForestRegressor(n_estimators=10))
forest_model.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc14434a370>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                       

In [33]:
# Testing forest_model
print(f' train = {forest_model.score(x_train, y_train)}')
print(f' test = {forest_model.score(x_test, y_test)}')

 train = 0.9735503946813454
 test = 0.8544623950425769
