In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

from scipy.special import boxcox, inv_boxcox


from prettytable import PrettyTable

In [25]:
# Load the preprocessed dataset; the first CSV column is used as the row index.
processed_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col=0)

# Development subset: label-based slice from the first row up to and including
# index label 24487 (.loc slices are inclusive of the end label).
# NOTE(review): 24487 is a magic number -- presumably the last row of the
# development period; confirm and consider naming it in a config cell.
dev_df = processed_df.loc[:24487]

# Displayed as the cell output to sanity-check the subset size.
dev_df.shape

(24488, 39)

In [27]:
# Feature matrix: every column up to and including 'NOX' (label-based column
# slice), minus 'TEY' and 'CDP'.
X = dev_df.loc[:, :'NOX'].drop(columns = ['TEY', 'CDP'])

# Guard against target leakage: the slice above keeps *every* column that
# precedes 'NOX', so if 'CO' (the target below) sits before 'NOX' in the file
# it would be fed to the model as a feature. errors='ignore' makes this a
# no-op when 'CO' is not part of the slice.
# NOTE(review): 'NOX' itself is kept as a feature -- confirm that is intended.
X = X.drop(columns = ['CO'], errors = 'ignore')

# Regression target.
y = dev_df['CO']

# Displayed as the cell output to sanity-check the shapes.
X.shape, y.shape

((24488, 9), (24488,))

In [39]:
# Compare random-forest regressors (with and without polynomial feature
# expansion) trained on a Box-Cox-transformed target, for several values of the
# Box-Cox lambda. The test MSE is always computed on the original target scale.
rs = 42  # shared seed for the train/test split and for every forest

models = [
    make_pipeline(
        PolynomialFeatures(degree = 2),
        RandomForestRegressor(random_state = rs)
    ),
    make_pipeline(
        PolynomialFeatures(degree = 3),
        RandomForestRegressor(random_state = rs)
    ),
    make_pipeline(
        RandomForestRegressor(random_state = rs)
    ),
]

# Labels used in the results table; the order must match `models`.
names = [
    'poly2+rf',
    'poly3+rf',
    'rf',
]

t = PrettyTable()
t.field_names = ['lambda', 'model', 'MSE']

# The split depends only on X, y and the fixed seed, so it is identical for
# every (lambda, model) pair: compute it once instead of inside the loops.
# y_train / y_test stay on the original (untransformed) scale.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = rs, test_size = 0.25)

for lmbda in [0.1, 0, 1]:
    print('lambda: ', lmbda)
    # The same pipeline objects are refit from scratch for every lambda.
    for model, name in zip(models, names):
        print(model)

        print('X_train_shape: ', X_train.shape)

        print('y_train describe before: \n', y_train.describe())

        # Box-Cox-transform the training target into a separate variable so the
        # raw y_train is still available for the next iteration.
        y_train_bc = boxcox(y_train, lmbda)
        print('y_train describe after: \n', y_train_bc.describe())
        print('y_train shape: ', y_train_bc.shape)

        model.fit(X_train, y_train_bc)
        # Predictions are on the Box-Cox scale at this point.
        y_pred_bc = model.predict(X_test)
        print(pd.Series(y_pred_bc).describe())

        # Map the predictions back to the original target scale before scoring.
        y_pred = inv_boxcox(y_pred_bc, lmbda)
        print(pd.Series(y_pred).describe())

        t.add_row([lmbda, name, mse(y_test, y_pred)])
    # Print the cumulative results table after each lambda.
    print(t)


lambda:  0.1
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])
X_train_shape:  (18366, 9)
y_train describe before: 
 count    18366.000000
mean         2.372954
std          2.268559
min          0.000388
25%          1.184375
50%          1.716800
75%          2.875800
max         44.103000
Name: CO, dtype: float64
y_train describe after: 
 count    18366.000000
mean         0.615947
std          0.824441
min         -5.441434
25%          0.170655
50%          0.555334
75%          1.114140
max          4.603159
Name: CO, dtype: float64
y_train shape:  (18366,)
count    6122.000000
mean        0.613572
std         0.741991
min        -1.825037
25%         0.179784
50%         0.489297
75%         1.050325
max         3.909697
dtype: float64
count    6122.000000
mean        2.282911
std         1.949286
min         0.133309
25%         1.195049
50%         1.612367
75%     