In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read the training table from the Excel workbook into a DataFrame.
data = pd.read_excel(io='DATA/final_train_s_dummies.xlsx')

In [4]:
# Predictor columns, grouped by kind; their concatenation keeps the original column order.
flag_and_numeric_cols = ['Tax Related', 'Number of Lawyers', 'Number of Legal Parties',
                         'Value formatted', 'Unified Contribution formatted']
city_cols = ['Milano', 'Bari', 'Bologna', 'Genova', 'Palermo', 'Napoli', 'Torino',
             'Trento', 'Roma', "L'Aquila", 'Potenza', 'Perugia', 'Campobasso',
             'Firenze', 'Cagliari', 'Venezia', 'Cosenza', 'Ancona', 'Trieste', 'Aosta']
code_cols = ['OR-140999', 'OR-145009', 'OR-139999', 'OR-145999', 'OR-130099',
             'OR-101003', 'OR-130121', 'OR-130111', 'OR-130131', 'OR-101002',
             'OR-180002', 'OSA-180002']

X = data[flag_and_numeric_cols + city_cols + code_cols]

# Regression target.
y = data['Settlement']

## <a style=background:yellow;color:black id='train_val_test_split'> Standardization and Split in training, validation and testing set </a>

<a style=color:deepskyblue> **Scale only the variables that are neither dummies nor booleans** </a>

In [5]:
# Columns that will be standardized.
continuous_cols = ['Number of Lawyers', 'Number of Legal Parties', 'Value formatted',
                   'Unified Contribution formatted']
# Columns passed through unchanged.
indicator_cols = ['Tax Related',
                  'Milano', 'Bari', 'Bologna', 'Genova', 'Palermo', 'Napoli', 'Torino',
                  'Trento', 'Roma', "L'Aquila", 'Potenza', 'Perugia', 'Campobasso',
                  'Firenze', 'Cagliari', 'Venezia', 'Cosenza', 'Ancona', 'Trieste', 'Aosta',
                  'OR-140999', 'OR-145009', 'OR-139999', 'OR-145999', 'OR-130099',
                  'OR-101003', 'OR-130121', 'OR-130111', 'OR-130131', 'OR-101002',
                  'OR-180002', 'OSA-180002']

X_to_scale = X[continuous_cols]
X_not_to_scale = X[indicator_cols]

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Wrap the target in a one-column DataFrame.
y = pd.DataFrame(y)

# Standardize the selected columns.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test split
# performed further down, so held-out rows contribute to the mean/variance - confirm
# this is intended.
std_scale = StandardScaler()
std_scale.fit(X_to_scale)
X_scaled = std_scale.transform(X_to_scale)

In [7]:
# Turn the scaler's ndarray output back into a labelled frame.
# The column Index is passed as-is: wrapping it in a list makes pandas build a one-level
# MultiIndex, and the concat below would then mix tuple labels with plain string labels.
# The original row index is reused so the concat aligns row-for-row with the unscaled
# columns whatever index `data` carries.
X_scaled_df = pd.DataFrame(X_scaled, columns=X_to_scale.columns, index=X_to_scale.index)
X_scaled_df = pd.concat([X_scaled_df, X_not_to_scale], axis=1)

In [8]:
# Hold out 10% of the rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_scaled_df, y, test_size=0.1, random_state=0)

# Split the remainder again: 20% of it becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=0)

This is useful only if you didn't merge the cities

In [9]:
# Labels in the order of the frame built above: scaled numeric columns first, then the
# indicator columns.
relabel_cols = ['Number of Lawyers', 'Number of Legal Parties', 'Value formatted',
                'Unified Contribution formatted', 'Tax Related',
                'Milano', 'Bari', 'Bologna', 'Genova', 'Palermo', 'Napoli', 'Torino',
                'Trento', 'Roma', "L'Aquila", 'Potenza', 'Perugia', 'Campobasso',
                'Firenze', 'Cagliari', 'Venezia', 'Cosenza', 'Ancona', 'Trieste', 'Aosta',
                'OR-140999', 'OR-145009', 'OR-139999', 'OR-145999', 'OR-130099',
                'OR-101003', 'OR-130121', 'OR-130111', 'OR-130131', 'OR-101002',
                'OR-180002', 'OSA-180002']

# Rebuild both frames from their raw values with a uniform set of labels. The labels are
# passed as a list holding one list (later cells address these columns with 1-tuples);
# the row index of each frame is preserved.
X_train_val = pd.DataFrame(np.array(X_train_val),
                           columns=[relabel_cols],
                           index=X_train_val.index)

X_test = pd.DataFrame(np.array(X_test),
                      columns=[relabel_cols],
                      index=X_test.index)

## <a style=background:yellow;color:black id='reg_tree'> REGRESSION-TREE</a>

In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [12]:
# Single regression tree with fixed settings, fitted on the train+validation rows.
tree_settings = dict(criterion='absolute_error',
                     min_samples_split=10,
                     max_features='sqrt',
                     random_state=42)
reg_tree = DecisionTreeRegressor(**tree_settings)

reg_tree.fit(X_train_val, y_train_val)

In [None]:
# Predict the test rows and compute prediction-minus-truth residuals on flattened arrays.
y_pred = reg_tree.predict(X_test)
y_pred_flat = np.ravel(np.array(y_pred))
y_test_flat = np.ravel(np.array(y_test))
residuals = y_pred_flat - y_test_flat

# Side-by-side table, shown as the cell output.
pd.DataFrame({'y_true': np.ravel(y_test),
              'y_pred': np.ravel(y_pred),
              'residuals': np.ravel(residuals)})

In [None]:
# Test-set MAE of the tree fitted above.
# The target was never standardized: std_scale was fitted on the four numeric feature
# columns only, so its var_/mean_ vectors do not apply to predictions and there is
# nothing to "de-scale". The predictions were made on X_test, so they are compared with
# y_test (not y_val). Both sides are flattened to 1-D first.
print(mean_absolute_error(np.ravel(y_test), np.ravel(y_pred)))

In [None]:
# Candidate values: 10, 20, ..., 200 for both sample-count limits; 5, 10, ..., 25 for depth.
sample_count_steps = range(10, 201, 10)
depth_steps = range(5, 26, 5)
param_grid_1 = [{"min_samples_leaf": list(sample_count_steps),
                 'min_samples_split': list(sample_count_steps),
                 'max_depth': list(depth_steps)}]

# Exhaustive 5-fold search over the grid, scored by negative MSE, using all cores.
reg_tree = DecisionTreeRegressor(random_state=42)
grid_search_1 = GridSearchCV(estimator=reg_tree,
                             param_grid=param_grid_1,
                             scoring='neg_mean_squared_error',
                             cv=5,
                             n_jobs=-1,
                             return_train_score=True)

grid_search_1.fit(X_train_val, y_train_val)

In [None]:
# Keep a handle on the best estimator selected by grid_search_1.
my_model_1 = grid_search_1.best_estimator_

In [None]:
# Display the parameters of the selected estimator.
my_model_1.get_params()

In [None]:
# Predict the test rows with the grid-searched tree.
y_pred = grid_search_1.predict(X_test)

# Flatten both sides before subtracting: y_test is a one-column DataFrame, so subtracting
# its (n, 1) values from the 1-D prediction vector would broadcast to an (n, n) matrix
# instead of n residuals.
residuals = np.ravel(np.array(y_pred)) - np.ravel(np.array(y_test))

In [None]:
# MAE on the test rows first, then on the rows the search was fitted on.
for features, target in ((X_test, y_test), (X_train_val, y_train_val)):
    print(mean_absolute_error(target, grid_search_1.predict(features)))

In [None]:
# Per-row comparison table for the test set, shown sorted by residual.
# Every column is flattened to 1-D first (same pattern as the residual table of the first
# tree): y_test is a one-column DataFrame and X_test was relabelled with list-wrapped
# column labels, so the raw objects are not guaranteed to be 1-D. Residuals are
# recomputed here from the flattened vectors so the table does not depend on the shape
# of a shared variable.
y_true_flat = np.ravel(y_test)
y_pred_flat = np.ravel(y_pred)
prova = pd.DataFrame({'y_true': y_true_flat,
                      'Value': np.ravel(X_test['Value formatted']),
                      'y_pred': y_pred_flat,
                      'residuals': y_pred_flat - y_true_flat},
                     index=y_test.index)

prova.sort_values('residuals')

## <a style=background:yellow;color:black> MARS (Check the R Script)</a>

## <a style=background:yellow;color:black id='Random_Forest'> Random Forest </a>

#### Without grid search

* You can use <a style=color:deepskyblue> **Out-of-Bag** </a> evaluation. Random forests are decision trees with bagging, therefore at each iteration we don't consider all the observations. On average, with big samples we just consider 63% of the training instances. The other 37% are not considered (of course this 37% changes for each estimator). We can use this 37% (called out-of-bag) instead of the test set. This could lead to better results since we're using more training instances.
* When splitting a predictor having k possible unordered values, there are $2^{(k-1)}-1$ possible partitions of the k values into two groups. The computation becomes unfeasible as k increases. We can simplify it with one-hot encoding, ordering the predictor classes according to the proportion falling into outcome 1; then we split this predictor as if it were an ordered predictor. However, a non-ordinal categorical variable with a fairly large number of unique values (levels) is bad both for computation and for overfitting, so you should avoid such variables.
* One major problem with trees is their high variance. Often a small change in the data can result in a very different series of splits. This is due to the hierarchical nature of the process (if an error occurs in the first split, then all the other splits can just make it worse and worse). A way to reduce this variance is to use Bagging.
* The lack of smoothness of the prediction surfaces and the difficulty in capturing additive structure can be a problem for regression tasks. We can solve these problems by using <a style=color:deepskyblue> **MARS** </a> (Multivariate Adaptive Regression Splines).

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest settings gathered in one mapping; n_jobs stays at its default (override disabled).
forest_settings = {
    'n_estimators': 50,
    # 'n_jobs': 5,
    'max_depth': 10,
    'criterion': 'absolute_error',
    'random_state': 42,
    'min_samples_split': 20,
    'min_samples_leaf': 20,
    'max_leaf_nodes': None,
}
rnd_reg = RandomForestRegressor(**forest_settings)

In [None]:
# Fit on the train+validation rows. The target is a one-column DataFrame; it is flattened
# to 1-D (as the randomized-search cell further down does) so scikit-learn does not warn
# about receiving a column vector where a 1-D array is expected.
rnd_reg.fit(X_train_val, np.array(y_train_val).ravel())

  rnd_reg.fit(X_train_val, y_train_val)


RandomForestRegressor(criterion='absolute_error', max_depth=10,
                      min_samples_leaf=20, min_samples_split=20,
                      n_estimators=50, random_state=42)

In [None]:
import pickle

filename = 'rnd_reg_50_estimators.sav'

# To refresh the file from the forest fitted above (presumably what the original,
# commented-out dump of `rnd_reg_50_estimators` was meant to do):
# with open(filename, 'wb') as model_file:
#     pickle.dump(rnd_reg, model_file)

# Load the stored forest; this REPLACES the rnd_reg fitted above.
# The file is opened in a `with` block so the handle is always closed.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open(filename, 'rb') as model_file:
    rnd_reg = pickle.load(model_file)

In [None]:
from sklearn.metrics import mean_absolute_error

# Predict the test rows with the forest.
y_pred = rnd_reg.predict(X_test)

# Flatten both sides before subtracting: y_test is a one-column DataFrame, and
# (n,) - (n, 1) would broadcast to an (n, n) matrix rather than n residuals.
residuals = np.ravel(np.array(y_pred)) - np.ravel(np.array(y_test))

# MAE on the test rows, then on the train+validation rows.
print(mean_absolute_error(y_test, y_pred))
print(mean_absolute_error(y_train_val, rnd_reg.predict(X_train_val)))

765.0234303933746
727.9669157159126


In [None]:
# # This chunk is for the MARS model.
# final_train_s_dummies_std = pd.concat([X_test, X_train_val],axis=0 )
# final_train_s_dummies_std['Settlement'] = pd.concat([y_test, y_train_val], axis = 0)
# final_train_s_dummies_std.to_excel('final_train_s_dummies_std.xlsx')

Let's destandardize X_train_val and X_test

In [None]:
# The four columns the scaler was fitted on, and the indicator columns it never touched.
scaled_cols = ['Number of Lawyers', 'Number of Legal Parties', 'Value formatted',
               'Unified Contribution formatted']
dummy_cols = ['Tax Related',
              'Milano', 'Bari', 'Bologna', 'Genova', 'Palermo', 'Napoli', 'Torino',
              'Trento', 'Roma', "L'Aquila", 'Potenza', 'Perugia', 'Campobasso',
              'Firenze', 'Cagliari', 'Venezia', 'Cosenza', 'Ancona', 'Trieste', 'Aosta',
              'OR-140999', 'OR-145009', 'OR-139999', 'OR-145999', 'OR-130099',
              'OR-101003', 'OR-130121', 'OR-130111', 'OR-130131', 'OR-101002',
              'OR-180002', 'OSA-180002']
all_cols = scaled_cols + dummy_cols

# Undo the standardization of the numeric columns, keeping each frame's row index.
X_train_val_inv_trans = pd.DataFrame(
    std_scale.inverse_transform(X_train_val[scaled_cols]),
    columns=scaled_cols,
    index=X_train_val.index)

X_test_inv_trans = pd.DataFrame(
    std_scale.inverse_transform(X_test[scaled_cols]),
    columns=scaled_cols,
    index=X_test.index)

# Indicator columns are carried over unchanged.
X_train_val_not_std = X_train_val[dummy_cols]

X_test_not_std = X_test[dummy_cols]

# Place both halves side by side ...
X_train_val_destandardized = pd.concat([X_train_val_inv_trans, X_train_val_not_std], axis=1)
X_test_destandardized = pd.concat([X_test_inv_trans, X_test_not_std], axis=1)

# ... then rebuild from raw values with one uniform, list-wrapped set of labels.
X_train_val_destandardized = pd.DataFrame(
    np.array(X_train_val_destandardized),
    columns=[all_cols],
    index=X_train_val_destandardized.index)

X_test_destandardized = pd.DataFrame(
    np.array(X_test_destandardized),
    columns=[all_cols],
    index=X_test_destandardized.index)

In [None]:
# y_pred_MARS = np.array(pd.read_excel('y_pred.xlsx')).ravel()

# data_for_plotting = pd.DataFrame({'y_true':np.concatenate([np.array(y_test),np.array(y_train_val)]).ravel(),
#  'y_pred_rnd_for_no_grid':np.concatenate([np.array(rnd_reg.predict(X_test)),np.array(rnd_reg.predict(X_train_val))]),
# 'y_pred_MARS':y_pred_MARS,
#  'Value formatted':np.array(pd.concat([X_test_destandardized['Value formatted'], X_train_val_destandardized['Value formatted']], axis=0)).ravel(),
#  'Unified contribution formatted':np.array(pd.concat([X_test_destandardized['Unified Contribution formatted'], X_train_val_destandardized['Unified Contribution formatted']], axis=0)).ravel(),
# 'train_test':np.concatenate([np.repeat('test',y_test.shape[0]),np.repeat('train',y_train_val.shape[0])])})

# data_for_plotting.to_excel('data_for_plotting.xlsx')

#### With grid search

In [None]:
# clf = RandomForestClassifier(warm_start=True)
# number_of_checkpoints = 10

# for checkpoint in range(number_of_checkpoints):

#     # Load only a subset of the data and train on it
#     X, y = load_data_batch(batches=number_of_checkpoints, current_batch=checkpoint)
#     clf.fit(X, y)

#     # Save model checkpoint for each fit
#     with open('path/to/models/random_forest_ckp_{}.p'.format(checkpoint), 'wb') as f:
#         pickle.dump(clf, f)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rnd_reg_grid = RandomForestRegressor(random_state=42, criterion='absolute_error')

# (start, stop-exclusive, step) of the integer range sampled for each hyper-parameter.
search_ranges = {'max_depth': (1, 100, 10),
                 'min_samples_split': (5, 100, 10),
                 'min_samples_leaf': (100, 500, 10),
                 'n_estimators': (10, 150, 15)}
param_grid = [{name: np.arange(*bounds) for name, bounds in search_ranges.items()}]

# 100 random draws from the ranges, each scored by 5-fold negative MAE on all cores.
rnd_search_cv = RandomizedSearchCV(estimator=rnd_reg_grid,
                                   param_distributions=param_grid,
                                   n_iter=100,
                                   scoring='neg_mean_absolute_error',
                                   cv=5,
                                   n_jobs=-1,
                                   random_state=42)

# The one-column target frame is flattened to 1-D for fitting.
rnd_search_cv.fit(X_train_val, np.array(y_train_val).ravel())

## <a style=background:yellow;color:black id='xgboost'> XGBoost</a>

In [None]:
import xgboost

from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the XGBoost search (a single grid inside a list).
param_grid = [dict(
    gamma=[0.5, 1, 1.5, 2, 5],
    max_depth=[5, 6, 9, 10],
    subsample=[0.6, 0.8, 1],
    colsample_bytree=[0.6, 0.8, 1],
    min_child_weight=[1, 5, 10],
)]

In [None]:
# Boosted regressor with a fixed learning rate and number of trees; the other
# hyper-parameters come from an exhaustive 5-fold grid search scored by negative MSE.
xgb_reg = xgboost.XGBRegressor(n_estimators=500, learning_rate=0.02, random_state=42)

grid_search = GridSearchCV(estimator=xgb_reg,
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=5,
                           n_jobs=-1,
                           return_train_score=True)

grid_search.fit(X_train_val, y_train_val)

In [None]:
# Best estimator selected by the XGBoost grid search.
final_model = grid_search.best_estimator_
final_model

# Predict the test rows and report the MAE (the metric is symmetric in its two
# arguments, so they are given in the conventional y_true, y_pred order).
y_pred_test = final_model.predict(X_test)
print(mean_absolute_error(y_test, y_pred_test))

## <a style=background:yellow;color:black id='neural_network'> Neural Network </a>

In [None]:
import tensorflow as tf
from tensorflow import keras
import os

In [None]:
# Fix TensorFlow's global random seed, then reset Keras' global state before a model is built.
tf.random.set_seed(42)
keras.backend.clear_session()

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for the first 3 epochs, then exponential decay.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index supplied by the LearningRateScheduler callback.
    lr : float
        Learning rate currently in use.

    Returns
    -------
    float
        `lr` unchanged while `epoch < 3`; afterwards `lr * exp(-0.025)`.

    The decay factor is computed with `math.exp` so the function returns a plain Python
    float rather than a TensorFlow tensor, which newer Keras versions require from a
    schedule function.
    """
    import math

    if epoch < 3:
        return lr
    return lr * math.exp(-0.025)

def get_run_logdir(root_logdir, model_name):
    """Return `<root_logdir>/<model_name>_run_<timestamp>` for the current local time.

    The timestamp has the form YYYY_MM_DD-HH_MM_SS.
    """
    import time

    stamp = time.strftime('run_%Y_%m_%d-%H_%M_%S')
    leaf = f"{model_name}_{stamp}"
    return os.path.join(root_logdir, leaf)

# Ask for a label for this run and derive its TensorBoard log directory under ./my_logs.
model_name = input('Enter model name: ')
root_logdir = os.path.join(os.curdir, 'my_logs')
run_logdir = get_run_logdir(root_logdir, model_name)

# Training callbacks: learning-rate schedule, early stopping (10 epochs of patience,
# best weights restored) and TensorBoard logging with per-epoch histograms.
lr_schedule_cb = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)
early_stop_cb = tf.keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10)
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir, histogram_freq=1)

In [None]:
# Settings shared by every hidden layer.
relu_he = dict(activation='relu', kernel_initializer='he_normal')

# Fully connected regressor: 64 -> 128 -> 64 -> 30 hidden units, 10% dropout after each
# of the first three hidden layers, and a single linear output unit.
model = keras.models.Sequential([
    keras.layers.Dense(64, input_shape=X_train.shape[1:], **relu_he),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(128, **relu_he),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(64, **relu_he),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(30, **relu_he),
    keras.layers.Dense(1),
])

# Train against mean absolute error with Adam.
adam = keras.optimizers.Adam(learning_rate=0.015)
model.compile(loss='mae', optimizer=adam)

In [None]:
# Train for up to 100 epochs, validating on the validation split after each epoch.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[tensorboard_cb, lr_schedule_cb, early_stop_cb],
)

Epoch 1/100
   1/2174 [..............................] - ETA: 13:24 - loss: 1867.6157

2023-01-06 16:33:40.588529: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-01-06 16:33:56.485004: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


In [None]:
# Serve the logs under ./my_logs with TensorBoard on port 6006 (the command runs until it is interrupted).
!tensorboard --logdir=./my_logs --port=6006

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.9.1 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [None]:
from sklearn.metrics import mean_absolute_error

# Test-set MAE of the network; the value is displayed as the cell output.
nn_test_pred = model.predict(X_test)
mean_absolute_error(y_test, nn_test_pred)



2023-01-06 16:41:37.173969: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




768.7058876937961

## <a style=background:yellow;color:black> Oversampling </a>

In [None]:
# Columns the scaler was fitted on / indicator columns it never touched.
numeric_cols = ['Number of Lawyers', 'Number of Legal Parties', 'Value formatted',
                'Unified Contribution formatted']
passthrough_cols = ['Tax Related',
                    'Milano', 'Bari', 'Bologna', 'Genova', 'Palermo', 'Napoli', 'Torino',
                    'Trento', 'Roma', "L'Aquila", 'Potenza', 'Perugia', 'Campobasso',
                    'Firenze', 'Cagliari', 'Venezia', 'Cosenza', 'Ancona', 'Trieste', 'Aosta',
                    'OR-140999', 'OR-145009', 'OR-139999', 'OR-145999', 'OR-130099',
                    'OR-101003', 'OR-130121', 'OR-130111', 'OR-130131', 'OR-101002',
                    'OR-180002', 'OSA-180002']


def _destandardize_parts(frame):
    """Return (inverse-scaled numeric columns, untouched indicator columns, relabelled union).

    The numeric columns are mapped back with std_scale.inverse_transform; the union is
    rebuilt from raw values with list-wrapped labels and the row index of `frame`.
    """
    inv_part = pd.DataFrame(std_scale.inverse_transform(frame[numeric_cols]),
                            columns=numeric_cols, index=frame.index)
    kept_part = frame[passthrough_cols]
    joined = pd.concat([inv_part, kept_part], axis=1)
    relabelled = pd.DataFrame(np.array(joined),
                              columns=[numeric_cols + passthrough_cols],
                              index=joined.index)
    return inv_part, kept_part, relabelled


(X_train_val_inv_trans,
 X_train_val_not_std,
 X_train_val_destandardized) = _destandardize_parts(X_train_val)

(X_test_inv_trans,
 X_test_not_std,
 X_test_destandardized) = _destandardize_parts(X_test)

In [None]:
# Attach the target to the destandardized predictors.
# (The former `names=` argument was dropped: pd.concat only uses it together with `keys`,
# so it had no effect, and it carried a misspelled label.)
data_train_val_destandardized = pd.concat([X_train_val_destandardized, y_train_val], axis=1)

# Rows whose 'Value formatted' lies in (5200, 26000] are the ones to oversample. The
# predictor labels are 1-tuples here because the frame was built from a list-wrapped
# column list.
value = data_train_val_destandardized[('Value formatted',)]
in_band = (value > 5200) & (value <= 26000)
out_of_band = (value <= 5200) | (value > 26000)
data_train_val_destandardized_5200_26000 = data_train_val_destandardized.loc[in_band]

# Draw four bootstrap resamples of the band (with replacement, each as large as the band).
# A seeded generator makes the oversampled set reproducible across runs.
rng = np.random.default_rng(42)
n_band = data_train_val_destandardized_5200_26000.shape[0]
resamples = [
    data_train_val_destandardized_5200_26000.iloc[rng.choice(n_band, size=n_band)]
    for _ in range(4)
]
# One pd.concat replaces the repeated DataFrame.append calls (append was removed in
# pandas 2.0 and re-copies the frame on every call).
oversampled_5200_26000_train_val = pd.concat(resamples)

# Final set: every row outside the band once, plus the four resamples of the band.
final_oversampled = pd.concat([data_train_val_destandardized.loc[out_of_band],
                               oversampled_5200_26000_train_val])

# final_oversampled.to_excel('final_oversampled.xlsx')

In [None]:
# Reload the oversampled table from disk and drop the 'Unnamed: 0' column
# (presumably the row index written by the Excel export - confirm).
final_oversampled = pd.read_excel('final_oversampled.xlsx').drop(columns='Unnamed: 0')

# Predictor labels followed by the target label.
oversampled_cols = ['Number of Lawyers', 'Number of Legal Parties', 'Value formatted',
                    'Unified Contribution formatted', 'Tax Related',
                    'Milano', 'Bari', 'Bologna', 'Genova', 'Palermo', 'Napoli', 'Torino',
                    'Trento', 'Roma', "L'Aquila", 'Potenza', 'Perugia', 'Campobasso',
                    'Firenze', 'Cagliari', 'Venezia', 'Cosenza', 'Ancona', 'Trieste', 'Aosta',
                    'OR-140999', 'OR-145009', 'OR-139999', 'OR-145999', 'OR-130099',
                    'OR-101003', 'OR-130121', 'OR-130111', 'OR-130131', 'OR-101002',
                    'OR-180002', 'OSA-180002', 'Settlement']

# Rebuild from raw values with list-wrapped labels, keeping the row index.
X_y_train_val_oversampled = pd.DataFrame(np.array(final_oversampled),
                                         columns=[oversampled_cols],
                                         index=final_oversampled.index)



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate predictors from the target; the target is wrapped in a DataFrame.
X_oversamp = X_y_train_val_oversampled.drop('Settlement', axis=1)
y_oversamp = pd.DataFrame(X_y_train_val_oversampled['Settlement'])

# Columns to standardize vs. columns passed through unchanged.
oversamp_numeric_cols = ['Number of Lawyers', 'Number of Legal Parties', 'Value formatted',
                         'Unified Contribution formatted']
oversamp_indicator_cols = ['Tax Related',
                           'Milano', 'Bari', 'Bologna', 'Genova', 'Palermo', 'Napoli',
                           'Torino', 'Trento', 'Roma', "L'Aquila", 'Potenza', 'Perugia',
                           'Campobasso', 'Firenze', 'Cagliari', 'Venezia', 'Cosenza',
                           'Ancona', 'Trieste', 'Aosta',
                           'OR-140999', 'OR-145009', 'OR-139999', 'OR-145999', 'OR-130099',
                           'OR-101003', 'OR-130121', 'OR-130111', 'OR-130131', 'OR-101002',
                           'OR-180002', 'OSA-180002']
X_to_scale_oversamp = X_oversamp[oversamp_numeric_cols]
X_not_to_scale_oversamp = X_oversamp[oversamp_indicator_cols]

# NOTE: this rebinds std_scale to a scaler fitted on the oversampled data, replacing the
# one fitted on the original data earlier in the notebook.
std_scale = StandardScaler()
std_scale.fit(X_to_scale_oversamp)
X_scaled_oversamp = std_scale.transform(X_to_scale_oversamp)

  X_oversamp = X_y_train_val_oversampled.drop('Settlement',axis=1)


In [None]:
# Turn the scaler's ndarray output back into a labelled frame.
# The column index of the unscaled selection is reused as-is (instead of converting it to
# a list and wrapping that in another list), so the scaled columns carry the same kind of
# labels as the indicator columns they are concatenated with. The row index is reused too,
# so the concat aligns row-for-row without relying on both sides having a default index.
X_scaled_df_oversamp = pd.DataFrame(X_scaled_oversamp,
                                    columns=X_to_scale_oversamp.columns,
                                    index=X_to_scale_oversamp.index)
X_scaled_df_oversamp = pd.concat([X_scaled_df_oversamp, X_not_to_scale_oversamp], axis=1)

In [None]:
# Hold out 10% of the oversampled rows as a test set.
# NOTE(review): the oversampled table contains rows drawn with replacement, so copies of
# the same row can land on both sides of these splits - confirm this is acceptable.
X_train_val_oversamp, X_test_oversamp, y_train_val_oversamp, y_test_oversamp = train_test_split(
    X_scaled_df_oversamp, y_oversamp, test_size=0.1, random_state=0)

# Split the remainder again: 20% of it becomes the validation set.
X_train_oversamp, X_val_oversamp, y_train_oversamp, y_val_oversamp = train_test_split(
    X_train_val_oversamp, y_train_val_oversamp, test_size=0.2, random_state=0)

In [None]:
import tensorflow as tf
from tensorflow import keras
import os

def scheduler(epoch, lr):
    """Learning-rate schedule: constant for the first 3 epochs, then exponential decay.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index supplied by the LearningRateScheduler callback.
    lr : float
        Learning rate currently in use.

    Returns
    -------
    float
        `lr` unchanged while `epoch < 3`; afterwards `lr * exp(-0.025)`.

    The decay factor is computed with `math.exp` so the function returns a plain Python
    float rather than a TensorFlow tensor, which newer Keras versions require from a
    schedule function.
    """
    import math

    if epoch < 3:
        return lr
    return lr * math.exp(-0.025)

def get_run_logdir(root_logdir, model_name):
    """Return `<root_logdir>/<model_name>_run_<timestamp>` for the current local time.

    The timestamp has the form YYYY_MM_DD-HH_MM_SS.
    """
    import time

    stamp = time.strftime('run_%Y_%m_%d-%H_%M_%S')
    leaf = f"{model_name}_{stamp}"
    return os.path.join(root_logdir, leaf)

# Ask for a label for this run and derive its TensorBoard log directory under ./my_logs.
model_name = input('Enter model name: ')
root_logdir = os.path.join(os.curdir, 'my_logs')
run_logdir = get_run_logdir(root_logdir, model_name)

# Training callbacks: learning-rate schedule, early stopping (10 epochs of patience,
# best weights restored) and TensorBoard logging with per-epoch histograms.
lr_schedule_cb = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)
early_stop_cb = tf.keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10)
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir, histogram_freq=1)

In [None]:
# Fix TensorFlow's global random seed and reset Keras' global state before building.
tf.random.set_seed(42)
keras.backend.clear_session()

# Settings shared by every hidden layer.
relu_he = dict(activation='relu', kernel_initializer='he_normal')

# Fully connected regressor: 64 -> 128 -> 64 -> 30 hidden units, 10% dropout after each
# of the first three hidden layers, and a single linear output unit.
# The input shape is taken from X_train_oversamp - the data this model is fitted on in
# the next cell - rather than from the X_train of the earlier, non-oversampled experiment.
model = keras.models.Sequential([
    keras.layers.Dense(64, input_shape=X_train_oversamp.shape[1:], **relu_he),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(128, **relu_he),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(64, **relu_he),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(30, **relu_he),
    keras.layers.Dense(1),
])

# Train against mean absolute error with Adam.
model.compile(loss='mae', optimizer=keras.optimizers.Adam(learning_rate=0.015))

In [None]:
# Train on the oversampled split for up to 100 epochs, validating after each epoch.
history = model.fit(
    X_train_oversamp, y_train_oversamp,
    validation_data=(X_val_oversamp, y_val_oversamp),
    epochs=100,
    callbacks=[tensorboard_cb, lr_schedule_cb, early_stop_cb],
)

In [None]:
# data_for_plotting = pd.read_excel('data_for_plotting.xlsx')
# data_for_plotting['y_pred_neural_network_oversamp'] = np.concatenate([np.array(model.predict(X_test)),np.array(model.predict(X_train_val))])
# data_for_plotting.to_excel('data_for_plotting.xlsx')