In [1]:
#import packages

import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import sklearn
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler #scalers 
from sklearn.metrics import r2_score, mean_absolute_error #score used for regression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor #Random Forest Model
from sklearn.ensemble import GradientBoostingRegressor #Gradient Boost Model
from sklearn.tree import DecisionTreeRegressor #Decision Tree Model
from xgboost import XGBRegressor #XGBoost Model
from lightgbm import LGBMRegressor #LightGBM Model
from sklearn.model_selection import GridSearchCV #Grid Search Cross Validation
from sklearn.model_selection import train_test_split

In [14]:
#load the training CSV, split 'detection_time' into integer 'date' and 'time' columns (delimiters removed), then encode/scale

data = pd.read_csv(r'C:/Users/daima/OneDrive/Documenti/GDP/ML/src/train.csv')

# Text before the first space becomes the date, text after it the time.
new = data["detection_time"].str.split(" ", n=0, expand=True)

# Strip the separators and store both parts as plain integers
# ('date' is added before 'time' so the resulting column order is date, time).
data['date'] = new[0].astype(str).str.replace('-', '', regex=False).astype(int)
data['time'] = new[1].astype(str).str.replace(':', '', regex=False).astype(int)

# The raw timestamp and the point identifier are not used as features.
data = data.drop(columns=["detection_time", "tracked_point_id"])
data['holiday'] = data['holiday'].astype(int)

# One-hot encode the categorical columns, standardise the numeric ones and
# pass every remaining column through untouched.
transformers = [
        ['one_hot', OneHotEncoder(), ['season', 'weather', 'time']],
        ['scaler', StandardScaler(), ['weather_index', 'event_index', 'attractions_index', 'time_index', 'date']],
]
ct = ColumnTransformer(transformers=transformers, remainder="passthrough")

# NOTE(review): X is reassigned from data_filtered in the train/test split cell,
# so this transformed matrix is not what the models further down are fitted on —
# confirm whether it was meant to be used.
X = ct.fit_transform(data)

In [3]:
#check whether any cell of the dataframe (all columns, not only people_concentration) is NaN

data.isna().to_numpy().any()

False

In [4]:
#detecting and deleting rows whose people_concentration is an outlier (3 or more standard deviations from the mean)

#dataset length before checking outliers
print(len(data.index))

# z-score of the target column only; the other columns are not inspected
concentration_z = stats.zscore(data['people_concentration'])
within_three_sigma = np.abs(concentration_z) < 3
data = data[within_three_sigma]

#dataset length after checking and removing outliers
print(len(data.index))

342144
341297


In [5]:
#filtering dataset

# Every half-hour mark of the day in the integer encoding used by the 'time'
# column (colon stripped, cast to int): 0, 30, 100, 130, ..., 2300, 2330.
time_array = [hour * 100 + minute for hour in range(24) for minute in (0, 30)]
# Keep only the rows whose 'time' value is one of the half-hour marks.
on_half_hour = data['time'].isin(time_array)
data_filtered = data.loc[on_half_hour]

# Number of rows that survive the filter
data_1 = data_filtered.shape[0]
print(data_1)
data_filtered.head(300)

56871


Unnamed: 0,season,weather,events,attractions,holiday,weather_index,attractions_index,event_index,time_index,people_concentration,date,time
0,0,0,0,0,1,3,3,4,2,17,20180101,0
6,0,0,0,0,1,3,3,4,2,16,20180101,30
12,0,0,0,0,1,3,3,4,2,16,20180101,100
18,0,0,0,0,1,3,3,4,2,17,20180101,130
24,0,0,0,0,1,3,3,4,2,13,20180101,200
...,...,...,...,...,...,...,...,...,...,...,...,...
1794,0,0,0,0,0,3,3,4,2,5,20180107,530
1800,0,0,0,0,0,3,3,4,2,6,20180107,600
1806,0,0,0,0,0,3,3,4,2,7,20180107,630
1812,0,0,0,0,0,3,3,4,2,13,20180107,700


In [6]:
#Splitting data into training and testing

X = data_filtered.drop(columns=["people_concentration"])  # features
y = data_filtered["people_concentration"]  # regression target

# Ordered hold-out: shuffle=False keeps the rows in their existing order, so the
# leading ~80% of rows train the models and the trailing ~20% test them
# (assumes data_filtered is in chronological order — confirm).
# No random_state is passed: train_test_split ignores it when shuffle=False, and
# the previous `random_state=False` was only a boolean standing in for a seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [7]:
#Random forest parameters dictionary Grid Search CV
parameters_rf = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, None],
 # None = consider every feature at each split. That is what 'auto' meant for
 # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
 # whereas None is accepted by every release.
 'max_features': [None, 'sqrt', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [10, 50, 100],
 'random_state': [None, 1]
}

#Decision Tree parameters dictionary Grid Search CV
parameters_dt = {'max_depth': [10, 20, 30, 50, None],
    'min_samples_split': [2, 5, 10, 50, 500],
    'splitter': ['best', 'random'],
    'random_state': [None, 1],
    'min_samples_leaf': [1, 2, 4, 10, 50],
    # scikit-learn only accepts values in [0.0, 0.5] here; the former 0.75 entry
    # made every candidate that used it fail during fitting, so it was dropped.
    'min_weight_fraction_leaf': [0.0, 0.2, 0.5],
    # None = use all features, the regressor meaning of the removed 'auto' option
    # (deprecated in scikit-learn 1.1, removed in 1.3).
    'max_features': ['sqrt', 'log2', None]
}

#XGBoost parameters dictionary Grid Search CV
# 3 boosters x 3 learning rates x 5 depths x 3 gammas x 3 subsample ratios = 405 candidates
parameters_xgb = dict(
    booster=['gbtree', 'gblinear', 'dart'],
    learning_rate=[0.01, 0.1, 0.25],
    max_depth=[1, 2, 5, 10, None],
    gamma=[0.1, 0.5, 0.9],
    subsample=[0.2, 0.5, 1],
)

#LightGBM parameters dictionary Grid Search CV
parameters_lgbm = {
    'learning_rate': [0.01, 0.1, 0.25],
    'max_depth': [5, 8, 10, None],
    # 'rf' was removed from this list: LightGBM's random-forest mode refuses to
    # train unless bagging (subsample_freq > 0 with subsample < 1) or feature
    # sub-sampling is active, and this grid enables neither, so every 'rf'
    # candidate could only error out.
    'boosting_type' : ['gbdt', 'dart', 'goss'],
    'num_leaves': [10, 20, 30],
    # NOTE(review): subsample has no effect while subsample_freq keeps its
    # default of 0 — confirm whether bagging was meant to be switched on.
    'subsample': [0.2, 0.5, 1]
}

#Gradient Boosting parameters dictionary Grid Search CV
# 3 x 3 x 3 x 4 x 3 = 324 candidates.
# NOTE: 'ls' and 'lad' are the loss names used by scikit-learn releases before 1.0;
# they were renamed 'squared_error' / 'absolute_error' in 1.0 and removed in 1.2.
parameters_gbt = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.25],
    loss=['ls', 'lad', 'quantile'],
    max_depth=[8, 16, 32, None],
    subsample=[0.2, 0.5, 1],
)

#Grid Search CV
# Exhaustive search over parameters_gbt with GridSearchCV's default
# cross-validation and scoring; n_jobs=-1 runs the fits in parallel.
gbt_model = GradientBoostingRegressor()
CV_gbt = GridSearchCV(
    estimator=gbt_model,
    param_grid=parameters_gbt,
    n_jobs=-1,
    verbose=1,
)
CV_gbt.fit(X_train, y_train)

#Best parameters from Grid Search CV
best_gbt_params = CV_gbt.best_params_
print(best_gbt_params)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 26.1min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 69.2min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed: 124.8min finished


{'learning_rate': 0.01, 'loss': 'ls', 'max_depth': 8, 'n_estimators': 150, 'subsample': 0.2}


In [9]:
#Testing Random Forest parameters

# Settings for the random forest, scored with R^2 on the held-out split.
rf_settings = {
    'bootstrap': True,
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 4,
    'min_samples_split': 5,
    'n_estimators': 100,
    'random_state': None,
}
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train, y_train)

rf_predictions = rf_model.predict(X_test)
rf_score = r2_score(y_test, rf_predictions)
print("Random Forest score: " + str(rf_score))

Random Forest score: 0.7348167796253234


In [10]:
#Testing Decision Tree parameters

# max_features=None evaluates every feature at each split. This is exactly what
# the previous 'auto' value meant for a regressor, but 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, while None works on every release.
dt_model = DecisionTreeRegressor(
    max_depth=20,
    min_samples_leaf=10,
    min_samples_split=500,
    min_weight_fraction_leaf=0.0,
    random_state=None,
    splitter='random',
    max_features=None,
)
dt_model.fit(X_train, y_train)

# R^2 on the held-out split
print("Decision Tree score: " + str(r2_score(y_test, dt_model.predict(X_test))))

Decision Tree score: 0.7444337069130583


In [11]:
#Testing XGBoost parameters

# Settings for XGBoost, scored with R^2 on the held-out split.
xgb_settings = {
    'booster': 'dart',
    'gamma': 0.1,
    'learning_rate': 0.1,
    'max_depth': 2,
    'subsample': 0.2,
}
xgb_model = XGBRegressor(**xgb_settings)
xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)
xgb_score = r2_score(y_test, xgb_predictions)
print("XGBoost score: " + str(xgb_score))

XGBoost score: 0.7404509996951192


In [12]:
#Testing LightGBM parameters

# Settings for LightGBM, scored with R^2 on the held-out split.
lgbm_settings = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'max_depth': 5,
    'num_leaves': 10,
    'subsample': 1,
}
lgbm_model = LGBMRegressor(**lgbm_settings)
lgbm_model.fit(X_train, y_train)

lgbm_predictions = lgbm_model.predict(X_test)
lgbm_score = r2_score(y_test, lgbm_predictions)
print("Light GBM score: " + str(lgbm_score))

Light GBM score: 0.7422874141579745


In [13]:
#Testing Gradient Boost parameters

# The loss is left at its default, which is least squares on every scikit-learn
# release. The previous explicit loss='ls' selected the same loss but uses a name
# that was deprecated in scikit-learn 1.0 and removed in 1.2, where it raises.
gb_model = GradientBoostingRegressor(
    learning_rate=0.01,
    max_depth=8,
    n_estimators=150,
    subsample=0.2,
)
gb_model.fit(X_train, y_train)

# R^2 on the held-out split
print("Gradient Boost score: " + str(r2_score(y_test, gb_model.predict(X_test))))

Gradient Boost score: 0.7435714107043585


In [None]:
#Regression scores obtained after running Grid Search Cross Validation on the selected models, each of them trained
#with the best_params_ that the search identified

#Random Forest score: 0.7348167796253234
#Decision Tree score: 0.7444337069130583
#XGBoost score: 0.7404509996951192
#Light GBM score: 0.7422874141579745
#Gradient Boost score: 0.7435714107043585

#Following the tests run with the default models and with the best_params_ produced by Grid Search Cross Validation
#on each model's parameters, the group decided to adopt the Random Forest, Decision Tree and Gradient Boost models.
#The group decided to discard LightGBM and XGBoost because they are more computationally expensive and their results
#did not justify their use relative to that cost