# Tuning a Random Forest Regression Model

Custom built expanding window validation functions will be used 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Import your helper functions
from timeseries_model_utils import (
    expanding_window_splits,
    run_expanding_cv,
    expanding_window_grid_search
)

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error


In [2]:
df = pd.read_csv("New_csv.csv")   # replace with your file
df = df.sort_values("Year").reset_index(drop=True)
df.head()


Unnamed: 0,Year,Homicide,Assault,Sexual offences,Harm or endanger persons,"Robbery, blackmail, and extortion",Burglary,Theft,Fraud and related offences,Drug offences,...,"Public order, health, and safety offences",Traffic and vehicle offences,Offences against justice procedures and orders,Offences against government,Environmental offences,Miscellaneous offences,Total,Population,Crime_Index,Crime_Index_Lagged
0,1980,353,7208,1401,2176,426,11893,29103,19794,7839,...,28539,173948,27664,1376,3521,9651,331751,3112900,10.657297,
1,1981,366,7933,1092,2299,494,12974,29654,25007,8340,...,22637,142990,17981,2027,3360,9836,294409,3124900,9.421389,10.657297
2,1982,329,7760,1298,2231,504,14387,32682,26131,9518,...,18268,104739,13957,1203,4025,7878,252119,3156100,7.988308,9.421389
3,1983,407,8741,1449,2693,657,13851,32304,28481,10346,...,16067,105649,15332,736,2252,7774,254658,3199300,7.959804,7.988308
4,1984,406,9349,1682,2651,729,13905,33492,28069,12819,...,17456,116414,16389,1300,2235,8552,274022,3227100,8.491277,7.959804


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 22 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Year                                            45 non-null     int64  
 1   Homicide                                        45 non-null     int64  
 2   Assault                                         45 non-null     int64  
 3   Sexual offences                                 45 non-null     int64  
 4   Harm or endanger persons                        45 non-null     int64  
 5   Robbery, blackmail, and extortion               45 non-null     int64  
 6   Burglary                                        45 non-null     int64  
 7   Theft                                           45 non-null     int64  
 8   Fraud and related offences                      45 non-null     int64  
 9   Drug offences                                

In [4]:
target_col = "Total"  

feature_cols = ["Year","Population"]

X = df[feature_cols]
y = df[target_col]


In [5]:
param_grid = {
    "n_estimators": [100, 200, 400],
    "max_depth": [3, 5, None]
}

best_params, best_score, results_df = expanding_window_grid_search(
    RandomForestRegressor,
    param_grid,
    X, y,
    initial_train_size=10,
    horizon=1,
    step=1
)

print("\nBest Parameters:", best_params)
print("Best Score:", best_score)

results_df



[GridSearch] Params {'n_estimators': 100, 'max_depth': 3} → Score = 16098.5026
[GridSearch] Params {'n_estimators': 100, 'max_depth': 5} → Score = 16025.2644
[GridSearch] Params {'n_estimators': 100, 'max_depth': None} → Score = 15694.0060
[GridSearch] Params {'n_estimators': 200, 'max_depth': 3} → Score = 16455.8616
[GridSearch] Params {'n_estimators': 200, 'max_depth': 5} → Score = 15794.9587
[GridSearch] Params {'n_estimators': 200, 'max_depth': None} → Score = 15580.1537
[GridSearch] Params {'n_estimators': 400, 'max_depth': 3} → Score = 16215.6352
[GridSearch] Params {'n_estimators': 400, 'max_depth': 5} → Score = 15984.2024
[GridSearch] Params {'n_estimators': 400, 'max_depth': None} → Score = 15565.0770

Best Parameters: {'n_estimators': np.float64(400.0), 'max_depth': np.float64(nan)}
Best Score: 15565.077


Unnamed: 0,n_estimators,max_depth,score
0,100,3.0,16098.50258
1,100,5.0,16025.264399
2,100,,15694.006
3,200,3.0,16455.861587
4,200,5.0,15794.958694
5,200,,15580.153714
6,400,3.0,16215.635169
7,400,5.0,15984.202443
8,400,,15565.077


In [6]:

param_grid = {
    "alpha": [0.001, 0.01, 0.1, 1, 10, 50, 100],
    "max_iter": [1000, 5000, 10000]
}


In [7]:

X = X.ffill().bfill()


In [8]:
from sklearn.linear_model import Lasso

param_grid = {
    "alpha": [0.001, 0.01, 0.1, 1, 10, 50, 100],
    "max_iter": [1000, 5000, 10000],
}

best_params, best_score, results_df = expanding_window_grid_search(
    Lasso,
    param_grid,
    X,
    y,
    initial_train_size=10,
    horizon=1,
    step=1
)

print("\nBest Parameters (Lasso):", best_params)
print("Best Score:", best_score)
results_df


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

[GridSearch] Params {'alpha': 0.001, 'max_iter': 1000} → Score = 32173.1635
[GridSearch] Params {'alpha': 0.001, 'max_iter': 5000} → Score = 32173.2141
[GridSearch] Params {'alpha': 0.001, 'max_iter': 10000} → Score = 32173.2141
[GridSearch] Params {'alpha': 0.01, 'max_iter': 1000} → Score = 32173.1595
[GridSearch] Params {'alpha': 0.01, 'max_iter': 5000} → Score = 32173.2102
[GridSearch] Params {'alpha': 0.01, 'max_iter': 10000} → Score = 32173.2102
[GridSearch] Params {'alpha': 0.1, 'max_iter': 1000} → Score = 32173.1204
[GridSearch] Params {'alpha': 0.1, 'max_iter': 5000} → Score = 32173.1712
[GridSearch] Params {'alpha': 0.1, 'max_iter': 10000} → Score = 32173.1712
[GridSearch] Params {'alpha': 1, 'max_iter': 1000} → Score = 32172.7898
[GridSearch] Params {'alpha': 1, 'max_iter': 5000} → Score = 32172.8398
[GridSearch] Params {'alpha': 1, 'max_iter': 10000} → Score = 32172.8398
[GridSearch] Params {'alpha': 10, 'max_iter': 1000} → Score = 32170.1136
[GridSearch] Params {'alpha': 10

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[GridSearch] Params {'alpha': 50, 'max_iter': 5000} → Score = 32158.2297
[GridSearch] Params {'alpha': 50, 'max_iter': 10000} → Score = 32158.2297
[GridSearch] Params {'alpha': 100, 'max_iter': 1000} → Score = 32143.3432
[GridSearch] Params {'alpha': 100, 'max_iter': 5000} → Score = 32143.3432
[GridSearch] Params {'alpha': 100, 'max_iter': 10000} → Score = 32143.3432

Best Parameters (Lasso): {'alpha': np.float64(100.0), 'max_iter': np.float64(1000.0)}
Best Score: 32143.343217253165


Unnamed: 0,alpha,max_iter,score
0,0.001,1000,32173.163451
1,0.001,5000,32173.21411
2,0.001,10000,32173.21411
3,0.01,1000,32173.159547
4,0.01,5000,32173.210216
5,0.01,10000,32173.210216
6,0.1,1000,32173.120406
7,0.1,5000,32173.171155
8,0.1,10000,32173.171155
9,1.0,1000,32172.789753


# train a lasso with alpha 0.001 and max iter 1000 and save it

In [10]:
from sklearn.linear_model import Lasso

# Initialize model
lasso = Lasso(alpha=100, max_iter=1000)

# Train model
lasso.fit(X, y)


In [11]:
import joblib

joblib.dump(lasso, "Lasso_last.joblib")

['Lasso_last.joblib']

In [12]:
lasso.predict(X[-1:])

array([206849.97750722])

In [13]:
X[-1:]

Unnamed: 0,Year,Population
44,2024,5269939


# Train Random forest & Save it

In [14]:
rd = RandomForestRegressor(n_estimators=400, max_depth=None)

rd.fit(X, y)

In [15]:
X[-1:]

Unnamed: 0,Year,Population
44,2024,5269939


In [16]:
rd.predict(X[-1:])

array([219593.195])

In [17]:
joblib.dump(rd, "RandomForest_last.joblib")

['RandomForest_last.joblib']