In [2]:
## Run with Python 3.12.1

## Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV


In [3]:
# Load the labelled training table and the evaluation table.
# The first CSV column of each file is used as the row index.
train_data, evaluation_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'train_data_features_extracted_filtered.csv',
        'evaluation_data_features_extracted_filtered.csv',
    )
)

In [4]:
# Seed reused wherever a random_state is accepted, so runs are repeatable
random_state = 42

# Base Random Forest regressor; every other hyper-parameter comes from param_grid
# https://scikit-learn.org/1.6/modules/generated/sklearn.ensemble.RandomForestRegressor.html#randomforestregressor
rf = RandomForestRegressor(random_state=random_state)

# Candidate hyper-parameter values (1*3*3*2*2*2*2 = 144 combinations).
# NOTE(review): per the scikit-learn docs a split must leave at least
# min_samples_leaf samples on each side, so min_samples_split values at or
# below 2 * min_samples_leaf presumably produce identical trees -- consider
# pruning the min_samples_split candidates; confirm before changing.
param_grid = dict(
    n_estimators=[1000],                     # trees in the forest
    max_depth=[15, 20, 30],                  # depth limit of each tree
    min_samples_split=[2, 5, 10],            # samples needed before an internal node may split
    min_samples_leaf=[3, 5],                 # samples that must remain in every leaf
    max_features=[0.5, 0.65],                # share of features tried at each split
    max_samples=[0.5, 0.65],                 # share of rows drawn for each tree
    criterion=['poisson', 'squared_error'],  # split-quality measure
)

In [None]:
# Separate the target column ('age') from the feature columns
X = train_data.drop(columns=['age'])
y = train_data['age']

# Hold out 20% of the labelled rows; they are only used for the final RMSE below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# The training data is not balanced across ages, so each sample is weighted
# according to how rare its age group is.
age_bins = [0, 45, 101]                                     # two groups: [0, 45) and [45, 101)
y_train_bins = pd.cut(y_train, bins=age_bins, right=False)  # right=False: intervals are left-closed, right-open

# pd.cut yields NaN for ages outside [0, 101) (or missing ages), which would
# turn into NaN sample weights; fail here with a clear message instead.
if y_train_bins.isna().any():
    raise ValueError(
        f"{int(y_train_bins.isna().sum())} training ages fall outside the bins {age_bins}"
    )

group_counts = y_train_bins.value_counts() / len(y_train)   # FRACTION of training samples in each age group (indexed by group)
group_weights = 1 - group_counts                            # weight = 1 - fraction, so the rarer group weighs more

# Map every sample to the weight of its age group. Mapping a categorical
# Series can return a categorical result, so cast to plain floats.
train_sample_weights = y_train_bins.map(group_weights).astype(float)

# 5-fold grid search over param_grid, scored by negative MSE.
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=5, n_jobs=-1,
                           scoring='neg_mean_squared_error')

# sample_weight is forwarded to the estimator's fit.
# NOTE(review): presumably the CV score itself stays unweighted -- confirm
# this is intended; the hold-out RMSE below is unweighted.
grid_search.fit(X_train, y_train, sample_weight=train_sample_weights)

# Best model found by the search, evaluated on the held-out rows
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Root mean squared error on the hold-out set
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Best Parameters: {grid_search.best_params_}")
print(f"RMSE: {rmse}\n")


In [None]:
# Scatter the hold-out predictions against the true ages
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed red identity line: points on it are predicted exactly
ax.plot([0, 100], [0, 100], color='red', linestyle='--')

ax.set_xlabel('True Age', fontsize=22)
ax.set_ylabel('Predicted Age', fontsize=22)

# Enlarge the tick labels on both axes
plt.setp(ax.get_xticklabels(), fontsize=18)
plt.setp(ax.get_yticklabels(), fontsize=18)

ax.grid(True)

# Write the figure to disk before showing it
fig.savefig('RF.png', dpi=300)
plt.show()

In [None]:
## Evaluation data

# Predict with the tuned forest and wrap the result in a one-column frame
eval_predictions = best_rf.predict(evaluation_data)
y_eval = pd.DataFrame({'Predicted': eval_predictions})
display(y_eval)

# Export the predictions; the frame's index is written as the 'Id' column.
# NOTE(review): y_eval gets a fresh 0..n-1 index, not the index of
# evaluation_data -- confirm that is the Id the consumer of this file expects.
y_eval.to_csv('evaluation_predictions.csv', index_label='Id', index=True)