In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

In [12]:
# Define features and target
X = df.drop('Age', axis=1)
y = df['Age']


In [13]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [14]:
# Define categorical and numerical columns
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

In [16]:
# Define the RandomForestRegressor model
model = RandomForestRegressor()

In [25]:
# Preprocessing pipeline
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [17]:
# Define hyperparameters to tune
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}

In [19]:
# Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [21]:
# Best parameters found during grid search
print("Best parameters found during grid search:")
print(grid_search.best_params_)


Best parameters found during grid search:
{'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200}


In [22]:
# Evaluate model on test set
best_model = grid_search.best_estimator_
preds = best_model.predict(X_test)
score_mae = mean_absolute_error(y_test, preds)
score_mse = mean_squared_error(y_test, preds)
score_r2 = r2_score(y_test, preds)
print('\nEvaluation on test set:')
print('MAE:', score_mae)
print('MSE:', score_mse)
print('R^2:', score_r2)


Evaluation on test set:
MAE: 4.467169780777708
MSE: 58.4537184168133
R^2: 0.534825536518641


In [23]:
# Cross-validation score
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='r2')
print('\nCross-validation R^2 scores:', cv_scores)
print('Mean CV R^2:', cv_scores.mean())


Cross-validation R^2 scores: [0.61661353 0.64248122 0.55369289 0.62336384 0.61996199]
Mean CV R^2: 0.6112226954759363


In [24]:
# Predict biological ages on the test set
biological_ages = best_model.predict(X_test)

# Print biological ages
print("Biological Ages:")
print(biological_ages)

# Write biological ages to a file
with open('biological_ages.txt', 'w') as file:
    for age in biological_ages:
        file.write(str(age) + '\n')


Biological Ages:
[37.05845905 43.38915803 31.06721311 36.58433645 33.47977078 38.38983184
 31.06721311 29.54787616 34.48059699 29.86286514 24.02713483 26.2094816
 25.33827582 31.51305632 29.03175375 47.02285713 35.07694191 43.38915803
 27.28966784 33.07855964 42.02979563 28.03582494 39.58158267 38.19888258
 35.07694191 39.58158267 42.12463542 37.78842049 29.71054929 17.036
 42.15199909 22.00071429 37.98591073 29.71054929 39.63914813 42.82598628
 31.92167207 33.47977078 35.14430984 42.12463542 34.92408565 40.50392038
 45.93364723 39.53818885 37.91824096 33.47977078 47.96994519 35.14430984
 29.71054929 39.53818885 36.58433645 31.06721311 29.86286514 36.98040528
 45.93364723 31.06721311 20.72138482 24.07697024 36.58433645 42.15199909
 27.28966784 39.58158267 42.02979563 35.06529373 45.93364723 45.93364723
 47.96994519 42.15199909 35.14430984 36.58433645 38.38983184 39.53818885
 47.09200061 47.09200061 32.03047952 51.94606144 51.94606144 15.20543632
 39.63914813 36.58433645 44.04913937 38.

In [None]:
# Preprocessing of training data, fit model
#my_pipeline.fit(X_train, y_train)

#preds = my_pipeline.predict(X_test)

In [None]:
#score_mae = mean_absolute_error(y_test, preds)
#score_mse = mean_squared_error(y_test, preds)
#score_r2 = r2_score(y_test, preds)

#print('MAE:', score_mae)
#print('MSE:', score_mse)
#print('R^2:', score_r2)