In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

In [21]:
df = pd.read_csv('cancer.csv')

X = df.drop('Age', axis=1)
y = df['Age']


In [22]:
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]

numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

In [23]:
# Define the model
model = GradientBoostingRegressor(random_state=0)

In [24]:
# Preprocessing pipeline
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [25]:
# Define hyperparameters to tune
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}

In [26]:
# Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('feature_selection', SelectFromModel(RandomForestRegressor())),
                           ('model', model)])


In [27]:
# Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [28]:
# Best parameters found during grid search
print("Best parameters found during grid search:")
print(grid_search.best_params_)


Best parameters found during grid search:
{'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 300}


In [29]:
# Evaluate model on test set
best_model = grid_search.best_estimator_
preds = best_model.predict(X_test)
score_mae = mean_absolute_error(y_test, preds)
score_mse = mean_squared_error(y_test, preds)
score_r2 = r2_score(y_test, preds)
print('\nEvaluation on test set:')
print('MAE:', score_mae)
print('MSE:', score_mse)
print('R^2:', score_r2)


Evaluation on test set:
MAE: 4.488660738208847
MSE: 61.6557620862966
R^2: 0.5093436854689848


In [30]:
# Cross-validation score
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='r2')
print('\nCross-validation R^2 scores:', cv_scores)
print('Mean CV R^2:', cv_scores.mean())


Cross-validation R^2 scores: [0.62705357 0.64325072 0.54768339 0.6235215  0.6127044 ]
Mean CV R^2: 0.6108427149599744


In [5]:
# Preprocessing of training data, fit model
#my_pipeline.fit(X_train, y_train)

#preds = my_pipeline.predict(X_test)

In [7]:
#score_mae = mean_absolute_error(y_test, preds)
#score_mse = mean_squared_error(y_test, preds)
#score_r2 = r2_score(y_test, preds)

#print('MAE:', score_mae)
#print('MSE:', score_mse)
#print('R^2:', score_r2)

MAE: 3.9199500000000005
MSE: 61.46458550000001
R^2: 0.5108650679980924
