In [1]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from eumap.misc import find_files, nan_percentile, GoogleSheet, ttprint

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, GroupKFold
import joblib

from sklearn.metrics import r2_score, mean_squared_error
from tool_kit import calc_ccc, accuracy_plot, uncertainty_plot
from sklearn.model_selection import train_test_split, cross_val_score, HalvingGridSearchCV, KFold, GroupKFold

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# read in necessary material
# `folder` is the project root; every data / feature / model path below is built from it.
folder = '/mnt/primus/xuemeng_tmp_harbour/soc'
cal = pd.read_csv(f'{folder}/data/006.0_cal.pnts_oc.csv',low_memory=False)
# cal = pd.read_csv(f'{folder}/data/006.1_cal.pnts_oc.org.csv',low_memory=False)
# cal = pd.read_csv(f'{folder}/data/006.2_cal.pnts_oc.mnr.csv',low_memory=False)

# Stack ten copies of the calibration table on top of each other.
# Each copy keeps its original `tile_id`, so duplicated rows share a group value.
# NOTE(review): presumably this inflates the data set for the timing benchmark
# further down — confirm it is intended, and remove it for real model fitting.
cal = pd.concat([cal] * 10, ignore_index=True)

# covariates
# One covariate name per line; blank lines are skipped so that an empty
# string never ends up in `covs` (it would break the `dropna` below).
with open(f'{folder}/SOC-EU/features/002_selected.covar_rank.freq.txt', 'r') as file:
    covs = [line.strip() for line in file if line.strip()]

# dataset
# Keep only the rows that have a value for every selected covariate.
cal = cal.dropna(subset=covs,how='any')

# target variable
tgt = 'oc_log1p'
# tgt= 'oc'

# spatial cross validation
# Folds are formed from the values of the `tile_id` column (5 group folds).
spatial_cv_column = 'tile_id'
cv = GroupKFold(n_splits=5)

# score
# Wrap `calc_ccc` as a scikit-learn scorer where higher values are better.
from tool_kit import calc_ccc
from sklearn.metrics import make_scorer
ccc_scorer = make_scorer(calc_ccc, greater_is_better=True)

### Parameter fine tuning

In [2]:
# Labels that end up in the names of the saved model files.
score_name, space = 'ccc', 'log1p'

# Scorer used by the parameter search (alternative: 'neg_root_mean_squared_error').
fitting_score = ccc_scorer

# Numeric prefix for the model files written by the (currently disabled) ANN / cubist cells.
model_index = 5

# Per-sample weights: the square of the `oc_qa` column.
sample_weights = cal['oc_qa'].values ** 2

In [6]:
# # random forest
# Hyper-parameter grid explored by the successive-halving search.
param_grid = {
    'n_estimators': [200, 500, 800, 1000],
    'max_depth': [10, 20, 30],
    'max_features': [0.3, 0.5, 0.7, 'log2', 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

tune_rf = HalvingGridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    scoring=fitting_score,
    n_jobs=90,
    cv=cv,
    verbose=1
)

# Run the search exactly once (the previous version fitted it a second time
# before this branch, doubling the run time for the same result).
# An integer `sample_weights` means "no weighting"; any other value is
# forwarded to the fit call as `sample_weight`.
use_weights = not isinstance(sample_weights, int)
weight = '.weighted' if use_weights else ''
fit_kwargs = {'sample_weight': sample_weights} if use_weights else {}

ttprint(f'start fine tuning rf{weight}')
tune_rf.fit(cal[covs], cal[tgt], groups=cal[spatial_cv_column], **fit_kwargs)
ttprint("Finish fine tuning\nBest parameters found: ", tune_rf.best_params_)

# Persist the refitted best estimator; `weight` tags the file name.
# joblib.dump(tune_rf.best_params_, f'{folder}/SOC-EU/model/test_best.params_rf.{space}.{score_name}{weight}.joblib')
joblib.dump(tune_rf.best_estimator_, f'{folder}/SOC-EU/model/test_model.org.balanced_rf.{space}.{score_name}{weight}.joblib')


# # simple ANN
# model_index = model_index+1
# from sklearn.preprocessing import StandardScaler
# from sklearn.neural_network import MLPRegressor
# from sklearn.model_selection import HalvingGridSearchCV
# from sklearn.pipeline import Pipeline
# import joblib

# from sklearn import set_config
# set_config(enable_metadata_routing=True)

# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('mlp', MLPRegressor(max_iter=5000, early_stopping=True, random_state=42))
# ])
# pipeline['mlp'].set_score_request(sample_weight=True)

# param_grid_ann = {
#     'mlp__hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 100)],  # NN structure
#     'mlp__activation': ['tanh', 'relu'],  # commonly used activation functions in NN
#     'mlp__solver': ['adam', 'sgd'],  # optimizer
#     'mlp__alpha': [0.0001, 0.001, 0.01],  # regularization to prevent overfitting
#     'mlp__learning_rate': ['constant', 'adaptive'],  # how aggressive the weights update
#     'mlp__learning_rate_init': [0.001, 0.01]  # initial learning rate
    
# }

# # Define the HalvingGridSearchCV with the pipeline
# tune_ann = HalvingGridSearchCV(
#     estimator=pipeline,
#     param_grid=param_grid_ann,
#     scoring=fitting_score,
#     n_jobs=-1,
#     cv=cv,
#     verbose=1
# )

# if isinstance(sample_weights,int):
#     weight = ''
#     ttprint(f'start fine tuning ann{weight}')
#     tune_ann.fit(cal[covs], cal[tgt], groups=cal[spatial_cv_column])
# else:
#     weight = '.weighted'
#     ttprint(f'start fine tuning ann{weight}')
#     tune_ann.fit(cal[covs], cal[tgt], mlp__sample_weight = sample_weights, groups=cal[spatial_cv_column])
# ttprint("Finish fine tuning\nBest parameters found: ", tune_ann.best_params_)

# joblib.dump(tune_ann.best_params_, f'{folder}/SOC-EU/model/00{int(model_index)}.0_best.params_ann.log1p.{score_name}{weight}.joblib')
# joblib.dump(tune_ann.best_estimator_, f'{folder}/SOC-EU/model/00{int(model_index)}.1_model_ann.log1p.{score_name}{weight}.joblib')


# # cubist
# # model_index = model_index+1
# from cubist import Cubist
# # https://pypi.org/project/cubist/
# # rule-based predictive model
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import HalvingGridSearchCV
# import joblib
# from cubist import Cubist
# import warnings
# tgt='oc'
# from sklearn.base import BaseEstimator, TransformerMixin

# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('cubist', Cubist())
# ])


# # Define the parameter grid for Cubist within the pipeline
# param_cubist = {
#     'cubist__n_rules': [100, 300, 500],  # number of rules to be generated
#     'cubist__n_committees': [1, 5, 10],  # committee: ensembles of models
#     'cubist__neighbors': [None, 3, 6, 9],  # number of nearest neighbors to use when making a prediction
#     'cubist__unbiased': [False, True],  # whether or not to use an unbiased method of rule generation
#     'cubist__extrapolation': [0.02, 0.05],  # limits the extent to which predictions can extrapolate beyond the range of the calibration data, a fraction of the total range of the target variable
#     'cubist__sample': [None]  # fraction of the calibration data used in building each model, since the calibration dataset could be very small
# }


# # Define the HalvingGridSearchCV with the pipeline
# tune_cubist = HalvingGridSearchCV(
#     estimator=pipeline,
#     param_grid=param_cubist,
#     scoring=fitting_score,
#     n_jobs=90,
#     cv=cv
# )

# # Ensure the data retains feature names
# X_cal = pd.DataFrame(cal[covs].values, columns=covs)
# y_cal = cal[tgt]

# # Start fine-tuning process
# warnings.filterwarnings('ignore')
# if isinstance(sample_weights,int):
#     weight = ''
#     ttprint(f'start fine tuning cubist{weight}')
#     tune_cubist.fit(X_cal, y_cal, groups=cal[spatial_cv_column])
# else:
#     weight = '.weighted'
#     ttprint(f'start fine tuning cubist{weight}')
#     fit_params = {'cubist__sample_weight': sample_weights}
#     tune_cubist.fit(X_cal, y_cal, **fit_params, groups=cal[spatial_cv_column])
    
# ttprint("Finish fine tuning\nBest parameters found: ", tune_cubist.best_params_)

# # Save the best parameters and model
# joblib.dump(tune_cubist.best_params_, f'{folder}/SOC-EU/model/00{int(model_index)}.0_best.params_cubist.log1p.{score_name}{weight}.joblib')
# joblib.dump(tune_cubist.best_estimator_, f'{folder}/SOC-EU/model/00{int(model_index)}.1_model_cubist.log1p.{score_name}{weight}.joblib')


In [8]:
import joblib

# Reload the stored random-forest parameters and force 90 parallel jobs.
params = joblib.load('/mnt/primus/xuemeng_tmp_harbour/soc/SOC-EU/model/test_base.model/004.0_best.params_rf.log1p.ccc.weighted.joblib')
params.update(n_jobs=90)

# Time a single weighted fit on the full calibration table; the two
# ttprint calls bracket the training so the elapsed time can be read off.
ttprint('start training')
rf_regressor = RandomForestRegressor(**params)
rf_regressor.fit(cal[covs], cal[tgt], sample_weight=sample_weights)
ttprint('finish training')

[12:29:43] start training
[12:31:18] finish training


In [9]:
# Save the fitted benchmark forest. The path is relative, so the file is
# written to the kernel's current working directory.
joblib.dump(rf_regressor, 'benchmark_rf.weighted.joblib')

['benchmark_rf.weighted.joblib']

### Ensemble machine learning
- loop through each possible combination
- record the metrics
- select the optimal combination of model stacking

In [None]:
import itertools
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from tool_kit import calc_ccc, accuracy_plot, uncertainty_plot
from sklearn.model_selection import cross_val_predict

# NOTE(review): `train`, `test`, `error_spatial_plot` and `sorted_plot` are not
# defined or imported in the visible part of this notebook; this cell only runs
# if they already exist in the kernel — confirm and define/import them above.

# Load models
model_list = find_files(f'{folder}/SOC-EU/model/','0*model*.joblib')
model_list = [str(i) for i in model_list]
models = [joblib.load(path) for path in model_list]
model_names = ["rf", "lasso", "ann", "cubist"]
print(model_list)

# The names are paired with the loaded files purely by position, and zip()
# would silently drop the surplus on a length mismatch, so fail loudly instead.
# NOTE(review): this also assumes find_files returns the files in the order
# rf, lasso, ann, cubist — verify against the printed list.
if len(models) != len(model_names):
    raise ValueError(
        f'found {len(models)} model files but {len(model_names)} model names; '
        'cannot pair models with names'
    )

# Generate all combinations of models (2, 3, and 4)
named_models = list(zip(models, model_names))
combinations = []
for r in range(2, 5):
    combinations.extend(itertools.combinations(named_models, r))

# training dataset
# At most 10 randomly drawn rows per spatial group (no seed is set, so the
# draw differs between runs).
# NOTE(review): the original remark here said "44% data", which matches the
# frac=0.4 sampling used in a later cell rather than this per-group cap — confirm.
sampled_train = train.groupby(spatial_cv_column, group_keys=False).apply(lambda x: x.sample(min(len(x), 10)))

results = []
# Loop through each combination of models
for combination in combinations:
    member_names = [name for _, name in combination]
    # Only stacks that contain the random forest are evaluated.
    if 'rf' not in member_names:
        continue
    combi_name = ' + '.join(member_names)
    estimators = [(name, model) for model, name in combination]

    ttprint(f'fitting {combi_name}')
    # Define the Stacking Regressor
    stacking_regressor = StackingRegressor(
        estimators=estimators,
        final_estimator=LinearRegression()
    )

    # Fit the stacking regressor
#     y_pred = cross_val_predict(stacking_regressor, sampled_train[covs], sampled_train[tgt], cv=cv, groups=sampled_train[spatial_cv_column], n_jobs=90)
    stacking_regressor.fit(sampled_train[covs], sampled_train[tgt])
    ttprint('finish fitting')

    # Evaluate on `test` and draw the diagnostic plots for this stack.
    y_pred = stacking_regressor.predict(test[covs])
    r2, rmse, ccc = accuracy_plot(test[tgt], y_pred, combi_name) # visualization
    error_spatial_plot(test[tgt], y_pred, test['lat'], test['lon'], combi_name)
    sorted_plot(test[tgt],y_pred,combi_name)

    # Store the results
    # The "_CV" suffix is kept for compatibility with the existing CSV layout,
    # although the metrics above are computed from predictions on `test`.
    results.append({
        "Models": combi_name,
        "R2_CV": r2,
        "RMSE_CV": rmse,
        "CCC_CV": ccc
    })

# Create a DataFrame to store the results
results_df = pd.DataFrame(results)

results_df.to_csv(f'{folder}/SOC-EU/model/011_metrics_cv.eml.csv', index=False)
results_df

['/mnt/inca/soc_eu_model/SOC-EU/model/002_model_rf.joblib', '/mnt/inca/soc_eu_model/SOC-EU/model/004_model_lasso.joblib', '/mnt/inca/soc_eu_model/SOC-EU/model/006_model_ann.joblib', '/mnt/inca/soc_eu_model/SOC-EU/model/008_model_cubist.joblib']
[07:17:21] fitting rf + lasso


### mapie build

In [None]:
# Wrap `model` in a MAPIE regressor (method "minmax", 5 folds, 90 jobs) and fit
# it on the covariates / target of `X`, passing the spatial column as `groups`.
# NOTE(review): `MapieRegressor`, `model` and `X` are not imported or defined in
# the visible part of this notebook — confirm where they come from.
# NOTE(review): `cv=5` is a plain integer while `groups` is passed to fit(); for
# the folds to actually be spatial, a group-aware splitter (e.g. the GroupKFold
# stored in `cv` by the first cell) would presumably be needed — confirm against
# the MAPIE documentation.
mapie = MapieRegressor(model, method="minmax", cv=5, n_jobs=90) # this cv is to compute the conformal scores, and spatial cross validation
mapie.fit(X[covs], X[tgt], groups=X[spatial_cv_column])

In [3]:
# Draw a random 40% of the rows inside every spatial group of `train`.
# No random_state is given, so the selection differs between runs.
# NOTE(review): `train` is not defined in the visible part of this notebook — confirm its origin.
sampled_train = train.groupby(spatial_cv_column, group_keys=False).apply(lambda x: x.sample(frac=0.4))

In [7]:
# Number of distinct spatial groups: first in the sample, then in the full training table.
for frame in (sampled_train, train):
    print(len(frame[spatial_cv_column].unique()))

5481
5481


In [13]:
# Share of the training rows that ended up in the sample.
len(sampled_train)/len(train)

0.4473296612392309

### lasso

In [None]:
# #lasso linear regression

# from sklearn.linear_model import Lasso

# param_grid_lasso = {
#     'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
# }

# tune_lasso = HalvingGridSearchCV(
#     estimator=Lasso(),
#     param_grid=param_grid_lasso,
#     scoring=ccc_scorer,
#     n_jobs=-1,
#     cv=cv,
#     verbose=1
# )

# ttprint(f'start parameter fine tuning for Lasso, training size: {len(train)}')
# tune_lasso.fit(train[covs], train[tgt], groups=train[spatial_cv_column])
# ttprint("Finish fine tuning\nBest parameters found: ", tune_lasso.best_params_)
# joblib.dump(tune_lasso.best_params_, f'{folder}/SOC-EU/model/003_best.params_lasso.joblib')
# joblib.dump(tune_lasso.best_estimator_, f'{folder}/SOC-EU/model/004_model_lasso.joblib')

# print(f'best parames in the initial test: alpha = 0.001, with negative_rmse as score')


# # train cubist with rmse in normal space
# from cubist import Cubist
# # https://pypi.org/project/cubist/
# # rule-based predictive model
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import HalvingGridSearchCV
# import joblib
# from cubist import Cubist

# tgt = 'oc'
# warnings.filterwarnings('ignore', message="X does not have valid feature names")

# # Define a pipeline that includes scalibration and the Cubist model
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('cubist', Cubist())
# ])

# # Define the parameter grid for Cubist within the pipeline
# param_cubist = {
#     'cubist__n_rules': [100, 300, 500],  # number of rules to be generated
#     'cubist__n_committees': [1, 5, 10],  # committee: ensembles of models
#     'cubist__neighbors': [None, 3, 6, 9],  # number of nearest neighbors to use when making a prediction
#     'cubist__unbiased': [False, True],  # whether or not to use an unbiased method of rule generation
#     'cubist__extrapolation': [0.02, 0.05],  # limits the extent to which predictions can extrapolate beyond the range of the calibration data, a fraction of the total range of the target variable
#     'cubist__sample': [None]  # fraction of the calibration data used in building each model
# }

# # Define the HalvingGridSearchCV with the pipeline
# tune_cubist = HalvingGridSearchCV(
#     estimator=pipeline,
#     param_grid=param_cubist,
#     scoring='neg_mean_squared_error',
#     n_jobs=90,
#     cv=cv
# )

# # Ensure the data retains feature names
# X_cal = pd.DataFrame(cal[covs].values, columns=covs)
# y_cal = cal[tgt]

# # Start fine-tuning process
# ttprint('start fine tuning cubist')
# tune_cubist.fit(X_cal, y_cal, groups=cal[spatial_cv_column])
# ttprint("Finish fine tuning\nBest parameters found: ", tune_cubist.best_params_)

# # Save the best parameters and model
# joblib.dump(tune_cubist.best_params_, f'{folder}/SOC-EU/model/004.0_best.params_cubist.normal.rmse.joblib')
# joblib.dump(tune_cubist.best_estimator_, f'{folder}/SOC-EU/model/004.1_model_cubist.normal.rmse.joblib')