In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from scipy import stats
import xgboost as xgb
from catboost import CatBoostRegressor,CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
def evaluate(model, X_test, y_test):
    """Score a fitted regressor on held-out data and print a short report.

    Parameters
    ----------
    model : object
        Fitted estimator; only its ``predict`` method is called.
    X_test : array-like
        Features handed unchanged to ``model.predict``.
    y_test : array-like
        Ground-truth targets the predictions are compared against.

    Returns
    -------
    The value returned by ``model.predict(X_test)``.

    Side effects
    ------------
    Prints a ``Model Performance`` header followed by MSE, MAE, R2 and the
    Pearson correlation coefficient (PCC), each formatted to three decimals.
    """
    y_pred = model.predict(X_test)

    # All four scores are computed before anything is printed.
    # stats.pearsonr returns (coefficient, p-value); only the coefficient is shown.
    report = (
        ('MSE: {:0.3f}.', metrics.mean_squared_error(y_test, y_pred)),
        ('MAE = {:0.3f}.', metrics.mean_absolute_error(y_test, y_pred)),
        ('R2 = {:0.3f}.', metrics.r2_score(y_test, y_pred)),
        ('PCC = {:0.3f}.', stats.pearsonr(y_test, y_pred)[0]),
    )

    print('Model Performance')
    for template, score in report:
        print(template.format(score))

    return y_pred

# My paper's dataset

In [3]:
# Train / test / validation CSVs for the three datasets prefixed EC, SA and PA.
# Files are read in the order listed (train, test, val) for each prefix.

# EC
EC_train, EC_test, EC_val = map(pd.read_csv, (
    'My_paper/EC_X_train_40.csv',
    'My_paper/EC_X_test_40.csv',
    'My_paper/EC_X_val_40.csv',
))

# SA
SA_train, SA_test, SA_val = map(pd.read_csv, (
    'My_paper/SA_X_train_40.csv',
    'My_paper/SA_X_test_40.csv',
    'My_paper/SA_X_val_40.csv',
))

# PA
PA_train, PA_test, PA_val = map(pd.read_csv, (
    'My_paper/PA_X_train_40.csv',
    'My_paper/PA_X_test_40.csv',
    'My_paper/PA_X_val_40.csv',
))

# Row-wise stack (pd.concat's default axis=0) of the three training frames,
# in SA, EC, PA order; the original row indices are kept as-is.
Three_concat_train = pd.concat([SA_train, EC_train, PA_train])


In [4]:
# my paper
# Per-dataset benchmark: each regressor is fitted on one dataset's training
# split and scored with evaluate() on that same dataset's test split.
models = [
    ('RF', RandomForestRegressor(n_estimators=400, random_state=42)),
    # random_state=42 added so XGBoost carries an explicit seed like the other
    # three models here and like the XGBoost entry in the pooled-training cell.
    ('XGBoost', xgb.XGBRegressor(learning_rate=0.01, n_estimators=400, max_depth=None, random_state=42)),
    ('CatBoost', CatBoostRegressor(random_state=42, loss_function='RMSE', learning_rate=0.05, iterations=400, depth=5, verbose=False)),
    ('LGBM', lgb.LGBMRegressor(random_state=42, learning_rate=0.01, n_estimators=400))
]

# (label, training frame, test frame) triples, evaluated in this order.
datasets = [
    ('SA', SA_train, SA_test),
    ('EC', EC_train, EC_test),
    ('PA', PA_train, PA_test)
]

# Positional column layout used by this cell, named once instead of being
# repeated as bare numbers: the target is column 2, features are columns 5..249.
# NOTE(review): the pooled-training cell slices features as 5:-12 — confirm
# that both expressions select the same columns for these frames.
feature_cols = slice(5, 250)
target_col = 2

for model_name, model in models:
    print(model_name)
    for dataset_name, train_data, test_data in datasets:
        print(f'  {model_name}:{dataset_name}')
        X_train = train_data.iloc[:, feature_cols]
        y_train = train_data.iloc[:, target_col]
        X_test = test_data.iloc[:, feature_cols]
        y_test = test_data.iloc[:, target_col]

        # The same estimator object is fitted again for every dataset.
        model.fit(X_train, y_train)
        y_pred = evaluate(model, X_test, y_test)


RF
  RF:SA
Model Performance
MSE: 0.369.
MAE = 0.482.
R2 = 0.421.
PCC = 0.670.
  RF:EC
Model Performance
MSE: 0.294.
MAE = 0.409.
R2 = 0.480.
PCC = 0.697.
  RF:PA
Model Performance
MSE: 0.312.
MAE = 0.428.
R2 = 0.451.
PCC = 0.681.
XGBoost
  XGBoost:SA
Model Performance
MSE: 0.385.
MAE = 0.496.
R2 = 0.396.
PCC = 0.647.
  XGBoost:EC
Model Performance
MSE: 0.302.
MAE = 0.415.
R2 = 0.466.
PCC = 0.688.
  XGBoost:PA
Model Performance
MSE: 0.318.
MAE = 0.429.
R2 = 0.440.
PCC = 0.672.
CatBoost
  CatBoost:SA
Model Performance
MSE: 0.369.
MAE = 0.483.
R2 = 0.420.
PCC = 0.659.
  CatBoost:EC
Model Performance
MSE: 0.306.
MAE = 0.425.
R2 = 0.459.
PCC = 0.682.
  CatBoost:PA
Model Performance
MSE: 0.315.
MAE = 0.420.
R2 = 0.445.
PCC = 0.671.
LGBM
  LGBM:SA
Model Performance
MSE: 0.397.
MAE = 0.491.
R2 = 0.377.
PCC = 0.620.
  LGBM:EC
Model Performance
MSE: 0.299.
MAE = 0.418.
R2 = 0.472.
PCC = 0.692.
  LGBM:PA
Model Performance
MSE: 0.310.
MAE = 0.420.
R2 = 0.453.
PCC = 0.680.


In [5]:
# Pooled-training benchmark: each regressor is fitted once on the stacked
# SA + EC + PA training frame and then scored on every dataset's test split.
models = [
    (
        'Random Forest',
        RandomForestRegressor(n_estimators=400, random_state=42),
    ),
    (
        'XGBoost',
        xgb.XGBRegressor(learning_rate=0.01, n_estimators=1000, max_depth=None, random_state=42),
    ),
    (
        'CatBoost',
        CatBoostRegressor(random_state=42, loss_function='RMSE', learning_rate=0.05, iterations=400, depth=5, verbose=False),
    ),
    (
        'LGBM',
        lgb.LGBMRegressor(random_state=42, learning_rate=0.01, n_estimators=400),
    ),
]

# Test features per dataset: columns from position 5 up to (not including)
# the last 12. NOTE(review): the per-dataset cell uses 5:250 — confirm both
# slices pick the same columns.
X_test = dict(
    SA_X_test=SA_test.iloc[:, 5:-12],
    EC_X_test=EC_test.iloc[:, 5:-12],
    PA_X_test=PA_test.iloc[:, 5:-12],
)

# Test targets per dataset: column at position 2.
y_test = dict(
    SA_y_test=SA_test.iloc[:, 2],
    EC_y_test=EC_test.iloc[:, 2],
    PA_y_test=PA_test.iloc[:, 2],
)

for model_name, model in models:
    print(f"Fitting {model_name}...")
    X_pooled = Three_concat_train.iloc[:, 5:-12]
    y_pooled = Three_concat_train.iloc[:, 2]
    model.fit(X_pooled, y_pooled)

    # X and y are dictionary keys; they are paired by position, so this relies
    # on both dicts listing SA, EC, PA in the same insertion order.
    for X, y in zip(X_test.keys(), y_test.keys()):
        print(f'Evaluating {model_name} for {X} and {y}')
        best_pred = evaluate(model, X_test[X], y_test[y])


Fitting Random Forest...
Evaluating Random Forest for SA_X_test and SA_y_test
Model Performance
MSE: 0.336.
MAE = 0.451.
R2 = 0.473.
PCC = 0.691.
Evaluating Random Forest for EC_X_test and EC_y_test
Model Performance
MSE: 0.281.
MAE = 0.391.
R2 = 0.503.
PCC = 0.716.
Evaluating Random Forest for PA_X_test and PA_y_test
Model Performance
MSE: 0.251.
MAE = 0.366.
R2 = 0.558.
PCC = 0.761.
Fitting XGBoost...
Evaluating XGBoost for SA_X_test and SA_y_test
Model Performance
MSE: 0.334.
MAE = 0.468.
R2 = 0.475.
PCC = 0.702.
Evaluating XGBoost for EC_X_test and EC_y_test
Model Performance
MSE: 0.289.
MAE = 0.406.
R2 = 0.489.
PCC = 0.706.
Evaluating XGBoost for PA_X_test and PA_y_test
Model Performance
MSE: 0.274.
MAE = 0.380.
R2 = 0.517.
PCC = 0.722.
Fitting CatBoost...
Evaluating CatBoost for SA_X_test and SA_y_test
Model Performance
MSE: 0.396.
MAE = 0.509.
R2 = 0.379.
PCC = 0.637.
Evaluating CatBoost for EC_X_test and EC_y_test
Model Performance
MSE: 0.317.
MAE = 0.433.
R2 = 0.440.
PCC = 0.6

# My new collection dataset


In [55]:
# Train / test / validation CSVs for the three datasets prefixed EC, SA and PA.
# These assignments rebind the same variable names to the New_collection files.

# EC
EC_train, EC_test, EC_val = map(pd.read_csv, (
    'New_collection/EC_X_train.csv',
    'New_collection/EC_X_test.csv',
    'New_collection/EC_X_val.csv',
))

# SA
SA_train, SA_test, SA_val = map(pd.read_csv, (
    'New_collection/SA_X_train.csv',
    'New_collection/SA_X_test.csv',
    'New_collection/SA_X_val.csv',
))

# PA
PA_train, PA_test, PA_val = map(pd.read_csv, (
    'New_collection/PA_X_train.csv',
    'New_collection/PA_X_test.csv',
    'New_collection/PA_X_val.csv',
))

# Row-wise stack (pd.concat's default axis=0) of the three training frames,
# in SA, EC, PA order; the original row indices are kept as-is.
Three_concat_train = pd.concat([SA_train, EC_train, PA_train])


In [56]:
# new_collection
# Per-dataset benchmark: each regressor is fitted on one dataset's training
# split and scored with evaluate() on that same dataset's test split.
models = [
    ('RF', RandomForestRegressor(n_estimators=400, random_state=42)),
    # random_state=42 added so XGBoost carries an explicit seed like the other
    # three models here and like the XGBoost entry in the pooled-training cell.
    ('XGBoost', xgb.XGBRegressor(learning_rate=0.01, n_estimators=400, max_depth=None, random_state=42)),
    ('CatBoost', CatBoostRegressor(random_state=42, loss_function='RMSE', learning_rate=0.05, iterations=400, depth=5, verbose=False)),
    ('LGBM', lgb.LGBMRegressor(random_state=42, learning_rate=0.01, n_estimators=400))
]

# (label, training frame, test frame) triples, evaluated in this order.
datasets = [
    ('SA', SA_train, SA_test),
    ('EC', EC_train, EC_test),
    ('PA', PA_train, PA_test)
]

# Positional column layout used by this cell, named once instead of being
# repeated as bare numbers: the target is column 8, features are columns 9..253.
# NOTE(review): the pooled-training cell slices features as 9: (every column
# from position 9 onward) — confirm that both select the same columns.
feature_cols = slice(9, 254)
target_col = 8

for model_name, model in models:
    print(model_name)
    for dataset_name, train_data, test_data in datasets:
        print(f'  {model_name}:{dataset_name}')
        X_train = train_data.iloc[:, feature_cols]
        y_train = train_data.iloc[:, target_col]
        X_test = test_data.iloc[:, feature_cols]
        y_test = test_data.iloc[:, target_col]

        # The same estimator object is fitted again for every dataset.
        model.fit(X_train, y_train)
        y_pred = evaluate(model, X_test, y_test)


RF
  RF:SA
Model Performance
MSE: 0.355.
MAE = 0.471.
R2 = 0.411.
PCC = 0.649.
  RF:EC
Model Performance
MSE: 0.315.
MAE = 0.428.
R2 = 0.497.
PCC = 0.708.
  RF:PA
Model Performance
MSE: 0.316.
MAE = 0.434.
R2 = 0.456.
PCC = 0.685.
XGBoost
  XGBoost:SA
Model Performance
MSE: 0.376.
MAE = 0.494.
R2 = 0.376.
PCC = 0.624.
  XGBoost:EC
Model Performance
MSE: 0.335.
MAE = 0.452.
R2 = 0.465.
PCC = 0.686.
  XGBoost:PA
Model Performance
MSE: 0.331.
MAE = 0.453.
R2 = 0.431.
PCC = 0.673.
CatBoost
  CatBoost:SA
Model Performance
MSE: 0.387.
MAE = 0.503.
R2 = 0.359.
PCC = 0.604.
  CatBoost:EC
Model Performance
MSE: 0.332.
MAE = 0.449.
R2 = 0.471.
PCC = 0.688.
  CatBoost:PA
Model Performance
MSE: 0.332.
MAE = 0.454.
R2 = 0.429.
PCC = 0.662.
LGBM
  LGBM:SA
Model Performance
MSE: 0.382.
MAE = 0.500.
R2 = 0.367.
PCC = 0.613.
  LGBM:EC
Model Performance
MSE: 0.336.
MAE = 0.450.
R2 = 0.463.
PCC = 0.684.
  LGBM:PA
Model Performance
MSE: 0.327.
MAE = 0.449.
R2 = 0.437.
PCC = 0.672.


In [57]:
# new_collection
# Pooled-training benchmark: each regressor is fitted once on the stacked
# SA + EC + PA training frame and then scored on every dataset's test split.
models = [
    (
        'Random Forest',
        RandomForestRegressor(n_estimators=400, random_state=42),
    ),
    (
        'XGBoost',
        xgb.XGBRegressor(learning_rate=0.01, n_estimators=1000, max_depth=None, random_state=42),
    ),
    (
        'CatBoost',
        CatBoostRegressor(random_state=42, loss_function='RMSE', learning_rate=0.05, iterations=400, depth=5, verbose=False),
    ),
    (
        'LGBM',
        lgb.LGBMRegressor(random_state=42, learning_rate=0.01, n_estimators=400),
    ),
]

# Test features per dataset: every column from position 9 onward.
# NOTE(review): the per-dataset cell uses 9:254 — confirm both slices pick
# the same columns.
X_test = dict(
    SA_X_test=SA_test.iloc[:, 9:],
    EC_X_test=EC_test.iloc[:, 9:],
    PA_X_test=PA_test.iloc[:, 9:],
)

# Test targets per dataset: column at position 8.
y_test = dict(
    SA_y_test=SA_test.iloc[:, 8],
    EC_y_test=EC_test.iloc[:, 8],
    PA_y_test=PA_test.iloc[:, 8],
)

for model_name, model in models:
    print(f"Fitting {model_name}...")
    X_pooled = Three_concat_train.iloc[:, 9:]
    y_pooled = Three_concat_train.iloc[:, 8]
    model.fit(X_pooled, y_pooled)

    # X and y are dictionary keys; they are paired by position, so this relies
    # on both dicts listing SA, EC, PA in the same insertion order.
    for X, y in zip(X_test.keys(), y_test.keys()):
        print(f'Evaluating {model_name} for {X} and {y}')
        best_pred = evaluate(model, X_test[X], y_test[y])


Fitting Random Forest...
Evaluating Random Forest for SA_X_test and SA_y_test
Model Performance
MSE: 0.295.
MAE = 0.416.
R2 = 0.511.
PCC = 0.719.
Evaluating Random Forest for EC_X_test and EC_y_test
Model Performance
MSE: 0.251.
MAE = 0.374.
R2 = 0.600.
PCC = 0.777.
Evaluating Random Forest for PA_X_test and PA_y_test
Model Performance
MSE: 0.237.
MAE = 0.372.
R2 = 0.593.
PCC = 0.777.
Fitting XGBoost...
Evaluating XGBoost for SA_X_test and SA_y_test
Model Performance
MSE: 0.334.
MAE = 0.463.
R2 = 0.446.
PCC = 0.676.
Evaluating XGBoost for EC_X_test and EC_y_test
Model Performance
MSE: 0.279.
MAE = 0.410.
R2 = 0.554.
PCC = 0.750.
Evaluating XGBoost for PA_X_test and PA_y_test
Model Performance
MSE: 0.267.
MAE = 0.404.
R2 = 0.541.
PCC = 0.745.
Fitting CatBoost...
Evaluating CatBoost for SA_X_test and SA_y_test
Model Performance
MSE: 0.388.
MAE = 0.507.
R2 = 0.356.
PCC = 0.605.
Evaluating CatBoost for EC_X_test and EC_y_test
Model Performance
MSE: 0.327.
MAE = 0.449.
R2 = 0.478.
PCC = 0.7