# Train All Single Models
1. Linear Regression
   - Elastic Net
   - Ridge
   - Lasso
2. Boosting Decision Trees
   - GBR
   - LightGBM
   - XGBoostRegressor (an optimized GBDT implementation that adds L1/L2 regularization)
3. RandomForest
   - RandomForest(RF)
   - ExtremeRandomTrees(ERT)
4. Kernel Ridge Regression(KRR)
5. K-Nearest Neighbor(KNN)
6. Support Vector Regressor(SVR)

In [None]:
# import libraries
import os
import sys
import numpy as np
import pandas as pd
# Single seed shared by NumPy's global RNG and every estimator below,
# so repeated runs of the notebook give the same results.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Working directory of the notebook; the module, data and output paths used
# in the following cells are all built relative to it.
current_path = os.getcwd()
# Echo the resolved path as the cell output.
current_path

In [None]:
# Put the parent of the working directory at the front of the import path so
# that the local ``Utools`` package can be resolved.
sys.path.insert(0, os.path.join(current_path, '../'))
# Project-local helpers: the feature-importance plotting function and the
# wrapper used below to train, evaluate and save each estimator.
# (The original statement listed ``plot_feature_importance`` twice.)
from Utools.draw import plot_feature_importance
from Utools.SingleModel import SingleModel

In [None]:
from sklearn.base import clone
# Import sklearn models
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

## Load Data

In [None]:
# Root folder holding one sub-folder per data source ('dft' and 'exp'),
# each containing a train.csv and a test.csv.
file_dir = os.path.join(current_path, '../Data/composition_data/feature_data')
# Read the four splits in a fixed order: dft train/test, then exp train/test.
dft_train, dft_test, exp_train, exp_test = (
    pd.read_csv(os.path.join(file_dir, source_name, split_file))
    for source_name in ('dft', 'exp')
    for split_file in ('train.csv', 'test.csv')
)

In [None]:
def _split_features_target(frame):
    """Return ``(X, y)`` for one split.

    ``X`` is every column except 'composition' and 'band_gap';
    ``y`` is the 'band_gap' column.
    """
    features = frame.drop(columns=['composition', 'band_gap'])
    target = frame['band_gap']
    return features, target


# Training splits
dft_train_X, dft_train_y = _split_features_target(dft_train)
exp_train_X, exp_train_y = _split_features_target(exp_train)
# Test splits
dft_test_X, dft_test_y = _split_features_target(dft_test)
exp_test_X, exp_test_y = _split_features_target(exp_test)

In [None]:
# Empty table that accumulates one row per (model, train set, test set)
# combination with its R², RMSE and MAE scores.
metric_columns = ['Model', 'Train_set', 'Test_set', 'R²', 'RMSE', 'MAE']
metrics_df = pd.DataFrame(columns=metric_columns)

## Model Training

In [None]:
# Settings shared by the two bagged-tree ensembles (random forest and
# extremely randomized trees).
_forest_params = dict(n_estimators=500, max_depth=None, max_features=0.25,
                      random_state=RANDOM_SEED, n_jobs=-1)

# 'KernelRidge' entry: an RBF Nystroem feature map followed by a linear Ridge,
# i.e. an approximate kernel ridge regression rather than sklearn's KernelRidge.
_kernel_ridge = Pipeline(steps=[
    ('nystroem', Nystroem(kernel='rbf', n_components=8000, random_state=RANDOM_SEED)),
    ('ridge', Ridge(alpha=1.0)),
])

# Name -> unfitted estimator. Insertion order is the training order.
models = {
    # --- Linear models ---
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=RANDOM_SEED),
    'Ridge': Ridge(alpha=0.1, random_state=RANDOM_SEED),
    'Lasso': Lasso(alpha=0.1, random_state=RANDOM_SEED),
    # --- Gradient-boosted decision trees ---
    'GBR': GradientBoostingRegressor(
        n_estimators=500, learning_rate=0.1, max_features='sqrt',
        random_state=RANDOM_SEED,
    ),
    'LightGBM': LGBMRegressor(
        objective='regression', n_estimators=500, learning_rate=0.05,
        reg_alpha=0.1, reg_lambda=0.1, max_depth=-1, random_state=RANDOM_SEED,
    ),
    'XGBoost': XGBRegressor(
        objective='reg:squarederror', n_estimators=500, learning_rate=0.1,
        max_depth=4, random_state=RANDOM_SEED,
    ),
    # --- Bagged tree ensembles ---
    'RandomForest': RandomForestRegressor(**_forest_params),
    'ExtraRandomTrees': ExtraTreesRegressor(**_forest_params),
    # --- Kernel ridge regression (Nystroem approximation) ---
    'KernelRidge': _kernel_ridge,
    # --- K-nearest neighbours ---
    'KNN': KNeighborsRegressor(n_neighbors=10, weights='uniform', algorithm='auto', n_jobs=-1),
    # --- Support vector regression ---
    'SVR': SVR(kernel='rbf', C=25, epsilon=0.07, gamma=0.01),
}

In [None]:
# Train every estimator in ``models`` twice (once on DFT data, once on EXP
# data), evaluate each fitted model on both test sets, and persist the models,
# figures and metrics under ``<current_path>/<model_name>/``.
#
# Changes from the previous version of this cell:
#   * the four copy-pasted evaluate/record stanzas are driven by one loop;
#   * per-model metric tables are collected in a list and concatenated once,
#     which avoids concatenating onto an empty frame (FutureWarning in recent
#     pandas) and avoids re-copying the growing table on every iteration;
#   * ``metrics_df`` is rebuilt from this run only, so re-running the cell no
#     longer appends duplicate rows.
metric_columns = ['Model', 'Train_set', 'Test_set', 'R²', 'RMSE', 'MAE']
# Data-source label -> (features, target). The labels are stored verbatim in
# the 'Train_set' / 'Test_set' columns and used in output file names.
train_sets = {'dft': (dft_train_X, dft_train_y), 'exp': (exp_train_X, exp_train_y)}
test_sets = {'dft': (dft_test_X, dft_test_y), 'exp': (exp_test_X, exp_test_y)}
# Only these models get a feature-importance plot.
importance_models = ('LightGBM', 'XGBoost', 'RandomForest', 'ExtraRandomTrees')

model_nums = len(models)
all_model_metrics = []
# Loop through each model
for model_number, (model_name, model) in enumerate(models.items(), start=1):
    # Output layout: <model_name>/figures and <model_name>/model
    file_path = os.path.join(current_path, model_name)
    fig_path = os.path.join(file_path, 'figures')
    model_path = os.path.join(file_path, 'model')
    os.makedirs(file_path, exist_ok=True)
    os.makedirs(fig_path, exist_ok=True)
    os.makedirs(model_path, exist_ok=True)

    metric_rows = []       # one dict per (train set, test set) pair
    trained_models = {}    # data-source label -> fitted SingleModel
    for train_name, (train_X, train_y) in train_sets.items():
        train_tag = train_name.upper()
        print("#" * 100)
        print(f"[{model_number}/{model_nums} - {train_tag} - {model_name}] Training {model_name} on {train_tag} data:")
        print("-" * 100)

        # clone() gives each data source its own unfitted copy of the estimator.
        single_model = SingleModel(clone(model), random_state=RANDOM_SEED)
        single_model.train(train_X, train_y)
        trained_models[train_name] = single_model

        # Evaluate in-domain first, then on the other data source's test set.
        test_order = [train_name] + [name for name in test_sets if name != train_name]
        for test_name in test_order:
            test_X, test_y = test_sets[test_name]
            print(f"Evaluating {train_tag} {model_name} on {test_name.upper()} test set:")
            metrics = single_model.evaluate(
                test_X, test_y,
                fig_path=os.path.join(fig_path, f'{train_name}_train_{test_name}_test.png'))
            metric_rows.append({
                'Model': model_name,
                'Train_set': train_name,
                'Test_set': test_name,
                'R²': metrics['r2'],
                'RMSE': metrics['rmse'],
                'MAE': metrics['mae'],
            })

    # Keep the historical variable names available to later cells.
    dft_model = trained_models['dft']
    exp_model = trained_models['exp']

    # save models
    dft_model.save_model(os.path.join(model_path, f'dft_{model_name}.pkl'))
    exp_model.save_model(os.path.join(model_path, f'exp_{model_name}.pkl'))

    # save this model's metrics and queue them for the combined table
    model_metrics = pd.DataFrame(metric_rows, columns=metric_columns)
    model_metrics.to_csv(os.path.join(file_path, f'{model_name}_metrics.csv'), index=False)
    all_model_metrics.append(model_metrics)

    # Feature importance plots, only for the models listed above
    if model_name in importance_models:
        print(f"Plotting feature importance for {model_name} on DFT data:")
        dft_importances_df = plot_feature_importance(
            dft_model.get_model(), dft_train_X.columns, top_n=10,
            fig_path=os.path.join(fig_path, 'dft_feature_importance.png'))
        print(f"Plotting feature importance for {model_name} on EXP data:")
        exp_importances_df = plot_feature_importance(
            exp_model.get_model(), exp_train_X.columns, top_n=10,
            fig_path=os.path.join(fig_path, 'exp_feature_importance.png'))

# Build the combined table once; fall back to an empty, correctly-shaped frame
# when ``models`` is empty (pd.concat raises on an empty list).
if all_model_metrics:
    metrics_df = pd.concat(all_model_metrics, ignore_index=True)
else:
    metrics_df = pd.DataFrame(columns=metric_columns)
# save metrics to csv
metrics_df.to_csv(os.path.join(current_path, 'metrics.csv'), index=False)

In [None]:
# Display the combined metrics table (all models, all train/test pairs) as the cell output.
metrics_df