<a href="https://colab.research.google.com/github/ARoDias/DataMiningProject_CTCT_2024Analysis/blob/main/CTCT_Data_Mining_Prediction_and_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Prediction and Classification notebook**

This notebook answers the following research question:
* **RQ5:** Can student performance in one theme of the CTCT course predict outcomes in subsequent themes?

**Important:**
Create a folder called **inputs** and store files there to load.

# 1. Import Libraries

In [None]:
# Install additional libraries that are not included by default in Google Colaboratory

# Install CatBoost
!pip install catboost

# Install XGBoost
!pip install xgboost

# Install LightGBM
!pip install lightgbm

# Install Imbalanced-learn (for SMOTE and other imbalanced data techniques)
#!pip install imbalanced-learn

# Install Python-docx (for working with .docx files if needed)
!pip install python-docx

# If you need to use Google Colab-specific commands
# !pip install google-colab




In [None]:
# Data manipulation and analysis
import pandas as pd  # Main library for data manipulation and analysis
import numpy as np  # Library for numerical operations
import os  # Provides a way to use operating system dependent functionality
import shutil  # Provides functions to operate on files and collections of files

# Model selection and evaluation
from sklearn.model_selection import train_test_split  # Function to split datasets into training and testing sets
from sklearn.model_selection import GridSearchCV  # Performs exhaustive search over specified parameter values for an estimator
from sklearn.model_selection import cross_val_score  # Evaluates a score by cross-validation

# Metrics for model evaluation
from sklearn.metrics import r2_score  # R^2 (coefficient of determination) regression score function
from sklearn.metrics import confusion_matrix  # Confusion matrix to evaluate the accuracy of a classification
from sklearn.metrics import classification_report  # Builds a text report showing the main classification metrics
from sklearn.metrics import accuracy_score  # Classification accuracy metric
from sklearn.metrics import mean_squared_error  # Mean squared error regression loss
from sklearn.metrics import mean_absolute_error  # Mean absolute error regression loss
from sklearn.metrics import f1_score  # F1 score for classification tasks
from sklearn.metrics import precision_score  # Precision metric for classification tasks
from sklearn.metrics import recall_score  # Recall metric for classification tasks
from sklearn.metrics import roc_auc_score  # ROC AUC metric for evaluating classification models

# Preprocessing tools
from sklearn.preprocessing import StandardScaler  # Standardize features by removing the mean and scaling to unit variance
from sklearn.preprocessing import MinMaxScaler  # Transforms features by scaling each feature to a given range
from sklearn.pipeline import Pipeline  # Allows assembling several steps that can be cross-validated together
from sklearn.decomposition import PCA  # Principal Component Analysis (PCA) for dimensionality reduction
from sklearn.impute import SimpleImputer  # Impute missing values for both numerical and categorical data
from sklearn.impute import KNNImputer  # K-Nearest Neighbors imputer for handling missing values
from sklearn.feature_selection import VarianceThreshold  # Removes features with low variance

# Machine learning models
from sklearn.linear_model import LogisticRegression  # Logistic Regression classifier
from sklearn.linear_model import LinearRegression  # Linear Regression model
from sklearn.tree import DecisionTreeClassifier  # Decision Tree classifier
from sklearn.tree import DecisionTreeRegressor  # Decision Tree regressor
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.ensemble import RandomForestRegressor  # Random Forest regressor
from sklearn.ensemble import StackingRegressor  # Stacking Regressor
from sklearn.naive_bayes import GaussianNB  # Naive Bayes classifier for Gaussian-distributed data
from sklearn.svm import SVC  # Support Vector Classifier
from sklearn.svm import SVR  # Support Vector Regressor
from sklearn.neighbors import KNeighborsClassifier  # k-Nearest Neighbors classifier
from sklearn.neighbors import KNeighborsRegressor  # k-Nearest Neighbors regressor
from sklearn.neural_network import MLPClassifier  # Multi-layer Perceptron classifier
from sklearn.neural_network import MLPRegressor  # Multi-layer Perceptron regressor
import xgboost as xgb  # Extreme Gradient Boosting for classification and regression tasks
import lightgbm as lgb  # LightGBM for classification and regression tasks
import catboost as cb  # CatBoost for classification and regression tasks

# Error handling
from sklearn.exceptions import ConvergenceWarning  # Handles convergence warnings in iterative algorithms
import warnings  # Python's built-in warnings module
warnings.filterwarnings('ignore', category=ConvergenceWarning)  # Ignore convergence warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Document generation and display (Optional)
from docx import Document  # Library to create and update Microsoft Word (.docx) files
from IPython.display import display as ipy_display  # Function to display objects in IPython notebooks

# Date and time utilities
from datetime import datetime  # Supplies classes for manipulating dates and times
import time  # Provides various time-related functions

# Data serialization
import pickle  # Python object serialization module
from io import StringIO  # In-memory file-like object

# Optional: Integration with Google Colaboratory (uncomment if needed)
from google.colab import files  # For file handling in Google Colaboratory

# Ensure directories exist
if not os.path.exists('inputs'):
    os.makedirs('inputs')

if not os.path.exists('best_models'):
    os.makedirs('best_models')


# 2. Load Datasets

## 2.1 Loading

###### load_feather_files

In [None]:
# Define a function to load Feather files and organize datasets by specific features or target columns

def load_feather_files(output_dir):
    """
    Load all Feather files from the specified output directory.

    Args:
    - output_dir (str): Directory where the Feather files are stored.

    Returns:
    - datasets (dict): Dictionary with filenames (without .feather extension) as keys and loaded DataFrames as values.
    """
    datasets = {}
    for file_name in os.listdir(output_dir):
        if file_name.endswith('.feather'):
            file_path = os.path.join(output_dir, file_name)
            dataset_name = os.path.splitext(file_name)[0]  # Remove the .feather extension
            datasets[dataset_name] = pd.read_feather(file_path)
            print(f"Loaded dataset from '{file_path}' with shape {datasets[dataset_name].shape}")
    return datasets

# Load all Feather files from the "inputs" folder
input_dir = 'inputs'
loaded_datasets = load_feather_files(input_dir)


Loaded dataset from 'inputs/predict_finalgrade_Lasso_less_restricted.feather' with shape (7890, 90)
Loaded dataset from 'inputs/predict_total_ap_Lasso_more_restricted_regression.feather' with shape (7890, 63)
Loaded dataset from 'inputs/predict_total_ap_RF_more_restricted.feather' with shape (7890, 63)
Loaded dataset from 'inputs/predict_passed_DT_less_restricted.feather' with shape (7890, 4)
Loaded dataset from 'inputs/predict_total_pp_RF_less_restricted.feather' with shape (7890, 144)
Loaded dataset from 'inputs/predict_total_spa_DT_less_restricted.feather' with shape (7890, 108)
Loaded dataset from 'inputs/combined_df_Passed.feather' with shape (7890, 68)
Loaded dataset from 'inputs/predict_passed_RF_more_restricted.feather' with shape (7890, 58)
Loaded dataset from 'inputs/predict_total_spa_DT_restricted.feather' with shape (7890, 108)
Loaded dataset from 'inputs/predict_total_pp_MI_more_restricted.feather' with shape (7890, 62)
Loaded dataset from 'inputs/predict_total_spa_MI_rest

In [None]:
#for file_name, df in loaded_datasets.items():
  #print("-"*50)
  #print(f"Dataset '{file_name}':")
  #print('Information:')
  #print(df.info())

## 2.2 Check Missing Values in Datasets

###### check_missing_values

In [None]:
def check_missing_values_in_single_df(dataset):
    """
    Check and print the missing values for a single dataset (DataFrame).

    Args:
    - dataset (DataFrame): A DataFrame representing the dataset.

    Functionality:
    - The function calculates the sum of missing values for each column.
    - If there are columns with missing values, it prints the count of missing values per column.
    """
    print("Missing values in the dataset:")
    missing_values = dataset.isnull().sum()
    # Filter and print only columns with missing values
    print(missing_values[missing_values > 0])


In [None]:
#for file_name, df in loaded_datasets.items():
  #print("-"*50)
  #print(f"Dataset '{file_name}':")
  #check_missing_values_in_single_df(df)

## 2.3  Organizing Loaded Datasets

###### create_filtered_dataframe

In [None]:
# Function to create filtered DataFrame with relevant columns and Year, keeping target column last
def create_filtered_dataframe(base_df, selected_columns, target_column):
    # Filter only the columns that exist in the base DataFrame
    filtered_columns = [col for col in selected_columns if col in base_df.columns]

    # Ensure 'Year' column is included if it exists
    if 'Year' in base_df.columns and 'Year' not in filtered_columns:
        filtered_columns.insert(0, 'Year')  # Ensure Year is the first column

    # Add target column as the last one
    if target_column in base_df.columns and target_column not in filtered_columns:
        filtered_columns.append(target_column)

    # Create and return the filtered DataFrame
    return base_df[filtered_columns]

# Define the target column mapping based on the filename suffix to organize datasets by target variables
target_column_mapping = {
    'predict_passed': 'Passed',
    'predict_finalgrade': 'FinalGrade',
    'predict_total_ap': 'Total_AP',
    'predict_total_pp': 'Total_PP',
    'predict_total_spa': 'Total_SPA'
}

# Filter datasets based on feature selection method
mi_datasets = {name: df for name, df in loaded_datasets.items() if '_MI_' in name}
rf_datasets = {name: df for name, df in loaded_datasets.items() if '_RF_' in name}
dt_datasets = {name: df for name, df in loaded_datasets.items() if '_DT_' in name}
lasso_datasets = {name: df for name, df in loaded_datasets.items() if '_Lasso_' in name}
fr_datasets = {name: df for name, df in loaded_datasets.items() if '_FR_' in name}

# Datasets without feature selection
without_feature_selection_datasets = {name: df for name, df in loaded_datasets.items() if 'combined_df' in name.lower()}

# Update the handling for FinalGrade specifically
for target_key, target_column in target_column_mapping.items():
    if target_column == 'FinalGrade':
        # Use the specific dataset for FinalGrade
        base_key = 'filtered_combined_df_FinalGrade'
    else:
        base_key = f'combined_df_{target_column}'

    if base_key in without_feature_selection_datasets:
        base_df = without_feature_selection_datasets[base_key]
    else:
        print(f"No base dataset found for {target_column}")
        continue

    # Iterate through each feature selection method and replace datasets
    for method, datasets in [('MI', mi_datasets), ('RF', rf_datasets), ('DT', dt_datasets), ('Lasso', lasso_datasets), ('FR', fr_datasets)]:
        for dataset_name, dataset_df in datasets.items():
            if f'predict_{target_column.lower()}' in dataset_name.lower() and f'_{method}_' in dataset_name:
                selected_columns = list(dataset_df.columns)
                # FinalGrade handling: Ensure we're removing the correct target column
                selected_columns.remove(target_column)

                # Create the filtered DataFrame using the utility function
                filtered_df = create_filtered_dataframe(base_df, selected_columns, target_column)

                # Replace the original DataFrame in loaded_datasets with the filtered DataFrame
                loaded_datasets[dataset_name] = filtered_df

                # Optional: print to verify the replacement
                print(f'Replaced dataset: {dataset_name}')
                print(loaded_datasets[dataset_name].columns)
                print(len(loaded_datasets[dataset_name]))


Replaced dataset: predict_passed_MI_restricted
Index(['Year', '24', '55', '1', '42', '20', '38', '9', '31', '133', '18',
       '144', '51', '1_8', '43', '19', '139', '23', '33', '10', '7', '50',
       '37', '1_11', '49', '135', '140', '44', '25', '94_1', '12', '2_11',
       '56', '5', '41', '4', '147', '52', '30', '11', '137', '36', '72_1',
       '39', '35', '46', '53', '27', '8', '32', '29', '22', '34', '28', '45',
       'Passed'],
      dtype='object')
7890
Replaced dataset: predict_passed_MI_more_restricted
Index(['Year', '10', '24', '41', '7', '55', '35', '50', '1', '147', '3', '46',
       '52', '42', '37', '53', '27', '45', '1_11', '5', '49', '30', '20',
       '135', '38', '140', '44', '11', '25', '31', '133', '18', '9', '144',
       '137', '8', '36', '51', '72_1', '32', '94_1', '12', '1_8', '43', '29',
       '19', '34', '28', '56', '139', '23', '39', '33', 'Passed'],
      dtype='object')
7890
Replaced dataset: predict_passed_MI_less_restricted
Index(['Year', '24', '55',

In [None]:
# Filter datasets based on the target column
passed_datasets = {name: df for name, df in loaded_datasets.items() if 'predict_passed' in name.lower()}
finalgrade_datasets = {name: df for name, df in loaded_datasets.items() if 'predict_finalgrade' in name.lower()}
total_ap_datasets = {name: df for name, df in loaded_datasets.items() if 'predict_total_ap' in name.lower()}
total_pp_datasets = {name: df for name, df in loaded_datasets.items() if 'predict_total_pp' in name.lower()}
total_spa_datasets = {name: df for name, df in loaded_datasets.items() if 'predict_total_spa' in name.lower()}
without_feature_selection_datasets = {name: df for name, df in loaded_datasets.items() if 'combined_df' in name.lower()}

# Combine feature selection methods with target columns
mi_finalgrade_datasets = {name: df for name, df in mi_datasets.items() if 'predict_finalgrade' in name.lower()}
rf_finalgrade_datasets = {name: df for name, df in rf_datasets.items() if 'predict_finalgrade' in name.lower()}
dt_finalgrade_datasets = {name: df for name, df in dt_datasets.items() if 'predict_finalgrade' in name.lower()}
lasso_finalgrade_datasets = {name: df for name, df in lasso_datasets.items() if 'predict_finalgrade' in name.lower()}
fr_finalgrade_datasets = {name: df for name, df in fr_datasets.items() if 'predict_finalgrade' in name.lower()}

mi_passed_datasets = {name: df for name, df in mi_datasets.items() if 'predict_passed' in name.lower()}
rf_passed_datasets = {name: df for name, df in rf_datasets.items() if 'predict_passed' in name.lower()}
dt_passed_datasets = {name: df for name, df in dt_datasets.items() if 'predict_passed' in name.lower()}
lasso_passed_datasets = {name: df for name, df in lasso_datasets.items() if 'predict_passed' in name.lower()}
fr_passed_datasets = {name: df for name, df in fr_datasets.items() if 'predict_passed' in name.lower()}

mi_total_ap_datasets = {name: df for name, df in mi_datasets.items() if 'predict_total_ap' in name.lower()}
rf_total_ap_datasets = {name: df for name, df in rf_datasets.items() if 'predict_total_ap' in name.lower()}
dt_total_ap_datasets = {name: df for name, df in dt_datasets.items() if 'predict_total_ap' in name.lower()}
lasso_total_ap_datasets = {name: df for name, df in lasso_datasets.items() if 'predict_total_ap' in name.lower()}
fr_total_ap_datasets = {name: df for name, df in fr_datasets.items() if 'predict_total_ap' in name.lower()}

mi_total_pp_datasets = {name: df for name, df in mi_datasets.items() if 'predict_total_pp' in name.lower()}
rf_total_pp_datasets = {name: df for name, df in rf_datasets.items() if 'predict_total_pp' in name.lower()}
dt_total_pp_datasets = {name: df for name, df in dt_datasets.items() if 'predict_total_pp' in name.lower()}
lasso_total_pp_datasets = {name: df for name, df in lasso_datasets.items() if 'predict_total_pp' in name.lower()}
fr_total_pp_datasets = {name: df for name, df in fr_datasets.items() if 'predict_total_pp' in name.lower()}

mi_total_spa_datasets = {name: df for name, df in mi_datasets.items() if 'predict_total_spa' in name.lower()}
rf_total_spa_datasets = {name: df for name, df in rf_datasets.items() if 'predict_total_spa' in name.lower()}
dt_total_spa_datasets = {name: df for name, df in dt_datasets.items() if 'predict_total_spa' in name.lower()}
lasso_total_spa_datasets = {name: df for name, df in lasso_datasets.items() if 'predict_total_spa' in name.lower()}
fr_total_spa_datasets = {name: df for name, df in fr_datasets.items() if 'predict_total_spa' in name.lower()}

# Print the count of loaded datasets in each category
print(f'Number of loaded datasets = {len(loaded_datasets)}')
print(f'Number of MI datasets = {len(mi_datasets)}')
print(f'Number of RF datasets = {len(rf_datasets)}')
print(f'Number of DT datasets = {len(dt_datasets)}')
print(f'Number of Lasso datasets = {len(lasso_datasets)}')
print(f'Number of FR datasets = {len(fr_datasets)}')

print(f'Number of datasets for predicting Passed column = {len(passed_datasets)}')
print(f'Number of datasets for predicting FinalGrade column = {len(finalgrade_datasets)}')
print(f'Number of datasets for predicting Total_AP column = {len(total_ap_datasets)}')
print(f'Number of datasets for predicting Total_PP column = {len(total_pp_datasets)}')
print(f'Number of datasets for predicting Total_SPA column = {len(total_spa_datasets)}')

print(f"Available datasets: {list(loaded_datasets.keys())}")
print(f"MI FinalGrade datasets: {list(mi_finalgrade_datasets.keys())}")
print(f"RF FinalGrade datasets: {list(rf_finalgrade_datasets.keys())}")
print(f"DT FinalGrade datasets: {list(dt_finalgrade_datasets.keys())}")
print(f"Lasso FinalGrade datasets: {list(lasso_finalgrade_datasets.keys())}")
print(f"FR FinalGrade datasets: {list(fr_finalgrade_datasets.keys())}")

print(f"MI Passed datasets: {list(mi_passed_datasets.keys())}")
print(f"RF Passed datasets: {list(rf_passed_datasets.keys())}")
print(f"DT Passed datasets: {list(dt_passed_datasets.keys())}")
print(f"Lasso Passed datasets: {list(lasso_passed_datasets.keys())}")
print(f"FR Passed datasets: {list(fr_passed_datasets.keys())}")

print(f'Number of MI Total_AP datasets = {len(mi_total_ap_datasets)}')
print(f'Number of RF Total_AP datasets = {len(rf_total_ap_datasets)}')
print(f'Number of DT Total_AP datasets = {len(dt_total_ap_datasets)}')
print(f'Number of Lasso Total_AP datasets = {len(lasso_total_ap_datasets)}')
print(f'Number of FR Total_AP datasets = {len(fr_total_ap_datasets)}')

print(f'Number of MI Total_PP datasets = {len(mi_total_pp_datasets)}')
print(f'Number of RF Total_PP datasets = {len(rf_total_pp_datasets)}')
print(f'Number of DT Total_PP datasets = {len(dt_total_pp_datasets)}')
print(f'Number of Lasso Total_PP datasets = {len(lasso_total_pp_datasets)}')
print(f'Number of FR Total_PP datasets = {len(fr_total_pp_datasets)}')

print(f'Number of MI Total_SPA datasets = {len(mi_total_spa_datasets)}')
print(f'Number of RF Total_SPA datasets = {len(rf_total_spa_datasets)}')
print(f'Number of DT Total_SPA datasets = {len(dt_total_spa_datasets)}')
print(f'Number of Lasso Total_SPA datasets = {len(lasso_total_spa_datasets)}')
print(f'Number of FR Total_SPA datasets = {len(fr_total_spa_datasets)}')

print(f'Number of datasets without feature selection = {len(without_feature_selection_datasets)}')


Number of loaded datasets = 59
Number of MI datasets = 12
Number of RF datasets = 18
Number of DT datasets = 12
Number of Lasso datasets = 6
Number of FR datasets = 6
Number of datasets for predicting Passed column = 9
Number of datasets for predicting FinalGrade column = 9
Number of datasets for predicting Total_AP column = 12
Number of datasets for predicting Total_PP column = 12
Number of datasets for predicting Total_SPA column = 12
Available datasets: ['predict_finalgrade_Lasso_less_restricted', 'predict_total_ap_Lasso_more_restricted_regression', 'predict_total_ap_RF_more_restricted', 'predict_passed_DT_less_restricted', 'predict_total_pp_RF_less_restricted', 'predict_total_spa_DT_less_restricted', 'combined_df_Passed', 'predict_passed_RF_more_restricted', 'predict_total_spa_DT_restricted', 'predict_total_pp_MI_more_restricted', 'predict_total_spa_MI_restricted', 'predict_total_pp_MI_less_restricted', 'predict_finalgrade_Lasso_restricted', 'predict_total_pp_DT_more_restricted', '

# 3. Auxiliary Methods

###### evaluate_combined_dfs

In [None]:
def evaluate_combined_dfs(datasets):
    """
    Evaluate the state of combined dataframes.

    Args:
    - datasets (dict): Dictionary containing the dataframes to be evaluated.

    Returns:
    - summary (dict): Dictionary containing the evaluation summary of each dataframe.
    """
    summary = {}

    # Loop through each dataframe in the provided datasets
    for df_name, df in datasets.items():
        if 'combined_df' in df_name:
            print(f"Evaluating dataframe: {df_name}")
            df_summary = {}

            # Shape of the dataframe
            df_summary['shape'] = df.shape
            print(f"Shape: {df.shape}")

            # Checking for missing data
            missing_data = df.isnull().sum()
            total_missing = missing_data.sum()
            df_summary['total_missing_values'] = total_missing
            df_summary['missing_percentage'] = (total_missing / df.size) * 100
            print(f"Total missing values: {total_missing}")
            print(f"Missing data percentage: {df_summary['missing_percentage']:.2f}%")

            # Numerical and categorical columns
            numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
            categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
            df_summary['numerical_columns'] = numerical_columns
            df_summary['categorical_columns'] = categorical_columns
            print(f"Numerical columns: {numerical_columns}")
            print(f"Categorical columns: {categorical_columns}")

            # Adding dataframe summary to the main summary
            summary[df_name] = df_summary
            print("-" * 80)

    return summary

## 3.1 Output handling methods

In [None]:
# Custom print function to capture output to a Word document
def print_to_doc(*args, **kwargs):
    text = ' '.join(map(str, args))
    doc.add_paragraph(text)

# Custom display function to capture output to a Word document
def custom_display(*args, **kwargs):
    if output_to_docx == 1:
        for arg in args:
            if isinstance(arg, pd.DataFrame):
                buf = StringIO()
                arg.to_string(buf)
                text = buf.getvalue()
            else:
                text = str(arg)
            doc.add_paragraph(text)
    else:
        ipy_display(*args, **kwargs)

# Function to save the document with the current date
def save_document(filename='output_modeling.docx'):
    current_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    doc.add_paragraph(f"Document saved on {current_date}")
    doc.save(filename)
    print(f"Document saved as {filename}")

print("Auxiliary methods defined successfully.")


Auxiliary methods defined successfully.


### Set Printing Mode

In [None]:
# Change Printing Mode
output_to_docx = 0  # Control output (0 for notebook, 1 for docx)

if output_to_docx == 1:
    print = print_to_doc
    display = custom_display

# Initialize a Word document to capture output
doc = Document()
doc.add_heading('Prediction Analysis Report', 0)

<docx.text.paragraph.Paragraph at 0x7cd6ed591ae0>

# 4. Feature Selection and Predictive Modeling Methods

## 4.1 Methods

###### get_target_column

In [None]:
# Define the target column based on the dataset name suffix
def get_target_column(dataset_name, target_column_mapping):
    for key, column in target_column_mapping.items():
        if key in dataset_name:
            return column
    raise ValueError(f"Unknown target column for dataset: {dataset_name}")

###### get_models

In [None]:
def get_models(task_type, class_weight_dict):
    if task_type == 'classification':
        return {
            'Logistic Regression': LogisticRegression(max_iter=10000, solver='saga', tol=1e-3, random_state=2024, C=0.3, class_weight=class_weight_dict),
            'Decision Tree': DecisionTreeClassifier(random_state=2024, class_weight=class_weight_dict),
            'Random Forest': RandomForestClassifier(random_state=2024, class_weight=class_weight_dict),
            'Naive Bayes': GaussianNB(),
            'Support Vector Machine': SVC(random_state=2024, probability=True, class_weight=class_weight_dict),
            'k-Nearest Neighbors': KNeighborsClassifier(),
            'Neural Network': MLPClassifier(max_iter=1000, random_state=42)
        }
    else:
        return {
            'Linear Regression': LinearRegression(),
            'Decision Tree Regressor': DecisionTreeRegressor(random_state=2024),
            'Random Forest Regressor': RandomForestRegressor(random_state=2024),
            'Support Vector Regressor': SVR(),
            'k-Nearest Neighbors Regressor': KNeighborsRegressor(),
            'Neural Network Regressor': MLPRegressor(max_iter=1000, random_state=2024)
        }

###### get_test_scores

In [None]:
def get_test_scores(task_type, y_test, y_pred, model, X_test):
    if task_type == 'classification':
        return {
            'Accuracy': accuracy_score(y_test, y_pred),
            'F1-Score': f1_score(y_test, y_pred, average='weighted'),
            'Precision': precision_score(y_test, y_pred, average='weighted'),
            'Recall': recall_score(y_test, y_pred, average='weighted'),
            'ROC AUC': roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) if hasattr(model, 'predict_proba') else None
        }
    else:
        return {
            'MSE': mean_squared_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred)
        }

###### is_better_model

In [None]:
def is_better_model(best_model_info, test_scores, task_type):
    if task_type == 'classification':
        return test_scores['F1-Score'] > best_model_info.get('test_scores', {}).get('F1-Score', 0)
    else:
        return test_scores['R2'] > best_model_info.get('test_scores', {}).get('R2', 0)


### 4.1.1 Dataset Processing

###### preprocess_data_with_indicators

In [None]:
# Function to preprocess data with missing data indicators, imputation, and scaling
def preprocess_data_with_indicators(df, target_column, apply_pca=False, n_components=None):
    X = df.drop(columns=[target_column])  # Remove the target column from the feature set
    y = df[target_column]  # Extract the target column

    # Create indicator variables for missing data
    indicator_columns = pd.get_dummies(X.isnull().astype(int), prefix_sep='_missing_', drop_first=True)

    # Impute missing values with zero (this can be adjusted based on the specific strategy needed)
    X_imputed = X.fillna(0)

    # Combine imputed data with the indicator variables
    X_combined = pd.concat([X_imputed, indicator_columns], axis=1)

    # Scale numeric features
    scaler = StandardScaler()
    X_combined_scaled = scaler.fit_transform(X_combined)

    # If PCA is requested, apply it to reduce dimensionality
    if apply_pca and n_components:
        pca = PCA(n_components=n_components)
        X_combined_scaled = pca.fit_transform(X_combined_scaled)

    return X_combined_scaled, y

###### preprocess_data_remove_missing

In [None]:
def preprocess_data_remove_missing(df, target_column):
    """
    Preprocess data by removing columns with any missing values and applying a variance threshold.

    Args:
    - df: DataFrame containing the dataset.
    - target_column: The name of the target column.

    Returns:
    - X_train, X_test, y_train, y_test: Processed training and testing data.
    """
    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)

    X_train = X_train.dropna(axis=1)
    X_test = X_test[X_train.columns]

    # Apply variance threshold to remove low variance features
    if X_train.shape[1] == 0 or X_test.shape[1] == 0:
        raise ValueError("No features remaining after preprocessing. Check data or adjust preprocessing steps.")

    pipeline = Pipeline([
        ('variance_threshold', VarianceThreshold(threshold=0.01))  # Adjust the threshold as needed
    ])

    # Apply the pipeline
    X_train = pipeline.fit_transform(X_train)
    X_test = pipeline.transform(X_test)

    # Check if after the variance threshold, no features remain
    if X_train.shape[1] == 0 or X_test.shape[1] == 0:
        raise ValueError("All features removed after applying variance threshold. Adjust the threshold or preprocessing.")

    return X_train, X_test, y_train, y_test


###### preprocess_data_with_custom_imputation

In [None]:
def preprocess_data_with_custom_imputation(df, target_column, imputation_strategy='', apply_pca=False, n_components=None, apply_scaling=True, columns_to_scale=None, group_col='Year'):
    """
    Preprocess data with custom imputation, optional Min-Max scaling, and PCA, ensuring no data leakage.

    Args:
    - df: DataFrame containing the dataset.
    - target_column: The name of the target column.
    - imputation_strategy: Strategy for imputing missing values ('mean', 'median', 'zero', 'knn', 'most_frequent', 'constant'). If empty or None, uses remove_missing approach.
    - apply_pca: Whether to apply PCA for dimensionality reduction.
    - n_components: Number of PCA components if PCA is applied.
    - apply_scaling: Whether to apply Min-Max scaling after imputation.
    - columns_to_scale: List of columns to scale if apply_scaling is True.
    - group_col: Column to group by before applying scaling, if applicable. Defaults to 'Year'.

    Returns:
    - X_train, X_test, y_train, y_test: Processed training and testing data.
    """

    # If no imputation strategy is provided, use preprocess_data_remove_missing
    if not imputation_strategy:
        return preprocess_data_remove_missing(df, target_column)

    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)

    # Choose the imputation strategy
    if imputation_strategy == 'mean':
        imputer = SimpleImputer(strategy='mean')
    elif imputation_strategy == 'median':
        imputer = SimpleImputer(strategy='median')
    elif imputation_strategy == 'zero':
        imputer = SimpleImputer(strategy='constant', fill_value=0)
    elif imputation_strategy == 'knn':
        imputer = KNNImputer(n_neighbors=5)
    elif imputation_strategy == 'most_frequent':
        imputer = SimpleImputer(strategy='most_frequent')
    elif imputation_strategy == 'constant':
        imputer = SimpleImputer(strategy='constant', fill_value=0)
    else:
        raise ValueError("Unsupported imputation strategy. Choose from 'mean', 'median', 'zero', 'knn', 'most_frequent', or 'constant'.")

    # Apply imputation
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    if apply_scaling:
        scaler = MinMaxScaler()

        # Check if the group_col exists in the dataframe before applying scaling
        if group_col in df.columns and columns_to_scale:
            print(f"Applying scaling grouped by '{group_col}'.")
            # Apply scaling within each group
            X_train = pd.DataFrame(X_train, columns=df.drop(columns=[target_column]).columns)
            X_test = pd.DataFrame(X_test, columns=df.drop(columns=[target_column]).columns)

            X_train[columns_to_scale] = X_train.groupby(group_col)[columns_to_scale].transform(lambda x: scaler.fit_transform(x))
            X_test[columns_to_scale] = X_test.groupby(group_col)[columns_to_scale].transform(lambda x: scaler.transform(x))
        else:
            if group_col not in df.columns:
                print(f"Column '{group_col}' not found. Scaling without grouping.")
            else:
                print(f"No specific columns to scale provided. Scaling all columns except the target.")

            # Apply scaling to the entire dataset or selected columns
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

    # Optionally apply PCA
    if apply_pca and n_components:
        pca = PCA(n_components=n_components)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

    return X_train, X_test, y_train, y_test


### 4.1.2 Model Training and Evaluation Functions

###### perform_grid_search_cv

In [None]:
# Function to perform grid search cross-validation
def perform_grid_search_cv(model, param_grid, X, y, cv=5):
    grid_search = GridSearchCV(model, param_grid, cv=cv, n_jobs=-1, error_score='raise')
    grid_search.fit(X, y)
    return grid_search.best_estimator_

###### evaluate_model

In [None]:
# Define a function to evaluate a model and print the classification report and accuracy score
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Accuracy Score:", accuracy_score(y_test, predictions))

###### perform_cross_validation

In [None]:
# Define a function to perform cross-validation and return the scores
def perform_cross_validation(model, X, y, cv=5):
    scores = cross_val_score(model, X, y, cv=cv)
    return scores

###### convert_to_multiclass

In [None]:
def convert_to_multiclass(y, bins=3, strategy='uniform'):
    """
    Convert a continuous target variable into a multiclass target by binning the values.

    Args:
    - y (pd.Series or np.ndarray): The continuous target variable.
    - bins (int): The number of bins to create for multiclass conversion.
    - strategy (str): The binning strategy ('uniform', 'quantile', 'kmeans').

    Returns:
    - y_multiclass (pd.Series or np.ndarray): The multiclass target variable.
    """
    try:
        labels = range(bins)

        if strategy == 'uniform':
            y_multiclass = pd.cut(y, bins=bins, labels=labels)
        elif strategy == 'quantile':
            y_multiclass = pd.qcut(y, q=bins, labels=labels, duplicates='drop')
        elif strategy == 'kmeans':
            from sklearn.cluster import KMeans
            kmeans = KMeans(n_clusters=bins)
            y_multiclass = kmeans.fit_predict(y.values.reshape(-1, 1))
            y_multiclass = pd.Series(y_multiclass).map(dict(enumerate(labels)))
        else:
            raise ValueError(f"Unsupported strategy: {strategy}")

        if len(pd.unique(y_multiclass)) < bins:
            print(f"Warning: Only {len(pd.unique(y_multiclass))} unique bins were created instead of {bins}.")

        return y_multiclass
    except ValueError as e:
        print(f"Error in convert_to_multiclass: {e}")
        return y  #


###### determine_task_type

In [None]:
# Function to determine task type
def determine_task_type(target_column):
    if target_column == 'Passed':
        return 'classification'
    elif target_column in ['Total_AP', 'Total_PP', 'Total_SPA']:
        return 'both'
    elif target_column == 'FinalGrade':
        return 'regression'
    else:
        return 'both'

###### train_and_evaluate_models

In [None]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test, task_type='classification', scoring=None, random_state=2024, cv=5):
    # Define models for classification and regression with random_state where applicable
    if task_type == 'classification':
        models = {
            'Logistic Regression': LogisticRegression(solver='saga', max_iter=1000, random_state=random_state),
            'Decision Tree': DecisionTreeClassifier(random_state=random_state),
            'Random Forest': RandomForestClassifier(random_state=random_state),
            'Support Vector Machine': SVC(probability=True, random_state=random_state),
            'k-Nearest Neighbors': KNeighborsClassifier(),
            'Neural Network': MLPClassifier(max_iter=1000, random_state=random_state)
        }
    elif task_type == 'regression':
        models = {
            'Linear Regression': LinearRegression(),
            'Decision Tree Regressor': DecisionTreeRegressor(random_state=random_state),
            'Random Forest Regressor': RandomForestRegressor(random_state=random_state),
            'Support Vector Machine': SVR(),
            'k-Nearest Neighbors': KNeighborsRegressor(),
            'Neural Network': MLPRegressor(max_iter=1000, random_state=random_state)
        }

    # Refined hyperparameter grids based on your findings and best practices
    param_grids = {
        'Logistic Regression': {'C': [0.1, 0.15, 0.3]},
        'Decision Tree': {'max_depth': [5, 10, None]},
        'Random Forest': {'n_estimators': [100, 200], 'max_depth': [10, 20, None]},
        'Support Vector Machine': {'C': [0.1, 0.3, 1]},
        'k-Nearest Neighbors': {'n_neighbors': [3, 5, 7]},
        'Neural Network': {'hidden_layer_sizes': [(50,), (100,)]}
    }

    results = {}
    test_scores = {}

    for model_name, model in models.items():
        print(f"Evaluating model: {model_name}")

        # Perform grid search if the model has a hyperparameter grid
        if model_name in param_grids:
            grid_search = GridSearchCV(model, param_grids[model_name], cv=cv, scoring=scoring)
            grid_search.fit(X_train, y_train)
            model = grid_search.best_estimator_
            print(f"Best parameters for {model_name}: {grid_search.best_params_}")

        # Continue with evaluation after optimization
        scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)
        results[model_name] = scores
        print(f"Cross-validation scores for {model_name}: {scores}")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if task_type == 'classification':
            test_scores[model_name] = {
                'Accuracy': accuracy_score(y_test, y_pred),
                'F1-Score': f1_score(y_test, y_pred, average='weighted')
            }
        elif task_type == 'regression':
            test_scores[model_name] = {
                'MSE': mean_squared_error(y_test, y_pred),
                'R2': r2_score(y_test, y_pred),
                'MAE': mean_absolute_error(y_test, y_pred)
            }

        print(f"Test set scores for {model_name}: {test_scores[model_name]}")

    return results, test_scores


###### analyze_datasets_for_target_column

In [None]:
def analyze_datasets_for_target_column(datasets_dict, target_column, exclusion_type, feature_set_name=None, imputation_strategy=None, apply_pca=False, apply_scaling=True, columns_to_scale=None, group_col=None, cv=5):
    """
    Analyze datasets for a specific target column. Optionally apply feature selection and exclusions.

    Parameters:
    - datasets_dict: Dictionary containing the datasets.
    - target_column: The target column to predict (e.g., 'Total_AP').
    - exclusion_type: The exclusion type to filter datasets (e.g., 'more_restricted').
    - feature_set_name: The feature selection method to use (e.g., 'MI'). If None, use the original dataset without feature selection.
    - imputation_strategy: Strategy for imputing missing data.
    - apply_pca: Boolean to apply PCA for dimensionality reduction.
    - apply_scaling: Boolean to apply scaling to the features.
    - columns_to_scale: Specific columns to scale (optional).
    - group_col: Column to group by (optional).
    - cv: Number of cross-validation folds.

    Returns:
    - results: Dictionary with the results of the analysis.
    - best_model_info: Tuple containing information about the best regression and classification models.
    """
    print(f"Processing datasets for target column: {target_column} with exclusion type: {exclusion_type}")

    # Filter datasets based on exclusion type, feature set, and target column
    target_column_lower = target_column.lower()

    if feature_set_name:
        selected_datasets = {
            name: df for name, df in datasets_dict.items()
            if (f"_{exclusion_type}" in name.lower() and
                f"_more_{exclusion_type}" not in name.lower() and
                f"_less_{exclusion_type}" not in name.lower() and
                f"_{feature_set_name.lower()}_" in name.lower() and
                target_column_lower in name.lower())
        }
    else:
        # Adjust for FinalGrade specifically
        base_key = f'filtered_combined_df_{target_column}' if target_column == 'FinalGrade' else f'combined_df_{target_column}'
        selected_datasets = {
            base_key: datasets_dict.get(base_key)
        }

    if not selected_datasets or all(df is None for df in selected_datasets.values()):
        print(f"No datasets found for the given criteria: {exclusion_type}, {feature_set_name}, {target_column}")
        return {}, None

    results = {}
    best_regression_model_info = None
    best_classification_model_info = None
    best_regression_score = float('inf')
    best_classification_score = float('-inf')

    start_time = time.time()

    for dataset_name, df in selected_datasets.items():
        if df is None:
            continue

        print("-" * 80)
        print(f"Processing dataset: {dataset_name}")

        # Determine the task type (classification, regression, or both)
        task_type = determine_task_type(target_column)

        # Handle missing data, scaling, and preprocessing
        X_train, X_test, y_train, y_test = preprocess_data_with_custom_imputation(
            df,
            target_column,
            imputation_strategy=imputation_strategy,
            apply_pca=apply_pca,
            n_components=None,
            apply_scaling=apply_scaling,
            columns_to_scale=columns_to_scale,
            group_col=group_col
        )

        # Check and handle missing values in the target variable
        if y_train.isna().sum() > 0 or y_test.isna().sum() > 0:
            print(f"Warning: NaN values found in the target variable {target_column}.")
            print("Dropping rows with NaN values in the target variable.")
            valid_indices_train = ~y_train.isna()
            valid_indices_test = ~y_test.isna()
            X_train, y_train = X_train[valid_indices_train], y_train[valid_indices_train]
            X_test, y_test = X_test[valid_indices_test], y_test[valid_indices_test]

        # Perform regression analysis if applicable
        if task_type in ['regression', 'both']:
            print("Performing regression analysis...")
            regression_results, regression_test_scores = train_and_evaluate_models(
                X_train,
                X_test,
                y_train,
                y_test,
                task_type='regression',
                scoring='neg_mean_squared_error',
                cv=cv
            )
            results.update({f"{dataset_name}_regression": regression_results})

            # Identify the best model based on MSE (for regression)
            for model_name, test_scores in regression_test_scores.items():
                if test_scores['MSE'] < best_regression_score:
                    best_regression_score = test_scores['MSE']
                    best_regression_model_info = {
                        'model_name': model_name,
                        'params': regression_results[model_name],
                        'dataset': dataset_name,
                        'test_scores': test_scores
                    }

        # Perform classification analysis if applicable
        if task_type in ['classification', 'both']:
            if task_type == 'both':
                # Convert to multiclass if the task type is 'both'
                y_train_class = convert_to_multiclass(y_train)
                y_test_class = convert_to_multiclass(y_test)
            else:
                y_train_class, y_test_class = y_train, y_test

            print("Performing classification analysis...")
            classification_results, classification_test_scores = train_and_evaluate_models(
                X_train,
                X_test,
                y_train_class,
                y_test_class,
                task_type='classification',
                scoring='f1_weighted'
            )
            results.update({f"{dataset_name}_classification": classification_results})

            # Identify the best model based on F1-Score (for classification)
            for model_name, test_scores in classification_test_scores.items():
                if test_scores['F1-Score'] > best_classification_score:
                    best_classification_score = test_scores['F1-Score']
                    best_classification_model_info = {
                        'model_name': model_name,
                        'params': classification_results[model_name],
                        'dataset': dataset_name,
                        'test_scores': test_scores
                    }

    # Save the best regression model
    if best_regression_model_info:
        os.makedirs("best_models", exist_ok=True)
        regression_model_save_path = os.path.join("best_models", f"{best_regression_model_info['dataset']}_regression_{best_regression_model_info['model_name']}.pkl")
        with open(regression_model_save_path, 'wb') as f:
            pickle.dump(best_regression_model_info['params'], f)
        print(f"Best regression model saved to {regression_model_save_path}")

    # Save the best classification model
    if best_classification_model_info:
        os.makedirs("best_models", exist_ok=True)
        classification_model_save_path = os.path.join("best_models", f"{best_classification_model_info['dataset']}_classification_{best_classification_model_info['model_name']}.pkl")
        with open(classification_model_save_path, 'wb') as f:
            pickle.dump(best_classification_model_info['params'], f)
        print(f"Best classification model saved to {classification_model_save_path}")

    end_time = time.time()
    print(f"Processing complete for datasets. Total time: {end_time - start_time:.2f} seconds")
    print(f"Best regression model: {best_regression_model_info}")
    print(f"Best classification model: {best_classification_model_info}")

    return results, (best_regression_model_info, best_classification_model_info)


###### analyze_final_grade_with_advanced_ensembles

In [None]:
def analyze_final_grade_with_advanced_ensembles(
    datasets_dict, target_column='FinalGrade', exclusion_type='more_restricted',
    feature_set_name=None, imputation_strategy=None, apply_pca=False, apply_scaling=True,
    columns_to_scale=None, group_col=None, cv=5
):
    print(f"Processing datasets for target column: {target_column} with exclusion type: {exclusion_type}")

    # Step 1: Filter datasets based on exclusion type, feature set, and target column
    if feature_set_name:
        selected_datasets = {
            name: df for name, df in datasets_dict.items()
            if (f"_{exclusion_type}" in name and
                f"_more_{exclusion_type}" not in name and
                f"_less_{exclusion_type}" not in name and
                f"_{feature_set_name}_" in name and
                f"{target_column.lower()}" in name.lower())
        }
    else:
        base_key = f'filtered_combined_df_{target_column}' if target_column == 'FinalGrade' else f'combined_df_{target_column}'
        selected_datasets = {
            base_key: datasets_dict.get(base_key)
        }

    if not selected_datasets:
        print(f"No datasets found for the given criteria: {exclusion_type}, {feature_set_name}, {target_column}")
        return {}, None

    results = {}
    best_model_info = None
    best_score = float('inf')

    start_time = time.time()

    for dataset_name, df in selected_datasets.items():
        if df is None:
            continue

        # Preprocess the data (imputation, scaling, PCA)
        X_train, X_test, y_train, y_test = preprocess_data_with_custom_imputation(
            df, target_column, imputation_strategy=imputation_strategy,
            apply_pca=apply_pca, n_components=None, apply_scaling=apply_scaling,
            columns_to_scale=columns_to_scale, group_col=group_col
        )

        # Check and handle missing values in the target variable
        if y_train.isna().sum() > 0 or y_test.isna().sum() > 0:
            print(f"Warning: NaN values found in the target variable {target_column}.")
            print("Dropping rows with NaN values in the target variable.")
            valid_indices_train = ~y_train.isna()
            valid_indices_test = ~y_test.isna()
            X_train, y_train = X_train[valid_indices_train], y_train[valid_indices_train]
            X_test, y_test = X_test[valid_indices_test], y_test[valid_indices_test]

        # Check for NaNs and infinities in features after preprocessing
        if np.any(np.isnan(X_train)) or np.any(np.isinf(X_train)):
            raise ValueError("Training features contain NaN or infinity values.")
        if np.any(np.isnan(X_test)) or np.any(np.isinf(X_test)):
            raise ValueError("Test features contain NaN or infinity values.")

        # Define advanced ensemble models for regression analysis
        ensemble_models = {
            'XGBoost': xgb.XGBRegressor(random_state=2024, verbosity=0),  # Suppress XGBoost logs
            'LightGBM': lgb.LGBMRegressor(random_state=2024, verbosity=-1),  # Suppress LightGBM logs
            'CatBoost': cb.CatBoostRegressor(random_state=2024, silent=True),  # Ensure CatBoost is silent
            'Stacking': StackingRegressor(
                estimators=[
                    ('xgb', xgb.XGBRegressor(random_state=2024, verbosity=0)),
                    ('lgbm', lgb.LGBMRegressor(random_state=2024, verbosity=-1)),
                    ('mlp', MLPRegressor(random_state=2024, max_iter=1000))
                ],
                final_estimator=LinearRegression()
            )
        }

        param_grids = {
            'XGBoost': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 6, 10]},
            'LightGBM': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'num_leaves': [31, 50]},
            'CatBoost': {'iterations': [200, 500], 'learning_rate': [0.01, 0.1], 'depth': [4, 6, 10]},
            'Stacking': {}
        }

        for model_name, model in ensemble_models.items():
            try:
                # Perform grid search if the model has a hyperparameter grid
                if model_name in param_grids and param_grids[model_name]:
                    grid_search = GridSearchCV(model, param_grids[model_name], cv=cv, scoring='neg_mean_squared_error')
                    grid_search.fit(X_train, y_train)
                    model = grid_search.best_estimator_

                # Evaluate the model
                scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
                results[f"{dataset_name}_{model_name}"] = scores

                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                test_scores = {
                    'MSE': mean_squared_error(y_test, y_pred),
                    'R2': r2_score(y_test, y_pred),
                    'MAE': mean_absolute_error(y_test, y_pred)
                }

                if test_scores['MSE'] < best_score:
                    best_score = test_scores['MSE']
                    best_model_info = {
                        'model_name': f"{dataset_name}_{model_name}",
                        'params': model,
                        'dataset': dataset_name,
                        'test_scores': test_scores
                    }

                # Save the best model
                if best_model_info:
                    os.makedirs("best_models", exist_ok=True)
                    model_save_path = os.path.join("best_models", f"{best_model_info['dataset']}_{best_model_info['model_name']}.pkl")
                    with open(model_save_path, 'wb') as f:
                        pickle.dump(best_model_info['params'], f)

            except Exception as e:
                print(f"Error with model {model_name} on dataset {dataset_name}: {e}")

    end_time = time.time()
    print(f"Processing complete for datasets. Total time: {end_time - start_time:.2f} seconds")
    print(f"Best model: {best_model_info}")

    return results, best_model_info


###### analyze_final_grade_with_imputation_strategies

In [None]:
def analyze_final_grade_with_imputation_strategies(
    datasets_dict, target_column='FinalGrade', exclusion_type='more_restricted',
    feature_set_name=None, imputation_strategies=['mean', 'median', 'zero', 'knn', 'most_frequent', 'constant'],
    apply_pca=False, apply_scaling=True, columns_to_scale=None, group_col=None, cv=5
):
    """
    Analyze final grade prediction using different imputation strategies with advanced ensemble techniques.

    Parameters:
    - datasets_dict: Dictionary containing the datasets.
    - target_column: The target column to predict (default is 'FinalGrade').
    - exclusion_type: The exclusion type to filter datasets (e.g., 'more_restricted').
    - feature_set_name: The feature selection method to use (e.g., 'Lasso'). If None, use the original dataset without feature selection.
    - imputation_strategies: List of strategies for imputing missing data.
    - apply_pca: Boolean to apply PCA for dimensionality reduction.
    - apply_scaling: Boolean to apply scaling to the features.
    - columns_to_scale: Specific columns to scale (optional).
    - group_col: Column to group by (optional).
    - cv: Number of cross-validation folds (default is 5).

    Returns:
    - best_model_info_overall: Information about the best model found across all imputation strategies.
    """

    print(f"Processing datasets for target column: {target_column} with exclusion type: {exclusion_type}")

    # Step 1: Filter datasets based on exclusion type, feature set, and target column
    if feature_set_name:
        selected_datasets = {
            name: df for name, df in datasets_dict.items()
            if (f"_{exclusion_type}" in name and
                f"_{feature_set_name}_" in name and
                f"{target_column.lower()}" in name.lower())
        }
    else:
        # Use the original dataset without feature selection
        base_key = f'combined_df_{target_column}'
        selected_datasets = {base_key: datasets_dict.get(base_key)}

    if not selected_datasets:
        print(f"No datasets found for the given criteria: {exclusion_type}, {feature_set_name}, {target_column}")
        return None

    # Initialize variables to track the best model across all imputation strategies
    best_model_info_overall = None
    best_score_overall = float('inf')

    # Step 2: Iterate over each imputation strategy
    for imputation_strategy in imputation_strategies:
        print(f"Testing imputation strategy: {imputation_strategy}")

        # Call the updated method for the current imputation strategy
        _, best_model_info = analyze_final_grade_with_advanced_ensembles(
            datasets_dict=datasets_dict,
            target_column=target_column,
            exclusion_type=exclusion_type,
            feature_set_name=feature_set_name,
            imputation_strategy=imputation_strategy,
            apply_pca=apply_pca,
            apply_scaling=apply_scaling,
            columns_to_scale=columns_to_scale,
            group_col=group_col,
            cv=cv
        )

        # Step 3: Update the best model across all strategies if the current one is better
        if best_model_info and best_model_info['test_scores']['MSE'] < best_score_overall:
            best_score_overall = best_model_info['test_scores']['MSE']
            best_model_info_overall = best_model_info
            best_model_info_overall['imputation_strategy'] = imputation_strategy

    # Step 4: Save the best model found across all imputation strategies
    if best_model_info_overall:
        os.makedirs("best_models", exist_ok=True)
        model_save_path = os.path.join(
            "best_models", f"{best_model_info_overall['dataset']}_{best_model_info_overall['model_name']}_overall.pkl"
        )
        with open(model_save_path, 'wb') as f:
            pickle.dump(best_model_info_overall['params'], f)
        print(f"Best overall model saved to {model_save_path}")

    print(f"Best overall model: {best_model_info_overall}")

    return best_model_info_overall


## 4.2 Applying Predictive Modeling Methods

### Method Differences

#### 1. **analyze_datasets_for_target_column**
- **Purpose**: A general method designed for predicting any target column, applicable for both classification and regression tasks depending on the target column type.
- **Unique Aspects**: Versatile and adaptable to different types of target columns (e.g., `Total_AP`, `Total_PP`, `FinalGrade`). It supports different feature selection methods, imputation strategies, and can handle both regression and classification tasks.

#### 2. **analyze_final_grade_with_advanced_ensembles**
- **Purpose**: Focused on predicting the `FinalGrade` target column using advanced ensemble techniques, specifically optimized for regression tasks.
- **Unique Aspects**: Utilizes advanced models like XGBoost, LightGBM, and Stacking to improve prediction accuracy. The method is specifically tailored for enhancing the predictive performance for the `FinalGrade` column.

#### 3. **analyze_final_grade_with_imputation_strategies**
- **Purpose**: Extends the analysis of `FinalGrade` by testing various imputation strategies to determine the most effective approach for missing data, in conjunction with advanced ensemble models.
- **Unique Aspects**: Iteratively applies different imputation strategies (e.g., `mean`, `median`, `zero`, `knn`, `most_frequent`, `constant`) and evaluates ensemble models, selecting the best combination overall for predicting `FinalGrade`.

### When to Use Each Method

- **General Prediction**: Use `analyze_datasets_for_target_column` when you need a flexible tool to predict any column, with support for both regression and classification. This method is versatile and can be adapted to various target columns and tasks.
- **Optimized Predictions for FinalGrade**: Utilize `analyze_final_grade_with_advanced_ensembles` if you want to leverage advanced ensemble techniques specifically for predicting `FinalGrade` and achieve better predictive performance in regression tasks.
- **Best Imputation Strategy for FinalGrade**: Apply `analyze_final_grade_with_imputation_strategies` when you need to determine the most effective imputation strategy in addition to optimizing model performance for predicting `FinalGrade`.

### Common Parameters

These methods share a set of common parameters that allow for flexible and customizable analysis of datasets. Below are the shared parameters that you will find in each method:

#### **Parameters:**
- `datasets_dict`: A dictionary containing the datasets to be analyzed.
- `target_column`: The column you want to predict (e.g., 'Total_AP', 'Passed', 'FinalGrade').
- `exclusion_type`: Specifies the dataset filtering criteria (e.g., 'more_restricted', 'restricted', 'less_restricted').
- `feature_set_name`: The name of the feature selection method to be used (e.g., 'MI' for Mutual Information). If `None`, no feature selection is applied.
- `imputation_strategy`: Specifies how to handle missing data (e.g., 'mean', 'median', 'zero', 'knn', 'most_frequent', 'constant').
- `apply_pca`: A boolean indicating whether to apply PCA for dimensionality reduction.
- `apply_scaling`: A boolean indicating whether to scale the features.
- `columns_to_scale`: Specific columns to scale. If `None`, all columns except the target column are scaled.
- `group_col`: The column by which to group data before applying scaling (optional).
- `cv`: Number of cross-validation folds (default is 5).

### Example of Usage
```python
# Example for analyzing a target column with a specific feature set and exclusion type
results, best_model = analyze_datasets_for_target_column(
    datasets_dict=my_datasets_dict,   # Dictionary of datasets to analyze
    target_column='Total_AP',         # The target column to predict
    feature_set_name='MI',            # Feature selection method to apply
    exclusion_type='more_restricted', # The exclusion type for filtering datasets
    imputation_strategy='mean',       # Strategy for imputing missing values
    apply_pca=True,                   # Whether to apply PCA
    apply_scaling=True,               # Whether to apply Min-Max scaling
    group_col='Year',                 # Column to group by before scaling (if applicable)
    cv=5                              # Number of cross-validation folds
)
```

In [None]:
print(without_feature_selection_datasets.keys())

dict_keys(['combined_df_Total_SPA', 'combined_df_Total_PP', 'filtered_combined_df_FinalGrade', 'combined_df_Passed', 'combined_df_Total_AP'])


### 4.2.1 Predicting Total_AP feature

#### Several Imputations and Test - CPU

In [None]:
# Example for analyzing 'Total_AP' with 'MI' feature set and 'more_restricted' exclusion type
mi_more_restricted_total_ap_results_median, mi_more_restricted_total_ap_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=mi_total_ap_datasets,  # This dictionary was created using your structure
    target_column='Total_AP',
    group_col='Year',
    imputation_strategy='median',
    feature_set_name='MI',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_MI_more_restricted
Column 'Year' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-182.0651522  -202.92519892 -194.73290528 -174.8535481  -179.66125447]
Test set scores for Linear Regression: {'MSE': 157.79729581078337, 'R2': 0.793325348103151, 'MAE': 8.013893906197902}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-271.59360865 -287.7456871  -329.10705485 -260.03785542 -308.27860947]
Test set scores for Decision Tree Regressor: {'MSE': 230.98798894926023, 'R2': 0.6974640030226729, 'MAE': 7.77646991051817}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-149.22889691 -162.21479643 -165.

In [None]:
rf_more_restricted_total_ap_results_median, rf_more_restricted_total_ap_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=mi_total_ap_datasets,  # This dictionary was created using your structure
    target_column='Total_AP',
    group_col='Year',
    imputation_strategy='median',
    feature_set_name='RF',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
No datasets found for the given criteria: more_restricted, RF, Total_AP


In [None]:
lasso_more_restricted_total_ap_results_median, lasso_more_restricted_total_ap_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=mi_total_ap_datasets,  # This dictionary was created using your structure
    target_column='Total_AP',
    group_col='Year',
    imputation_strategy='median',
    feature_set_name='Lasso',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
No datasets found for the given criteria: more_restricted, Lasso, Total_AP


In [None]:
fr_more_restricted_total_ap_results_median, fr_more_restricted_total_ap_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=mi_total_ap_datasets,  # This dictionary was created using your structure
    target_column='Total_AP',
    group_col='Year',
    imputation_strategy='median',
    feature_set_name='FR',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
No datasets found for the given criteria: more_restricted, FR, Total_AP


A testar antes...

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results, original_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-200.79392161 -219.86076691 -220.56014285 -192.3899982  -212.80426943]
Test set scores for Linear Regression: {'MSE': 190.43678728142854, 'R2': 0.7505758478463489, 'MAE': 8.76275384891383}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-354.36191135 -302.01173979 -343.09948464 -327.15033932 -322.63158691]
Test set scores for Decision Tree Regressor: {'MSE': 286.7096894179701, 'R2': 0.6244826316480938, 'MAE': 8.480553324595014}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-212.86366694 -192.833936   -221.04861179 -182.50848612 -215.69472974]
Test set scores for Random F

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_mean, original_total_ap_best_model_mean = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    imputation_strategy='mean',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-167.19714914 -184.97225746 -179.2603239  -152.36950061 -175.22779997]
Test set scores for Linear Regression: {'MSE': 2.3857261581400292e+23, 'R2': -3.124699448880559e+20, 'MAE': 12295800185.869598}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-318.65696456 -302.13278375 -366.06097411 -287.02215034 -339.94054338]
Test set scores for Decision Tree Regressor: {'MSE': 324.70895040822154, 'R2': 0.5747131853648384, 'MAE': 9.057737089343552}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-176.8344524  -163.70263964 -177.8450

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_median, original_total_ap_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    imputation_strategy='median',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-167.06263551 -181.84860998 -177.01012095 -150.67951069 -174.07183842]
Test set scores for Linear Regression: {'MSE': 3.342304840190633e+23, 'R2': -4.377576217832418e+20, 'MAE': 14553569685.334995}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-321.21245281 -243.6119102  -320.89251664 -308.56417138 -361.38701006]
Test set scores for Decision Tree Regressor: {'MSE': 305.85814574906976, 'R2': 0.5994029842038339, 'MAE': 8.82145648500211}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-179.75377033 -167.20026191 -185.452686

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_knn, original_total_ap_best_model_knn = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    imputation_strategy='knn',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-178.66424449 -193.95289288 -188.79909246 -158.09985219 -184.54164346]
Test set scores for Linear Regression: {'MSE': 2.5432288229346357e+23, 'R2': -3.33098820846922e+20, 'MAE': 12695190415.500898}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-306.02539549 -337.55465332 -404.58273811 -329.52585121 -321.79215586]
Test set scores for Decision Tree Regressor: {'MSE': 246.08429196520183, 'R2': 0.6776916542335635, 'MAE': 8.287206027320096}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-170.16709114 -181.07153827 -178.24348

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_zero, original_total_ap_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-163.07572659 -174.0228268  -170.5257389  -143.02765517 -167.08778843]
Test set scores for Linear Regression: {'MSE': 3.8050004681922454e+23, 'R2': -4.9835907718846965e+20, 'MAE': 15528298039.934677}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-310.76045009 -327.45640215 -348.52434244 -249.91526533 -308.51394212]
Test set scores for Decision Tree Regressor: {'MSE': 332.28800300993987, 'R2': 0.5647865383325197, 'MAE': 9.071120767698714}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-181.95008019 -167.48348668 -177.212

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results, original_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-200.79392161 -219.86076691 -220.56014285 -192.3899982  -212.80426943]
Test set scores for Linear Regression: {'MSE': 190.43678728142854, 'R2': 0.7505758478463489, 'MAE': 8.76275384891383}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-354.36191135 -302.01173979 -343.09948464 -327.15033932 -322.63158691]
Test set scores for Decision Tree Regressor: {'MSE': 286.7096894179701, 'R2': 0.6244826316480938, 'MAE': 8.480553324595014}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-212.86366694 -192.833936   -221.04861179 -182.50848612 -215.69472974]
Test set scores for Random F

In [None]:
rf_more_restricted_total_ap_results, rf_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='RF',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_RF_more_restricted_regression
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-247.53847786 -280.12702593 -270.3252893  -248.64364402 -261.6316098 ]
Test set scores for Linear Regression: {'MSE': 239.23405303163355, 'R2': 0.6866637392096149, 'MAE': 10.367065808842053}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-368.09742719 -380.52221886 -406.11780575 -368.00467648 -359.42184907]
Test set scores for Decision Tree Regressor: {'MSE': 355.8814252776453, 'R2': 0.5338851067891017, 'MAE': 10.446852005820466}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-248.48920872 -259.35021075 -273.1279423  -226.7991639  -263.64932149]

##### Using selected features

In [None]:
dt_more_restricted_total_ap_results, dt_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='DT',
    imputation_strategy='median',
    exclusion_type='more_restricted'
)


In [None]:
lasso_more_restricted_total_ap_results, lasso_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=lasso_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='Lasso',
    imputation_strategy='median',
    exclusion_type='more_restricted'
)


In [None]:
fr_more_restricted_total_ap_results, fr_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=fr_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='FR',
    imputation_strategy='median',
    exclusion_type='more_restricted'
)


#### Several Imputations and Test - T4 GPU

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results, original_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-200.79392161 -219.86076691 -220.56014285 -192.3899982  -212.80426943]
Test set scores for Linear Regression: {'MSE': 190.43678728142854, 'R2': 0.7505758478463489, 'MAE': 8.76275384891383}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-354.36191135 -302.01173979 -343.09948464 -327.15033932 -322.63158691]
Test set scores for Decision Tree Regressor: {'MSE': 286.7096894179701, 'R2': 0.6244826316480938, 'MAE': 8.480553324595014}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-212.86366694 -192.833936   -221.04861179 -182.50848612 -215.69472974]
Test set scores for Random F

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_mean, original_total_ap_best_model_mean = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    imputation_strategy='mean',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-167.19714914 -184.97225746 -179.2603239  -152.36950061 -175.22779997]
Test set scores for Linear Regression: {'MSE': 2.3857261581400292e+23, 'R2': -3.124699448880559e+20, 'MAE': 12295800185.869598}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-318.65696456 -302.13278375 -366.06097411 -287.02215034 -339.94054338]
Test set scores for Decision Tree Regressor: {'MSE': 324.70895040822154, 'R2': 0.5747131853648384, 'MAE': 9.057737089343552}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-176.8344524  -163.70263964 -177.8450

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_median, original_total_ap_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    imputation_strategy='median',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-167.06263551 -181.84860998 -177.01012095 -150.67951069 -174.07183842]
Test set scores for Linear Regression: {'MSE': 3.342304840190633e+23, 'R2': -4.377576217832418e+20, 'MAE': 14553569685.334995}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-321.21245281 -243.6119102  -320.89251664 -308.56417138 -361.38701006]
Test set scores for Decision Tree Regressor: {'MSE': 305.85814574906976, 'R2': 0.5994029842038339, 'MAE': 8.82145648500211}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-179.75377033 -167.20026191 -185.452686

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_knn, original_total_ap_best_model_knn = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    imputation_strategy='knn',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-178.66424449 -193.95289288 -188.79909246 -158.09985219 -184.54164346]
Test set scores for Linear Regression: {'MSE': 2.5432288229346357e+23, 'R2': -3.33098820846922e+20, 'MAE': 12695190415.500898}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-306.02539549 -337.55465332 -404.58273811 -329.52585121 -321.79215586]
Test set scores for Decision Tree Regressor: {'MSE': 246.08429196520183, 'R2': 0.6776916542335635, 'MAE': 8.287206027320096}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-170.16709114 -181.07153827 -178.24348

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_zero, original_total_ap_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-163.07572659 -174.0228268  -170.5257389  -143.02765517 -167.08778843]
Test set scores for Linear Regression: {'MSE': 3.8050004681922454e+23, 'R2': -4.9835907718846965e+20, 'MAE': 15528298039.934677}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-310.76045009 -327.45640215 -348.52434244 -249.91526533 -308.51394212]
Test set scores for Decision Tree Regressor: {'MSE': 332.28800300993987, 'R2': 0.5647865383325197, 'MAE': 9.071120767698714}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-181.95008019 -167.48348668 -177.212

##### **Using selected features**

In [None]:
# Example for analyzing 'Total_AP' with 'MI' feature set and 'more_restricted' exclusion type
mi_more_restricted_total_ap_results, mi_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_ap_datasets,  # This dictionary was created using your structure
    target_column='Total_AP',
    feature_set_name='MI',
    imputation_strategy='median',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_MI_more_restricted
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-182.0651522  -202.92519892 -194.73290528 -174.8535481  -179.66125447]
Test set scores for Linear Regression: {'MSE': 157.79729581078337, 'R2': 0.793325348103151, 'MAE': 8.013893906197902}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-271.59360865 -287.7456871  -329.10705485 -260.03785542 -308.27860947]
Test set scores for Decision Tree Regressor: {'MSE': 230.98798894926023, 'R2': 0.6974640030226729, 'MAE': 7.77646991051817}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-149.22889691 -162.21479643 -165.

In [None]:
rf_more_restricted_total_ap_results, rf_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='RF',
    imputation_strategy='median',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_RF_more_restricted_regression
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-186.50792099 -210.86066602 -202.88906156 -179.0533142  -192.17091088]
Test set scores for Linear Regression: {'MSE': 173.43917916934865, 'R2': 0.7728384266921589, 'MAE': 8.186496093239938}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-311.91505872 -341.09577201 -360.63305366 -299.80294424 -300.80977611]
Test set scores for Decision Tree Regressor: {'MSE': 274.1890188187719, 'R2': 0.6408815516949071, 'MAE': 8.627498617422573}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-176.6087383  -178.68

#### More Restricted - L4 GPU

**Removing missing data (no imputation)**

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results, original_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)


Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_AP
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-200.79392161 -219.86076691 -220.56014285 -192.3899982  -212.80426943]
Test set scores for Linear Regression: {'MSE': 190.43678728142862, 'R2': 0.7505758478463488, 'MAE': 8.762753848913828}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-354.36191135 -302.01173979 -343.09948464 -327.15033932 -322.63158691]
Test set scores for Decision Tree Regressor: {'MSE': 286.7096894179701, 'R2': 0.6244826316480938, 'MAE': 8.480553324595014}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-212.86366694 -192.833936   -221.04861179 -182.50848612 -215.69472974]
Test set scores for Random 

**Removing missing data (no imputation) and PCA**

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_pca, original_total_ap_best_model_pca = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    apply_pca=True,
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)


**Zero Imputation**

In [None]:
# Analyze 'Total_AP' using the original dataset without feature selection
original_total_ap_results_zero, original_total_ap_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_AP',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)


**Using selected features**

In [None]:
# Example for analyzing 'Total_AP' with 'MI' feature set and 'more_restricted' exclusion type
mi_more_restricted_total_ap_results, mi_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_ap_datasets,  # This dictionary was created using your structure
    target_column='Total_AP',
    feature_set_name='MI',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_MI_more_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-245.25475019 -275.39960014 -270.79829502 -246.13460971 -257.7444634 ]
Test set scores for Linear Regression: {'MSE': 238.22160089177677, 'R2': 0.6879897961137713, 'MAE': 10.257488238257768}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-289.51980115 -358.71452461 -293.83119115 -306.83965106 -313.52312419]
Test set scores for Decision Tree Regressor: {'MSE': 275.56590799301387, 'R2': 0.6390781742078289, 'MAE': 8.843642301967263}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-196.22086974 -215.65527301 -208.42204321 -184.74007188 -203.92056146]
Test set s

In [None]:
rf_more_restricted_total_ap_results, rf_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='RF',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_RF_more_restricted_regression
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-247.53847786 -280.12702593 -270.3252893  -248.64364402 -261.6316098 ]
Test set scores for Linear Regression: {'MSE': 239.23405303163352, 'R2': 0.6866637392096149, 'MAE': 10.367065808842053}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-368.09742719 -380.52221886 -406.11780575 -368.00467648 -359.42184907]
Test set scores for Decision Tree Regressor: {'MSE': 355.8814252776453, 'R2': 0.5338851067891017, 'MAE': 10.446852005820466}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-248.48920872 -259.35021075 -273.1279423  -226.7991639  -263.64932149]

In [None]:
dt_more_restricted_total_ap_results, dt_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='DT',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_DT_more_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-245.25475019 -275.39960014 -270.79829502 -246.13460971 -257.7444634 ]
Test set scores for Linear Regression: {'MSE': 238.22160089177677, 'R2': 0.6879897961137713, 'MAE': 10.257488238257768}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-289.51980115 -358.71452461 -293.83119115 -306.83965106 -313.52312419]
Test set scores for Decision Tree Regressor: {'MSE': 275.56590799301387, 'R2': 0.6390781742078289, 'MAE': 8.843642301967263}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-196.22086974 -215.65527301 -208.42204321 -184.74007188 -203.92056146]
Test set s

In [None]:
lasso_more_restricted_total_ap_results, lasso_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=lasso_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='Lasso',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_Lasso_more_restricted_regression
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-247.53847786 -280.12702593 -270.3252893  -248.64364402 -261.6316098 ]
Test set scores for Linear Regression: {'MSE': 239.23405303163352, 'R2': 0.6866637392096149, 'MAE': 10.367065808842053}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-368.09742719 -380.52221886 -406.11780575 -368.00467648 -359.42184907]
Test set scores for Decision Tree Regressor: {'MSE': 355.8814252776453, 'R2': 0.5338851067891017, 'MAE': 10.446852005820466}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-248.48920872 -259.35021075 -273.1279423  -226.7991639  -263.649321

In [None]:
fr_more_restricted_total_ap_results, fr_more_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=fr_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='FR',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_AP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_FR_more_restricted_regression
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-249.10899991 -286.52302084 -272.92861045 -252.59483523 -266.59844778]
Test set scores for Linear Regression: {'MSE': 244.04831214807925, 'R2': 0.6803582742019905, 'MAE': 10.523532605521442}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-319.03427932 -339.22061805 -385.00069775 -343.25391993 -343.78171522]
Test set scores for Decision Tree Regressor: {'MSE': 320.5097883456202, 'R2': 0.5802130283950209, 'MAE': 9.922527737037315}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-241.87971715 -256.1791351  -266.80550439 -225.03788527 -261.78949986]


In [None]:
print("Column to Predict/Classify: Total_AP. Feature Set: more_restricted")
print("Best Model Info without feature selection removing columns with missing data:", original_total_ap_best_model)
print("Best Model Info without feature selection using PCA:", original_total_ap_best_model_pca)
print("Best Model Info without feature selection using zero imputation:", original_total_ap_best_model_zero)
print("Best Model Info using MI Feature Selection method:", mi_more_restricted_total_ap_best_model)
print("Best Model Info using RF Feature Selection method:", rf_more_restricted_total_ap_best_model)
print("Best Model Info using DT Feature Selection method:", dt_more_restricted_total_ap_best_model)
print("Best Model Info using Lasso Feature Selection method:", lasso_more_restricted_total_ap_best_model)
print("Best Model Info using FR Feature Selection method:", fr_more_restricted_total_ap_best_model)


Column to Predict/Classify: Total_AP. Feature Set: more_restricted
Best Model Info without feature selection: {'model_name': 'Random Forest', 'params': array([0.78940792, 0.80663052, 0.80912303, 0.79466036, 0.8034326 ]), 'dataset': 'combined_df_Total_AP', 'test_scores': {'Accuracy': 0.8231939163498099, 'F1-Score': 0.8142354277268302}}
Best Model Info without feature selection using PCA: {'model_name': 'Random Forest', 'params': array([0.78940792, 0.80663052, 0.80912303, 0.79466036, 0.8034326 ]), 'dataset': 'combined_df_Total_AP', 'test_scores': {'Accuracy': 0.8231939163498099, 'F1-Score': 0.8142354277268302}}
Best Model Info using MI Feature Selection method: {'model_name': 'Neural Network', 'params': array([0.79463296, 0.77881104, 0.78843844, 0.7956285 , 0.79414547]), 'dataset': 'predict_total_ap_MI_more_restricted', 'test_scores': {'Accuracy': 0.8326996197718631, 'F1-Score': 0.833185216798152}}
Best Model Info using RF Feature Selection method: {'model_name': 'Neural Network', 'param

#### Restricted - L4 GPU

In [None]:
mi_restricted_total_ap_results, mi_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='MI',
    exclusion_type='restricted'
)


Processing datasets for target column: Total_AP with exclusion type: restricted
No datasets found for the given criteria: restricted, MI, Total_AP


In [None]:
rf_restricted_total_ap_results, rf_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='RF',
    exclusion_type='restricted'
)


Processing datasets for target column: Total_AP with exclusion type: restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_RF_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-245.25475019 -275.39960014 -270.79829502 -246.13460971 -257.7444634 ]
Test set scores for Linear Regression: {'MSE': 238.22160089177663, 'R2': 0.6879897961137715, 'MAE': 10.257488238257762}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-283.77648167 -358.7143409  -293.86290119 -298.52507102 -317.26647689]
Test set scores for Decision Tree Regressor: {'MSE': 290.2103946850291, 'R2': 0.6198975908288237, 'MAE': 8.977355863437477}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-197.06990989 -216.06307053 -209.14376449 -183.90106605 -204.64034948]
Test set scores for R

In [None]:
# Example for analyzing 'Total_AP' with 'DT' feature set and 'restricted' exclusion type
dt_restricted_total_ap_results, dt_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='DT',
    exclusion_type='restricted'
)


Processing datasets for target column: Total_AP with exclusion type: restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_DT_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-245.25475019 -275.39960014 -270.79829502 -246.13460971 -257.7444634 ]
Test set scores for Linear Regression: {'MSE': 238.22160089177663, 'R2': 0.6879897961137715, 'MAE': 10.257488238257762}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-283.77648167 -358.7143409  -293.86290119 -298.52507102 -317.26647689]
Test set scores for Decision Tree Regressor: {'MSE': 290.2103946850291, 'R2': 0.6198975908288237, 'MAE': 8.977355863437477}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-197.06990989 -216.06307053 -209.14376449 -183.90106605 -204.64034948]
Test set scores for R

In [None]:
print("Column to Predict/Classify: Total_AP. Feature Set: restricted")
print("Best Model Info using MI Feature Selection method:", mi_restricted_total_ap_best_model)
print("Best Model Info using RF Feature Selection method:", rf_restricted_total_ap_best_model)
print("Best Model Info using DT Feature Selection method:", dt_restricted_total_ap_best_model)

Column to Predict/Classify: Total_AP. Feature Set: restricted
Best Model Info using MI Feature Selection method: None
Best Model Info using RF Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.78545542, 0.78098583, 0.81147609, 0.79213062, 0.79083029]), 'dataset': 'predict_total_ap_RF_restricted', 'test_scores': {'Accuracy': 0.8288973384030418, 'F1-Score': 0.8200227508227805}}
Best Model Info using DT Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.78545542, 0.78098583, 0.81147609, 0.79213062, 0.79083029]), 'dataset': 'predict_total_ap_DT_restricted', 'test_scores': {'Accuracy': 0.8288973384030418, 'F1-Score': 0.8200227508227805}}


#### Less Restricted - L4 GPU

In [None]:
mi_less_restricted_total_ap_results, mi_less_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='MI',
    exclusion_type='less_restricted'
)


Processing datasets for target column: Total_AP with exclusion type: less_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_MI_less_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-74.23838131 -71.39632698 -65.34522975 -63.81411929 -67.51983334]
Test set scores for Linear Regression: {'MSE': 58.4429924942444, 'R2': 0.9234544225393962, 'MAE': 3.479253771312736}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-82.12429261 -80.87805396 -84.08242259 -95.49206158 -94.46355866]
Test set scores for Decision Tree Regressor: {'MSE': 72.36375919641264, 'R2': 0.9052217297830021, 'MAE': 2.966294555058158}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-52.63821555 -43.66917861 -42.86965633 -41.57614534 -45.05951159]
Test set scores for Random Fo

In [None]:
rf_less_restricted_total_ap_results, rf_less_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='RF',
    exclusion_type='less_restricted'
)


Processing datasets for target column: Total_AP with exclusion type: less_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_RF_less_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-74.30402834 -71.65318537 -65.78548918 -63.46329178 -67.75895145]
Test set scores for Linear Regression: {'MSE': 58.33658488383781, 'R2': 0.923593789666868, 'MAE': 3.4834273917835987}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [ -80.27235278  -82.19792728  -90.29874588  -96.23532624 -101.36688671]
Test set scores for Decision Tree Regressor: {'MSE': 65.88277060325674, 'R2': 0.9137101899594305, 'MAE': 2.9149637565790707}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-53.50747303 -43.52616263 -42.70516592 -40.63833552 -45.03299733]
Test set scores for Ra

In [None]:
dt_less_restricted_total_ap_results, dt_less_restricted_total_ap_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_total_ap_datasets,
    target_column='Total_AP',
    feature_set_name='DT',
    exclusion_type='less_restricted'
)


Processing datasets for target column: Total_AP with exclusion type: less_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_ap_DT_less_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-75.46884559 -73.90988407 -69.63466354 -67.63512739 -70.57189598]
Test set scores for Linear Regression: {'MSE': 61.414494550431634, 'R2': 0.9195625044306748, 'MAE': 3.085852589096458}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-83.40932824 -68.64765021 -87.79557614 -90.74958937 -84.54279786]
Test set scores for Decision Tree Regressor: {'MSE': 64.94297331715062, 'R2': 0.9149410873329349, 'MAE': 2.9231995079400583}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-53.60462354 -43.48152608 -43.26933153 -40.18149287 -44.68856339]
Test set scores for Random

In [None]:
print("Column to Predict/Classify: Total_AP. Feature Set: less_restricted")
print("Best Model Info using MI Feature Selection method:", mi_less_restricted_total_ap_best_model)
print("Best Model Info using RF Feature Selection method:", rf_less_restricted_total_ap_best_model)
print("Best Model Info using DT Feature Selection method:", dt_less_restricted_total_ap_best_model)

Column to Predict/Classify: Total_AP. Feature Set: less_restricted
Best Model Info using MI Feature Selection method: {'model_name': 'Support Vector Machine', 'params': array([0.96614891, 0.96952051, 0.97203003, 0.96545263, 0.96774045]), 'dataset': 'predict_total_ap_MI_less_restricted', 'test_scores': {'Accuracy': 0.9778200253485425, 'F1-Score': 0.9776532724743268}}
Best Model Info using RF Feature Selection method: {'model_name': 'Support Vector Machine', 'params': array([0.96611408, 0.96952051, 0.97444974, 0.9653758 , 0.96854845]), 'dataset': 'predict_total_ap_RF_less_restricted', 'test_scores': {'Accuracy': 0.9790874524714829, 'F1-Score': 0.9789181162978986}}
Best Model Info using DT Feature Selection method: {'model_name': 'Support Vector Machine', 'params': array([0.96611408, 0.96952051, 0.97444974, 0.9653758 , 0.96774045]), 'dataset': 'predict_total_ap_DT_less_restricted', 'test_scores': {'Accuracy': 0.9790874524714829, 'F1-Score': 0.9789181162978986}}


### 4.3.2 Predicting Total_PP feature

#### CPU Tests

In [None]:
# Analyze 'Total_PP' using the original dataset without feature selection
original_total_pp_results, original_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_PP',
    group_col='Year',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_PP
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-13.92239594 -14.53165422 -13.00202569 -13.96216371 -14.29714277]
Test set scores for Linear Regression: {'MSE': 12.003530087648322, 'R2': 0.6113765452320685, 'MAE': 2.6125377517669355}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-17.78507223 -14.59411877 -17.33309909 -14.23705811 -16.71075504]
Test set scores for Decision Tree Regressor: {'MSE': 15.907688412916242, 'R2': 0.48497643749310815, 'MAE': 2.5611386348080667}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-10.51686467 -10.69583911 -10.36967894  -9.63550355 -11.49259873]
Test set scores for Random Forest Regr

In [None]:
# Analyze 'Total_PP' using the original dataset without feature selection
original_total_pp_results_zero, original_total_pp_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_PP',
    group_col='Year',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_PP
No specific columns to scale provided. Scaling all columns except the target.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-8.73184816 -9.90397146 -8.47378366 -8.8973584  -9.43287418]
Test set scores for Linear Regression: {'MSE': 2.2104109587554017e+24, 'R2': -7.156374308024205e+22, 'MAE': 37426827291.835464}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-17.63760165 -18.96464495 -17.01712801 -16.00094746 -18.02787075]
Test set scores for Decision Tree Regressor: {'MSE': 15.976275534253743, 'R2': 0.4827558770536259, 'MAE': 2.68522014002052}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [ -9.87985039 -10.5549     

In [None]:
mi_more_restricted_total_pp_results, mi_more_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='MI',
    group_col='Year',
    imputation_strategy='zero',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_MI_more_restricted
Column 'Year' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-8.32258178 -9.39849158 -7.75409918 -8.02210934 -8.55002518]
Test set scores for Linear Regression: {'MSE': 6.878656851411008, 'R2': 0.7772982305839189, 'MAE': 1.8742662950197957}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-15.78453628 -17.79985029 -14.3555728  -14.23971829 -17.50493258]
Test set scores for Decision Tree Regressor: {'MSE': 14.26573804765775, 'R2': 0.5381358346741782, 'MAE': 2.5529858632077294}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [ -8.83721118 -10.27119779  -8.12361072  -8.506

In [None]:
dt_more_restricted_total_pp_results_zero, dt_more_restricted_total_pp_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=dt_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='DT',
    group_col='Year',
    imputation_strategy='zero',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_DT_more_restricted
Column 'Year' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-8.22764161 -9.54050079 -8.03762141 -8.12853289 -8.62530522]
Test set scores for Linear Regression: {'MSE': 7.176215143676656, 'R2': 0.7676645535996696, 'MAE': 1.9029376413391168}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-16.0754799  -16.70788001 -15.39934085 -14.98348894 -15.49162889]
Test set scores for Decision Tree Regressor: {'MSE': 14.691581925803924, 'R2': 0.5243488138637558, 'MAE': 2.625281673333004}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [ -8.7226507  -10.13185355  -7.93172893  -8.330

#### Several Imputations and Test - T4 GPU

In [None]:
# Analyze 'Total_PP' using the original dataset without feature selection
original_total_pp_results, original_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_PP',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_PP
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-13.92239594 -14.53165422 -13.00202569 -13.96216371 -14.29714277]
Test set scores for Linear Regression: {'MSE': 12.003530087648322, 'R2': 0.6113765452320685, 'MAE': 2.6125377517669355}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-17.78507223 -14.59411877 -17.33309909 -14.23705811 -16.71075504]
Test set scores for Decision Tree Regressor: {'MSE': 15.907688412916242, 'R2': 0.48497643749310815, 'MAE': 2.5611386348080667}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-10.51686467 -10.69583911 -10.36967894  -9.63550355 -11.49259873]
Test set scores for Random Forest Regr

In [None]:
# Analyze 'Total_PP' using the original dataset without feature selection
original_total_pp_results_mean, original_total_pp_best_model_mean = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_PP',
    imputation_strategy='mean',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_PP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-10.35852318 -11.60801841 -10.14791043 -10.52713391 -10.85557483]
Test set scores for Linear Regression: {'MSE': 2.4499675937647793e+24, 'R2': -7.931957210971414e+22, 'MAE': 39402762460.308716}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-18.5864338  -16.95110246 -17.41965441 -14.94560444 -17.40296474]
Test set scores for Decision Tree Regressor: {'MSE': 15.988117337937615, 'R2': 0.48237248961470525, 'MAE': 2.6813197839338523}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [ -9.85744419 -10.46151     -8.58208462  -8.93

In [None]:
# Analyze 'Total_PP' using the original dataset without feature selection
original_total_pp_results_median, original_total_pp_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_PP',
    imputation_strategy='median',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_PP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [ -9.392372   -10.54915249  -9.11496976  -9.67853586 -10.16841876]
Test set scores for Linear Regression: {'MSE': 2.2900913757241803e+24, 'R2': -7.414345743873896e+22, 'MAE': 38095432297.82949}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-16.27408687 -18.12695752 -20.53762949 -14.86089587 -19.1367335 ]
Test set scores for Decision Tree Regressor: {'MSE': 15.909127672785537, 'R2': 0.48492984035554687, 'MAE': 2.696257317882793}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [ -9.8921477  -10.32563594  -9.05477964  -8.8325

In [None]:
# Analyze 'Total_PP' using the original dataset without feature selection
original_total_pp_results_knn, original_total_pp_best_model_knn = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_PP',
    imputation_strategy='knn',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_PP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-11.10637192 -12.4145497  -10.87063605 -11.38677867 -11.62684002]
Test set scores for Linear Regression: {'MSE': 2.7735833270758973e+24, 'R2': -8.979687865023232e+22, 'MAE': 41924424485.32888}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-16.78093784 -16.96496693 -17.67198176 -16.02016642 -18.79559354]
Test set scores for Decision Tree Regressor: {'MSE': 16.82990780507504, 'R2': 0.4551188802897047, 'MAE': 2.7307750920393503}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [ -9.30259575 -10.17555244  -8.83871221  -8.91639

In [None]:
# Analyze 'Total_PP' using the original dataset without feature selection
original_total_pp_results_zero, original_total_pp_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_PP',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_PP
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-8.73184816 -9.90397146 -8.47378366 -8.8973584  -9.43287418]
Test set scores for Linear Regression: {'MSE': 2.2104109587554017e+24, 'R2': -7.156374308024205e+22, 'MAE': 37426827291.835464}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-17.63760165 -18.96464495 -17.01712801 -16.00094746 -18.02787075]
Test set scores for Decision Tree Regressor: {'MSE': 15.976275534253743, 'R2': 0.4827558770536259, 'MAE': 2.68522014002052}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [ -9.87985039 -10.5549      -8.76449991  -9.17858814  

#### More Restricted - L4 GPU

In [None]:
# Analyze 'Total_PP' using the original dataset without feature selection
original_total_pp_results, original_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,  # Use the main dataset dictionary
    target_column='Total_PP',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_PP
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-13.92239594 -14.53165422 -13.00202569 -13.96216371 -14.29714277]
Test set scores for Linear Regression: {'MSE': 12.003530087648329, 'R2': 0.6113765452320683, 'MAE': 2.6125377517669364}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-17.78507223 -14.59411877 -17.33309909 -14.23705811 -16.71075504]
Test set scores for Decision Tree Regressor: {'MSE': 15.907688412916242, 'R2': 0.48497643749310815, 'MAE': 2.5611386348080667}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-10.51686467 -10.69583911 -10.36967894  -9.63550355 -11.49259873]
Test set scores for Random Forest Regr

In [None]:
mi_more_restricted_total_pp_results, mi_more_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='MI',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_MI_more_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-13.49719437 -14.57016573 -12.27867205 -13.34393698 -13.85022322]
Test set scores for Linear Regression: {'MSE': 11.80714458518987, 'R2': 0.6177346758715104, 'MAE': 2.5989238323213617}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-17.07747569 -20.56306126 -17.18126404 -17.36203008 -16.92458014]
Test set scores for Decision Tree Regressor: {'MSE': 15.978281904775443, 'R2': 0.4826909192755394, 'MAE': 2.793854194336947}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-12.34316779 -14.33715205 -11.94624349 -11.77185138 -12.36259099]
Test set scores for Random

In [None]:
rf_more_restricted_total_pp_results, rf_more_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='RF',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_RF_more_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-13.49719437 -14.57016573 -12.27867205 -13.34393698 -13.85022322]
Test set scores for Linear Regression: {'MSE': 11.80714458518987, 'R2': 0.6177346758715104, 'MAE': 2.5989238323213617}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-17.07747569 -20.56306126 -17.18126404 -17.36203008 -16.92458014]
Test set scores for Decision Tree Regressor: {'MSE': 15.978281904775443, 'R2': 0.4826909192755394, 'MAE': 2.793854194336947}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-12.34316779 -14.33715205 -11.94624349 -11.77185138 -12.36259099]
Test set scores for Random

In [None]:
dt_more_restricted_total_pp_results, dt_more_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='DT',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_DT_more_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-13.75131351 -14.7251896  -12.65377983 -13.58399406 -14.22168553]
Test set scores for Linear Regression: {'MSE': 12.410501455872378, 'R2': 0.5982005363450154, 'MAE': 2.6624000568964763}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-16.74062635 -19.55588195 -17.1203406  -16.98335873 -17.02948865]
Test set scores for Decision Tree Regressor: {'MSE': 15.908000948448603, 'R2': 0.4849663189165364, 'MAE': 2.7877590417489846}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-12.33154393 -14.33737304 -11.94844161 -11.66447108 -12.29074008]
Test set scores for Rand

In [None]:
lasso_more_restricted_total_pp_results, lasso_more_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=lasso_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='Lasso',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_Lasso_more_restricted_regression
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-14.07259509 -14.8096461  -13.05723877 -14.07382443 -14.44467067]
Test set scores for Linear Regression: {'MSE': 12.179176340373877, 'R2': 0.6056898636431678, 'MAE': 2.639137693667176}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-18.18457669 -19.60916364 -19.50221679 -17.53878177 -19.37935029]
Test set scores for Decision Tree Regressor: {'MSE': 17.920182808049784, 'R2': 0.419820394326883, 'MAE': 2.9164365673026373}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-13.06670535 -14.4909809  -13.3149714  -12.69096625 -13.93424886]
Test set sco

In [None]:
fr_more_restricted_total_pp_results, fr_more_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=fr_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='FR',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_PP with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_FR_more_restricted_regression
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-14.08428735 -14.80491935 -13.08837325 -14.06953453 -14.47990177]
Test set scores for Linear Regression: {'MSE': 12.222191979281321, 'R2': 0.6042971994786097, 'MAE': 2.6466078394776225}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-17.65371301 -16.90705732 -17.71388714 -16.0989159  -17.38888106]
Test set scores for Decision Tree Regressor: {'MSE': 16.04973849003883, 'R2': 0.4803774577497877, 'MAE': 2.734412268773845}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-12.81178706 -14.04308067 -12.6823436  -12.66841083 -13.96457843]
Test set scores

In [None]:
print("Column to Predict/Classify: Total_PP. Feature Set: more_restricted")
print("Best Model Info using MI Feature Selection method:", mi_more_restricted_total_pp_best_model)
print("Best Model Info using RF Feature Selection method:", rf_more_restricted_total_pp_best_model)
print("Best Model Info using DT Feature Selection method:", dt_more_restricted_total_pp_best_model)
print("Best Model Info using Lasso Feature Selection method:", lasso_more_restricted_total_pp_best_model)
print("Best Model Info using FR Feature Selection method:", fr_more_restricted_total_pp_best_model)


Column to Predict/Classify: Total_PP. Feature Set: more_restricted
Best Model Info using MI Feature Selection method: {'model_name': 'Neural Network', 'params': array([0.61880804, 0.62025956, 0.60358579, 0.65678492, 0.64384519]), 'dataset': 'predict_total_pp_MI_more_restricted', 'test_scores': {'Accuracy': 0.6375158428390367, 'F1-Score': 0.6362248650354846}}
Best Model Info using RF Feature Selection method: {'model_name': 'Neural Network', 'params': array([0.61880804, 0.62025956, 0.60358579, 0.65678492, 0.64384519]), 'dataset': 'predict_total_pp_RF_more_restricted', 'test_scores': {'Accuracy': 0.6375158428390367, 'F1-Score': 0.6362248650354846}}
Best Model Info using DT Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.62387325, 0.63931746, 0.64905391, 0.65082863, 0.66396568]), 'dataset': 'predict_total_pp_DT_more_restricted', 'test_scores': {'Accuracy': 0.6273764258555133, 'F1-Score': 0.6266884689965861}}
Best Model Info using Lasso Feature Selection method

#### Restricted - L4 GPU

In [None]:
mi_restricted_total_pp_results, mi_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='MI',
    exclusion_type='restricted'
)


Processing datasets for target column: Total_PP with exclusion type: restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_MI_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-13.53566097 -14.57720999 -12.356423   -13.35859839 -13.9410284 ]
Test set scores for Linear Regression: {'MSE': 11.891709662789353, 'R2': 0.6149968168942379, 'MAE': 2.608585749647105}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-17.28301356 -17.83313985 -15.92170243 -15.2221133  -15.89422277]
Test set scores for Decision Tree Regressor: {'MSE': 14.490328360894642, 'R2': 0.5308645517432223, 'MAE': 2.65768515481242}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-12.14900579 -13.71506491 -11.53682476 -11.42656    -12.22864578]
Test set scores for Random Forest Reg

In [None]:
rf_restricted_total_pp_results, rf_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='RF',
    exclusion_type='restricted'
)


Processing datasets for target column: Total_PP with exclusion type: restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_RF_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-13.49719437 -14.57016573 -12.27867205 -13.34393698 -13.85022322]
Test set scores for Linear Regression: {'MSE': 11.807144585189867, 'R2': 0.6177346758715105, 'MAE': 2.5989238323213595}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-16.67763404 -20.66361549 -17.13055089 -17.07886706 -16.92200486]
Test set scores for Decision Tree Regressor: {'MSE': 16.013769864217775, 'R2': 0.48154196948323336, 'MAE': 2.790051912968126}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-12.32444554 -14.33698947 -11.9362552  -11.75980029 -12.33037598]
Test set scores for Random Forest 

In [None]:
dt_restricted_total_pp_results, dt_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='DT',
    exclusion_type='restricted'
)


Processing datasets for target column: Total_PP with exclusion type: restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_DT_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-13.49719437 -14.57016573 -12.27867205 -13.34393698 -13.85022322]
Test set scores for Linear Regression: {'MSE': 11.807144585189867, 'R2': 0.6177346758715105, 'MAE': 2.5989238323213595}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-16.67763404 -20.66361549 -17.13055089 -17.07886706 -16.92200486]
Test set scores for Decision Tree Regressor: {'MSE': 16.013769864217775, 'R2': 0.48154196948323336, 'MAE': 2.790051912968126}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-12.32444554 -14.33698947 -11.9362552  -11.75980029 -12.33037598]
Test set scores for Random Forest 

In [None]:
print("Column to Predict/Classify: Total_PP. Feature Set: restricted")
print("Best Model Info using MI Feature Selection method:", mi_restricted_total_pp_best_model)
print("Best Model Info using RF Feature Selection method:", rf_restricted_total_pp_best_model)
print("Best Model Info using DT Feature Selection method:", dt_restricted_total_pp_best_model)

Column to Predict/Classify: Total_PP. Feature Set: restricted
Best Model Info using MI Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.63182214, 0.64694799, 0.65606413, 0.65661862, 0.66339998]), 'dataset': 'predict_total_pp_MI_restricted', 'test_scores': {'Accuracy': 0.6394169835234474, 'F1-Score': 0.6395088963549419}}
Best Model Info using RF Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.62444188, 0.64922035, 0.64362218, 0.65296301, 0.66215869]), 'dataset': 'predict_total_pp_RF_restricted', 'test_scores': {'Accuracy': 0.6311787072243346, 'F1-Score': 0.630602653896984}}
Best Model Info using DT Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.62444188, 0.64922035, 0.64362218, 0.65296301, 0.66215869]), 'dataset': 'predict_total_pp_DT_restricted', 'test_scores': {'Accuracy': 0.6311787072243346, 'F1-Score': 0.630602653896984}}


#### Less Restricted - L4 GPU

In [None]:
mi_less_restricted_total_pp_results, mi_less_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='MI',
    exclusion_type='less_restricted'
)


Processing datasets for target column: Total_PP with exclusion type: less_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_MI_less_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.3726167  -1.50460784 -1.00714629 -1.37721692 -1.05327227]
Test set scores for Linear Regression: {'MSE': 0.9996773450151357, 'R2': 0.9676346824112365, 'MAE': 0.5586129644386143}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-4.26444972 -4.09738717 -4.01347068 -4.47147385 -3.62282092]
Test set scores for Decision Tree Regressor: {'MSE': 5.229404309252218, 'R2': 0.8306940413194683, 'MAE': 1.162230671736375}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-2.12777981 -2.03238377 -1.55015578 -2.04893716 -1.54362005]
Test set scores for Random Forest Regress

In [None]:
rf_less_restricted_total_pp_results, rf_less_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='RF',
    exclusion_type='less_restricted'
)


Processing datasets for target column: Total_PP with exclusion type: less_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_RF_less_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.37276883 -1.5029492  -1.00776497 -1.36858087 -1.05519267]
Test set scores for Linear Regression: {'MSE': 1.0002341922667404, 'R2': 0.9676166540561513, 'MAE': 0.559615825255855}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.63895487 -4.34996041 -3.93977813 -3.63391442 -3.54675119]
Test set scores for Decision Tree Regressor: {'MSE': 5.56020278833967, 'R2': 0.8199841878983295, 'MAE': 1.1749049429657794}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-2.0666361  -2.00300222 -1.58969525 -2.07784208 -1.54387187]
Test set scores for Random Forest Regresso

In [None]:
dt_less_restricted_total_pp_results, dt_less_restricted_total_pp_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_total_pp_datasets,
    target_column='Total_PP',
    feature_set_name='DT',
    exclusion_type='less_restricted'
)

Processing datasets for target column: Total_PP with exclusion type: less_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_pp_DT_less_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.37256398 -1.50388331 -1.00671781 -1.37660052 -1.0549386 ]
Test set scores for Linear Regression: {'MSE': 0.9989182694324558, 'R2': 0.9676592580630005, 'MAE': 0.5590792611170265}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-4.2557403  -4.36262866 -4.16719493 -4.40332805 -3.56339144]
Test set scores for Decision Tree Regressor: {'MSE': 5.414448669201521, 'R2': 0.8247030888310152, 'MAE': 1.1660329531051965}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-2.10744656 -2.02354339 -1.57953067 -2.04242758 -1.54041141]
Test set scores for Random Forest Regres

In [None]:
print("Column to Predict/Classify: Total_PP. Feature Set: less_restricted")
print("Best Model Info using MI Feature Selection method:", mi_less_restricted_total_pp_best_model)
print("Best Model Info using RF Feature Selection method:", rf_less_restricted_total_pp_best_model)
print("Best Model Info using DT Feature Selection method:", dt_less_restricted_total_pp_best_model)

Column to Predict/Classify: Total_PP. Feature Set: less_restricted
Best Model Info using MI Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.90718654, 0.90646434, 0.90209358, 0.89144211, 0.90862407]), 'dataset': 'predict_total_pp_MI_less_restricted', 'test_scores': {'Accuracy': 0.9062103929024081, 'F1-Score': 0.9062467373667265}}
Best Model Info using RF Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.90804161, 0.90488385, 0.90360846, 0.89605874, 0.90940219]), 'dataset': 'predict_total_pp_RF_less_restricted', 'test_scores': {'Accuracy': 0.9024081115335868, 'F1-Score': 0.9024006018517943}}
Best Model Info using DT Feature Selection method: {'model_name': 'Neural Network', 'params': array([0.90539846, 0.91867614, 0.91194189, 0.91423441, 0.91504433]), 'dataset': 'predict_total_pp_DT_less_restricted', 'test_scores': {'Accuracy': 0.9188846641318125, 'F1-Score': 0.9184544024331468}}


### 4.3.3 Predicting Total_SPA feature

#### CPU Tests

In [None]:
# Analyze 'Total_SPA' using the original dataset without feature selection
original_total_spa_results, original_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_SPA',
    group_col='Year',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_SPA
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-41.08611485 -41.67571183 -36.9647161  -43.15098941 -37.40749107]
Test set scores for Linear Regression: {'MSE': 34.177352829641045, 'R2': 0.7599340965367161, 'MAE': 4.224361155398544}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-39.5946318  -39.34141741 -40.16540616 -40.50519759 -35.59599497]
Test set scores for Decision Tree Regressor: {'MSE': 37.9212650625926, 'R2': 0.7336363994280253, 'MAE': 2.768247261583435}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-28.74651896 -29.11211397 -27.96492396 -28.80380947 -24.25517222]
Test set scores for Random Forest Regress

In [None]:
# Analyze 'Total_SPA' using the original dataset without feature selection
original_total_spa_results_zero, original_total_spa_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_SPA',
    group_col='Year',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_SPA
No specific columns to scale provided. Scaling all columns except the target.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-23.77778247 -25.96166354 -22.84757956 -27.05530389 -22.61638882]
Test set scores for Linear Regression: {'MSE': 3.7105348177551872e+22, 'R2': -2.6063250064932726e+20, 'MAE': 4849140373.876287}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-52.4107541  -51.59531669 -50.85697219 -51.06913342 -50.34671743]
Test set scores for Decision Tree Regressor: {'MSE': 48.39383574753212, 'R2': 0.66007578296967, 'MAE': 3.056277134005271}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-26.59775648 -27.8402

In [None]:
# Analyze 'Total_SPA' using the original dataset without feature selection
original_total_spa_results_knn, original_total_spa_best_model_knn = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_SPA',
    group_col='Year',
    imputation_strategy='knn',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_SPA
No specific columns to scale provided. Scaling all columns except the target.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-38.22684684 -39.21160438 -34.09947624 -40.99316432 -35.53946991]
Test set scores for Linear Regression: {'MSE': 1.5892364841299968e+23, 'R2': -1.116299130249192e+21, 'MAE': 10035540409.853456}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-45.4634289  -44.25665867 -54.64277808 -58.09473738 -51.50672337]
Test set scores for Decision Tree Regressor: {'MSE': 44.75245207366756, 'R2': 0.6856533069481978, 'MAE': 3.0363424668557752}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-24.96588457 -26.4

In [None]:
dt_more_restricted_total_spa_results_zero, dt_more_restricted_total_spa_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=dt_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='DT',
    group_col='Year',
    imputation_strategy='zero',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_DT_more_restricted
Column 'Year' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-24.27397323 -26.09608914 -22.32110002 -27.25087396 -22.80329723]
Test set scores for Linear Regression: {'MSE': 19.429588859078947, 'R2': 0.8635241931513883, 'MAE': 2.2766407800611366}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-54.07974606 -51.19315722 -40.54849456 -48.94769335 -53.84198898]
Test set scores for Decision Tree Regressor: {'MSE': 43.37672615914027, 'R2': 0.6953165739143397, 'MAE': 2.962100774173018}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-25.34184357 -28.3440171  -23.61153897 

#### Several Imputations and Test - T4 GPU

In [None]:
# Analyze 'Total_SPA' using the original dataset without feature selection
original_total_spa_results, original_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_SPA',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_SPA
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-41.08611485 -41.67571183 -36.9647161  -43.15098941 -37.40749107]
Test set scores for Linear Regression: {'MSE': 34.177352829641045, 'R2': 0.7599340965367161, 'MAE': 4.224361155398544}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-39.5946318  -39.34141741 -40.16540616 -40.50519759 -35.59599497]
Test set scores for Decision Tree Regressor: {'MSE': 37.9212650625926, 'R2': 0.7336363994280253, 'MAE': 2.768247261583435}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-28.74651896 -29.11211397 -27.96492396 -28.80380947 -24.25517222]
Test set scores for Random Forest Regress

In [None]:
# Analyze 'Total_SPA' using the original dataset without feature selection
original_total_spa_results_mean, original_total_spa_best_model_mean = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_SPA',
    imputation_strategy='mean',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_SPA
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-33.19368593 -34.65878617 -30.44891976 -36.98840841 -30.67205946]
Test set scores for Linear Regression: {'MSE': 2.275338377761459e+22, 'R2': -1.5982254859371577e+20, 'MAE': 3797252677.374121}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-54.65277635 -53.76001448 -38.70836598 -52.83177034 -50.58113865]
Test set scores for Decision Tree Regressor: {'MSE': 40.15441479516395, 'R2': 0.7179504827688046, 'MAE': 2.833893365119601}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-25.69775434 -27.32801441 -22.56326827 -29.0336

In [None]:
# Analyze 'Total_SPA' using the original dataset without feature selection
original_total_spa_results_median, original_total_spa_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_SPA',
    imputation_strategy='median',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_SPA
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-29.0716771  -31.28328598 -26.89764604 -33.48949842 -27.88004546]
Test set scores for Linear Regression: {'MSE': 2.583650379924717e+23, 'R2': -1.814787604474627e+21, 'MAE': 12795680070.339197}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-58.46391576 -50.38579052 -37.8712312  -56.8469405  -52.0754671 ]
Test set scores for Decision Tree Regressor: {'MSE': 36.76391596115539, 'R2': 0.7417657609688051, 'MAE': 2.7030865371074495}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-25.73534395 -27.65642793 -22.81891537 -28.855

In [None]:
# Analyze 'Total_SPA' using the original dataset without feature selection
original_total_spa_results_knn, original_total_spa_best_model_knn = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_SPA',
    imputation_strategy='knn',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_SPA
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-38.22684684 -39.21160438 -34.09947624 -40.99316432 -35.53946991]
Test set scores for Linear Regression: {'MSE': 1.5892364841299968e+23, 'R2': -1.116299130249192e+21, 'MAE': 10035540409.853456}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-45.4634289  -44.25665867 -54.64277808 -58.09473738 -51.50672337]
Test set scores for Decision Tree Regressor: {'MSE': 44.75245207366756, 'R2': 0.6856533069481978, 'MAE': 3.0363424668557752}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-24.96588457 -26.41139283 -23.62259921 -30.15

In [None]:
# Analyze 'Total_SPA' using the original dataset without feature selection
original_total_spa_results_zero, original_total_spa_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='Total_SPA',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_SPA
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-23.77778247 -25.96166354 -22.84757956 -27.05530389 -22.61638882]
Test set scores for Linear Regression: {'MSE': 3.7105348177551872e+22, 'R2': -2.6063250064932726e+20, 'MAE': 4849140373.876287}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-52.4107541  -51.59531669 -50.85697219 -51.06913342 -50.34671743]
Test set scores for Decision Tree Regressor: {'MSE': 48.39383574753212, 'R2': 0.66007578296967, 'MAE': 3.056277134005271}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-26.59775648 -27.84029868 -22.85979511 -28.94598

#### More Restricted - L4 GPU

In [None]:
# Analyze 'Total_SPA' using the original dataset without feature selection
original_total_spa_results, original_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,  # Use the main dataset dictionary
    target_column='Total_SPA',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Total_SPA
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-41.08611485 -41.67571183 -36.9647161  -43.15098941 -37.40749107]
Test set scores for Linear Regression: {'MSE': 34.17735282964122, 'R2': 0.7599340965367148, 'MAE': 4.224361155398571}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-39.5946318  -39.34141741 -40.16540616 -40.50519759 -35.59599497]
Test set scores for Decision Tree Regressor: {'MSE': 37.9212650625926, 'R2': 0.7336363994280253, 'MAE': 2.768247261583435}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-28.74651896 -29.11211397 -27.96492396 -28.80380947 -24.25517222]
Test set scores for Random Forest Regresso

In [None]:
mi_more_restricted_total_spa_results, mi_more_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='MI',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_MI_more_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-56.59756112 -55.74291067 -53.33120426 -61.53141922 -56.95883629]
Test set scores for Linear Regression: {'MSE': 47.30797397516009, 'R2': 0.667703008773848, 'MAE': 4.817974382758264}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-42.18349978 -46.46342    -44.54367989 -44.4359919  -39.25385382]
Test set scores for Decision Tree Regressor: {'MSE': 36.90605904563757, 'R2': 0.7407673305705421, 'MAE': 2.8564817144607346}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-31.51557319 -32.90186547 -29.98566509 -33.20265987 -27.55961578]
Test set scores for Random

In [None]:
rf_more_restricted_total_spa_results, rf_more_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='RF',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_RF_more_restricted_regression
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-78.61993623 -77.37373247 -76.00180209 -82.81804374 -76.86288781]
Test set scores for Linear Regression: {'MSE': 70.14243107983546, 'R2': 0.5073109912219997, 'MAE': 6.528072441576014}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-76.39686447 -69.09319427 -72.62431406 -76.19739952 -66.6327697 ]
Test set scores for Decision Tree Regressor: {'MSE': 59.52850465565976, 'R2': 0.58186450766938, 'MAE': 4.293839737891116}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-55.37039213 -55.12268797 -55.95231631 -58.66483712 -51.37243039]
Test set scores f

In [None]:
dt_more_restricted_total_spa_results, dt_more_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='DT',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_DT_more_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-56.82940228 -55.843724   -53.13801577 -61.45601849 -57.04445572]
Test set scores for Linear Regression: {'MSE': 47.2979657979218, 'R2': 0.6677733074339784, 'MAE': 4.815260844611008}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-41.30818841 -47.7466404  -42.08813624 -40.4501978  -42.0981799 ]
Test set scores for Decision Tree Regressor: {'MSE': 37.78231507078805, 'R2': 0.7346124011530578, 'MAE': 2.917927608628941}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-31.16059657 -33.34957404 -30.38637924 -32.36880139 -28.20297734]
Test set scores for Random 

In [None]:
lasso_more_restricted_total_spa_results, lasso_more_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=lasso_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='Lasso',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_Lasso_more_restricted_regression
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-78.61993623 -77.37373247 -76.00180209 -82.81804374 -76.86288781]
Test set scores for Linear Regression: {'MSE': 70.14243107983546, 'R2': 0.5073109912219997, 'MAE': 6.528072441576014}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-76.39686447 -69.09319427 -72.62431406 -76.19739952 -66.6327697 ]
Test set scores for Decision Tree Regressor: {'MSE': 59.52850465565976, 'R2': 0.58186450766938, 'MAE': 4.293839737891116}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-55.37039213 -55.12268797 -55.95231631 -58.66483712 -51.37243039]
Test set score

In [None]:
fr_more_restricted_total_spa_results, fr_more_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=fr_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='FR',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Total_SPA with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_FR_more_restricted_regression
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-80.1391861  -79.60665495 -77.94896346 -83.61738543 -79.36937864]
Test set scores for Linear Regression: {'MSE': 71.7687325766494, 'R2': 0.4958876507402955, 'MAE': 6.657310456381313}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-60.51692391 -64.09356087 -68.74509534 -65.31592644 -62.66279476]
Test set scores for Decision Tree Regressor: {'MSE': 56.644459542034014, 'R2': 0.6021223930381665, 'MAE': 4.28448592773232}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-53.06443433 -56.06865259 -53.80132658 -56.79321783 -50.92219448]
Test set scores 

In [None]:
print("Column to Predict/Classify: Total_SPA. Feature Set: more_restricted")
print("Best Model Info using MI Feature Selection method:", mi_more_restricted_total_spa_best_model)
print("Best Model Info using RF Feature Selection method:", rf_more_restricted_total_spa_best_model)
print("Best Model Info using DT Feature Selection method:", dt_more_restricted_total_spa_best_model)
print("Best Model Info using Lasso Feature Selection method:", lasso_more_restricted_total_spa_best_model)
print("Best Model Info using FR Feature Selection method:", fr_more_restricted_total_spa_best_model)


Column to Predict/Classify: Total_SPA. Feature Set: more_restricted
Best Model Info using MI Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.9600862 , 0.96245377, 0.96360225, 0.96200979, 0.95964594]), 'dataset': 'predict_total_spa_MI_more_restricted', 'test_scores': {'Accuracy': 0.9714828897338403, 'F1-Score': 0.9705604208936631}}
Best Model Info using RF Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.9600862 , 0.96245377, 0.96360225, 0.96200979, 0.95964594]), 'dataset': 'predict_total_spa_RF_more_restricted', 'test_scores': {'Accuracy': 0.9714828897338403, 'F1-Score': 0.9705604208936631}}
Best Model Info using DT Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.95929463, 0.96245377, 0.96360225, 0.96121437, 0.95885415]), 'dataset': 'predict_total_spa_DT_more_restricted', 'test_scores': {'Accuracy': 0.973384030418251, 'F1-Score': 0.9724607202774468}}
Best Model Info using Lasso Feature Selection metho

#### Restricted - L4 GPU

In [None]:
mi_restricted_total_spa_results, mi_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='MI',
    exclusion_type='restricted'
)


Processing datasets for target column: Total_SPA with exclusion type: restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_MI_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-56.59756112 -55.74291067 -53.33120426 -61.53141922 -56.95883629]
Test set scores for Linear Regression: {'MSE': 47.30797397516009, 'R2': 0.667703008773848, 'MAE': 4.817974382758266}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-41.36129019 -46.32327748 -42.39391761 -44.37885156 -39.23820405]
Test set scores for Decision Tree Regressor: {'MSE': 36.972598969591935, 'R2': 0.7402999460121151, 'MAE': 2.8659874178827875}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-31.50582472 -32.78780175 -30.02956185 -33.1305952  -27.5811895 ]
Test set scores for Random Forest R

In [None]:
rf_restricted_total_spa_results, rf_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='RF',
    exclusion_type='restricted'
)

Processing datasets for target column: Total_SPA with exclusion type: restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_RF_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-56.59756112 -55.74291067 -53.33120426 -61.53141922 -56.95883629]
Test set scores for Linear Regression: {'MSE': 47.30797397516009, 'R2': 0.667703008773848, 'MAE': 4.817974382758266}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-41.36129019 -46.32327748 -42.39391761 -44.37885156 -39.23820405]
Test set scores for Decision Tree Regressor: {'MSE': 36.972598969591935, 'R2': 0.7402999460121151, 'MAE': 2.8659874178827875}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-31.50582472 -32.78780175 -30.02956185 -33.1305952  -27.5811895 ]
Test set scores for Random Forest R

In [None]:
dt_regression_restricted_total_spa_results, dt_regression_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='DT',
    exclusion_type='restricted'
)


Processing datasets for target column: Total_SPA with exclusion type: restricted
No datasets found for the given criteria: restricted, DT, Total_SPA


In [None]:
print("Column to Predict/Classify: Total_SPA. Feature Set: restricted")
print("Best Model Info using MI Feature Selection method:", mi_restricted_total_spa_best_model)
print("Best Model Info using RF Feature Selection method:", rf_restricted_total_spa_best_model)
#print("Best Model Info using DT Feature Selection method:", dt_restricted_total_spa_best_model)

Column to Predict/Classify: Total_SPA. Feature Set: restricted
Best Model Info using MI Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.9600862 , 0.96087094, 0.96281039, 0.96200979, 0.96122811]), 'dataset': 'predict_total_spa_MI_restricted', 'test_scores': {'Accuracy': 0.9702154626108999, 'F1-Score': 0.969294386244122}}
Best Model Info using RF Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.9600862 , 0.96087094, 0.96281039, 0.96200979, 0.96122811]), 'dataset': 'predict_total_spa_RF_restricted', 'test_scores': {'Accuracy': 0.9702154626108999, 'F1-Score': 0.969294386244122}}


#### Less Restricted - L4 GPU

In [None]:
mi_less_restricted_total_spa_results, mi_less_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='MI',
    exclusion_type='less_restricted'
)


Processing datasets for target column: Total_SPA with exclusion type: less_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_MI_less_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-56.59756112 -55.74291067 -53.33120426 -61.53141922 -56.95883629]
Test set scores for Linear Regression: {'MSE': 47.30797397516009, 'R2': 0.667703008773848, 'MAE': 4.817974382758266}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-41.36129019 -46.32327748 -42.39391761 -44.37885156 -39.23820405]
Test set scores for Decision Tree Regressor: {'MSE': 36.972598969591935, 'R2': 0.7402999460121151, 'MAE': 2.8659874178827875}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-31.50582472 -32.78780175 -30.02956185 -33.1305952  -27.5811895 ]
Test set scores for Rando

In [None]:
rf_less_restricted_total_spa_results, rf_less_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='RF',
    exclusion_type='less_restricted'
)


Processing datasets for target column: Total_SPA with exclusion type: less_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_RF_less_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-56.59756112 -55.74291067 -53.33120426 -61.53141922 -56.95883629]
Test set scores for Linear Regression: {'MSE': 47.30797397516009, 'R2': 0.667703008773848, 'MAE': 4.817974382758266}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-41.36129019 -46.32327748 -42.39391761 -44.37885156 -39.23820405]
Test set scores for Decision Tree Regressor: {'MSE': 36.972598969591935, 'R2': 0.7402999460121151, 'MAE': 2.8659874178827875}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-31.50582472 -32.78780175 -30.02956185 -33.1305952  -27.5811895 ]
Test set scores for Rando

In [None]:
dt_less_restricted_total_spa_results, dt_less_restricted_total_spa_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_total_spa_datasets,
    target_column='Total_SPA',
    feature_set_name='DT',
    exclusion_type='less_restricted'
)


Processing datasets for target column: Total_SPA with exclusion type: less_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_total_spa_DT_less_restricted
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-56.82940228 -55.843724   -53.13801577 -61.45601849 -57.04445572]
Test set scores for Linear Regression: {'MSE': 47.297965797921805, 'R2': 0.6677733074339784, 'MAE': 4.815260844611011}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-40.97465713 -47.74743217 -42.08813624 -40.4501978  -42.0981799 ]
Test set scores for Decision Tree Regressor: {'MSE': 37.78231507078805, 'R2': 0.7346124011530578, 'MAE': 2.917927608628941}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-31.18854615 -33.39516573 -30.44958327 -32.34025715 -28.18541582]
Test set scores for Rando

In [None]:
print("Column to Predict/Classify: Total_SPA. Feature Set: less_restricted")
print("Best Model Info using MI Feature Selection method:", mi_less_restricted_total_spa_best_model)
print("Best Model Info using RF Feature Selection method:", rf_less_restricted_total_spa_best_model)
print("Best Model Info using DT Feature Selection method:", dt_less_restricted_total_spa_best_model)

Column to Predict/Classify: Total_SPA. Feature Set: less_restricted
Best Model Info using MI Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.9600862 , 0.96087094, 0.96281039, 0.96200979, 0.96122811]), 'dataset': 'predict_total_spa_MI_less_restricted', 'test_scores': {'Accuracy': 0.9702154626108999, 'F1-Score': 0.969294386244122}}
Best Model Info using RF Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.9600862 , 0.96087094, 0.96281039, 0.96200979, 0.96122811]), 'dataset': 'predict_total_spa_RF_less_restricted', 'test_scores': {'Accuracy': 0.9702154626108999, 'F1-Score': 0.969294386244122}}
Best Model Info using DT Feature Selection method: {'model_name': 'Random Forest', 'params': array([0.9600862 , 0.96166317, 0.96201924, 0.95884131, 0.96043804]), 'dataset': 'predict_total_spa_DT_less_restricted', 'test_scores': {'Accuracy': 0.9727503168567807, 'F1-Score': 0.971827151316785}}


### 4.3.4 Predicting Passed feature

#### CPU Tests

In [None]:
# Analyze 'Passed' using the original dataset without feature selection
original_passed_results_mean, original_passed_best_model_mean = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,  # Use the main dataset dictionary
    target_column='Passed',
    group_col='Year',
    imputation_strategy='mean',
    cv=10,  # Use 10-fold cross-validation
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Passed
No specific columns to scale provided. Scaling all columns except the target.
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.3}
Cross-validation scores for Logistic Regression: [0.9895015  0.99026909 0.99266178 0.99034072 0.98777583]
Test set scores for Logistic Regression: {'Accuracy': 0.9955640050697085, 'F1-Score': 0.9955248976246943}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.98778549 0.98798369 0.98787248 0.98637074 0.98690559]
Test set scores for Decision Tree: {'Accuracy': 0.9936628643852978, 'F1-Score': 0.9936258628759812}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimat

In [None]:
mi_more_restricted_passed_results, mi_more_restricted_passed_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_passed_datasets,
    target_column='Passed',
    feature_set_name='MI',
    group_col='Year',
    imputation_strategy='mean',
    cv=10,  # Use 10-fold cross-validation
    exclusion_type='more_restricted'
)

Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_passed_MI_more_restricted
Column 'Year' not found. Scaling without grouping.
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.15}
Cross-validation scores for Logistic Regression: [0.99273181 0.99273181 0.99266178 0.99272605 0.99103561]
Test set scores for Logistic Regression: {'Accuracy': 0.9974651457541192, 'F1-Score': 0.9974503451503924}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.98958587 0.98659102 0.9889065  0.99272605 0.99049128]
Test set scores for Decision Tree: {'Accuracy': 0.9942965779467681, 'F1-Score': 0.9942462969460354}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 20, 'n_estimators': 100}


#### T4 GPU Results

In [None]:
# Analyze 'Passed' using the original dataset without feature selection
original_passed_results, original_passed_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,  # Use the main dataset dictionary
    target_column='Passed',
    cv=10,  # Use 10-fold cross-validation
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Passed
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.1}
Cross-validation scores for Logistic Regression: [0.98746001 0.99088652 0.98731786 0.98487063 0.98655168]
Test set scores for Logistic Regression: {'Accuracy': 0.9917617237008872, 'F1-Score': 0.9914249158119939}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.98788634 0.98788634 0.98722133 0.98797418 0.98614594]
Test set scores for Decision Tree: {'Accuracy': 0.9923954372623575, 'F1-Score': 0.9922105878911961}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 100}
Cross-validation scores for Random Forest: [0.99026909 0.99267129 0

In [None]:
# Analyze 'Passed' using the original dataset without feature selection
original_passed_results_mean, original_passed_best_model_mean = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,  # Use the main dataset dictionary
    target_column='Passed',
    cv=10,  # Use 10-fold cross-validation,
    imputation_strategy='mean',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Passed
Column 'None' not found. Scaling without grouping.
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.3}
Cross-validation scores for Logistic Regression: [0.9895015  0.99026909 0.99266178 0.99034072 0.98777583]
Test set scores for Logistic Regression: {'Accuracy': 0.9955640050697085, 'F1-Score': 0.9955248976246943}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.98778549 0.98798369 0.98787248 0.98637074 0.98690559]
Test set scores for Decision Tree: {'Accuracy': 0.9936628643852978, 'F1-Score': 0.9936258628759812}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 100}
Cross-validation

In [None]:
# Analyze 'Passed' using the original dataset without feature selection
original_passed_results_zero, original_passed_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,  # Use the main dataset dictionary
    target_column='Passed',
    cv=10,  # Use 10-fold cross-validation,
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Passed
Column 'None' not found. Scaling without grouping.
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.1}
Cross-validation scores for Logistic Regression: [0.99026909 0.99111665 0.99181119 0.99188449 0.98844296]
Test set scores for Logistic Regression: {'Accuracy': 0.9955640050697085, 'F1-Score': 0.9954979570845527}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.98798369 0.99111665 0.98881867 0.98787676 0.9875627 ]
Test set scores for Decision Tree: {'Accuracy': 0.9930291508238276, 'F1-Score': 0.9929676962673767}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 100}
Cross-validation

In [None]:
# Analyze 'Passed' using the original dataset without feature selection
original_passed_results_median, original_passed_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,  # Use the main dataset dictionary
    target_column='Passed',
    cv=10,  # Use 10-fold cross-validation,
    imputation_strategy='median',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Passed
Column 'None' not found. Scaling without grouping.
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.3}
Cross-validation scores for Logistic Regression: [0.98864728 0.99026909 0.99266178 0.99034072 0.98854239]
Test set scores for Logistic Regression: {'Accuracy': 0.9955640050697085, 'F1-Score': 0.9955248976246943}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.98778549 0.98882903 0.98550703 0.98562595 0.98614594]
Test set scores for Decision Tree: {'Accuracy': 0.9917617237008872, 'F1-Score': 0.9916390631570262}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 200}
Cross-validation

In [None]:
# Analyze 'Passed' using the original dataset without feature selection
original_passed_results_knn, original_passed_best_model_knn = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,  # Use the main dataset dictionary
    target_column='Passed',
    cv=10,  # Use 10-fold cross-validation,
    imputation_strategy='knn',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Passed
Column 'None' not found. Scaling without grouping.
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.3}
Cross-validation scores for Logistic Regression: [0.99026909 0.99111665 0.99103106 0.99034072 0.98679195]
Test set scores for Logistic Regression: {'Accuracy': 0.9955640050697085, 'F1-Score': 0.9955248976246943}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.98778549 0.99118804 0.98776963 0.98626032 0.99103561]
Test set scores for Decision Tree: {'Accuracy': 0.9904942965779467, 'F1-Score': 0.9903527651811843}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 100}
Cross-validation

#### L4 GPU Results

##### More Restricted

In [None]:
# Analyze 'Passed' using the original dataset without feature selection
original_passed_results, original_passed_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,  # Use the main dataset dictionary
    target_column='Passed',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)


Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: combined_df_Passed
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.1}
Cross-validation scores for Logistic Regression: [0.98746001 0.99088652 0.98731786 0.98487063 0.98655168]
Test set scores for Logistic Regression: {'Accuracy': 0.9917617237008872, 'F1-Score': 0.9914249158119939}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.98788634 0.98788634 0.98722133 0.98797418 0.98614594]
Test set scores for Decision Tree: {'Accuracy': 0.9923954372623575, 'F1-Score': 0.9922105878911961}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 100}
Cross-validation scores for Random Forest: [0.99026909 0.99267129 0

In [None]:
mi_more_restricted_passed_results, mi_more_restricted_passed_best_model = analyze_datasets_for_target_column(
    datasets_dict=mi_passed_datasets,
    target_column='Passed',
    feature_set_name='MI',
    exclusion_type='more_restricted'
)

Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_passed_MI_more_restricted
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.3}
Cross-validation scores for Logistic Regression: [0.99195696 0.99273181 0.99181119 0.9926655  0.99181599]
Test set scores for Logistic Regression: {'Accuracy': 0.9961977186311787, 'F1-Score': 0.9961755177255888}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.99267129 0.98891528 0.98806746 0.99188449 0.99118106]
Test set scores for Decision Tree: {'Accuracy': 0.9942965779467681, 'F1-Score': 0.9942462969460354}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 100}
Cross-validation scores for Random Forest: [0.993512

In [None]:
rf_more_restricted_passed_results, rf_more_restricted_passed_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_passed_datasets,
    target_column='Passed',
    feature_set_name='RF',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_passed_RF_more_restricted
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.3}
Cross-validation scores for Logistic Regression: [0.99195696 0.99273181 0.99181119 0.9926655  0.99181599]
Test set scores for Logistic Regression: {'Accuracy': 0.9961977186311787, 'F1-Score': 0.9961755177255888}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.99267129 0.98891528 0.98806746 0.99188449 0.99118106]
Test set scores for Decision Tree: {'Accuracy': 0.9942965779467681, 'F1-Score': 0.9942462969460354}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 100}
Cross-validation scores for Random Forest: [0.993512

In [None]:
dt_more_restricted_passed_results, dt_more_restricted_passed_best_model = analyze_datasets_for_target_column(
    datasets_dict=dt_passed_datasets,
    target_column='Passed',
    feature_set_name='DT',
    exclusion_type='more_restricted'
)


Processing datasets for target column: Passed with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_passed_DT_more_restricted
Performing classification analysis...
Evaluating model: Logistic Regression
Best parameters for Logistic Regression: {'C': 0.3}
Cross-validation scores for Logistic Regression: [0.99195696 0.99273181 0.99181119 0.9926655  0.99181599]
Test set scores for Logistic Regression: {'Accuracy': 0.9961977186311787, 'F1-Score': 0.9961755177255888}
Evaluating model: Decision Tree
Best parameters for Decision Tree: {'max_depth': 5}
Cross-validation scores for Decision Tree: [0.99273181 0.98891528 0.98732171 0.99188449 0.99118106]
Test set scores for Decision Tree: {'Accuracy': 0.9942965779467681, 'F1-Score': 0.9942462969460354}
Evaluating model: Random Forest
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 200}
Cross-validation scores for Random Forest: [0.993512

In [None]:
print("Column to Classify: Passed. Feature Set: more_restricted")
print("Best Model Info using MI Feature Selection method:", mi_more_restricted_passed_best_model)
print("Best Model Info using RF Feature Selection method:", rf_more_restricted_passed_best_model)
print("Best Model Info using DT Feature Selection method:", dt_more_restricted_passed_best_model)

Column to Classify: Passed. Feature Set: more_restricted
Best Model Info using MI Feature Selection method: {'model_name': 'Logistic Regression', 'params': array([0.99195696, 0.99273181, 0.99181119, 0.9926655 , 0.99181599]), 'dataset': 'predict_passed_MI_more_restricted', 'test_scores': {'Accuracy': 0.9961977186311787, 'F1-Score': 0.9961755177255888}}
Best Model Info using RF Feature Selection method: {'model_name': 'Logistic Regression', 'params': array([0.99195696, 0.99273181, 0.99181119, 0.9926655 , 0.99181599]), 'dataset': 'predict_passed_RF_more_restricted', 'test_scores': {'Accuracy': 0.9961977186311787, 'F1-Score': 0.9961755177255888}}
Best Model Info using DT Feature Selection method: {'model_name': 'Logistic Regression', 'params': array([0.99195696, 0.99273181, 0.99181119, 0.9926655 , 0.99181599]), 'dataset': 'predict_passed_DT_more_restricted', 'test_scores': {'Accuracy': 0.9961977186311787, 'F1-Score': 0.9961755177255888}}


### 4.3.5 Predicting FinalGrade feature from students that passed

#### CPU

In [None]:
# Analyze 'FinalGrade' using the original dataset without feature selection
original_finalgrade_results_zero, original_finalgrade_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    group_col='Year',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: filtered_combined_df_FinalGrade
No specific columns to scale provided. Scaling all columns except the target.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.66094957e+00 -3.81845096e+19 -1.53845885e+00 -1.64729813e+00
 -1.43368663e+00]
Test set scores for Linear Regression: {'MSE': 3.081687220586811e+21, 'R2': -1.0226671550599767e+21, 'MAE': 1433338114.2409737}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.05370756 -3.05258986 -3.28384251 -3.17936663 -2.85559656]
Test set scores for Decision Tree Regressor: {'MSE': 3.025924757467669, 'R2': -0.004162214280707399, 'MAE': 1.3010038229034229}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regre

In [None]:
# Analyze 'FinalGrade' using the original dataset without feature selection
original_finalgrade_results_zero, original_finalgrade_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: filtered_combined_df_FinalGrade
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.66094957e+00 -3.81845096e+19 -1.53845885e+00 -1.64729813e+00
 -1.43368663e+00]
Test set scores for Linear Regression: {'MSE': 3.081687220586811e+21, 'R2': -1.0226671550599767e+21, 'MAE': 1433338114.2409737}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.05370756 -3.05258986 -3.28384251 -3.17936663 -2.85559656]
Test set scores for Decision Tree Regressor: {'MSE': 3.025924757467669, 'R2': -0.004162214280707399, 'MAE': 1.3010038229034229}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-1.81021229 -1.71477

In [None]:
# Analyze 'FinalGrade' using the original dataset without feature selection
original_finalgrade_results_remove_miss, original_finalgrade_best_model_remove_miss = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: filtered_combined_df_FinalGrade
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-2.0006911  -1.84215961 -1.9472068  -1.9190993  -1.81520376]
Test set scores for Linear Regression: {'MSE': 1.836522415826088, 'R2': 0.39054452325661215, 'MAE': 1.0485898819811703}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.00810554 -2.77334315 -2.75190128 -3.13746596 -2.64945154]
Test set scores for Decision Tree Regressor: {'MSE': 2.82063259960325, 'R2': 0.06396460457255426, 'MAE': 1.2647467369070156}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-2.08209838 -1.95933419 -1.99900606 -2.16883448 -1.81411228]
Test set scores for Random Forest Regresso

In [None]:
# Analyze 'FinalGrade' using the original dataset without feature selection
original_finalgrade_results_median, original_finalgrade_best_model_median = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    imputation_strategy='median',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: filtered_combined_df_FinalGrade
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.72749264e+00 -8.59933402e+23 -1.64299697e+00 -1.73481778e+00
 -1.50386018e+00]
Test set scores for Linear Regression: {'MSE': 4.586946360181301e+21, 'R2': -1.522191919167607e+21, 'MAE': 1748703589.3578944}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.22682877 -2.98965786 -3.13078757 -3.41534906 -2.82590664]
Test set scores for Decision Tree Regressor: {'MSE': 3.159277334985475, 'R2': -0.04841566744072057, 'MAE': 1.3519265600366601}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-1.7791738  -1.7170476

In [None]:
# Analyze 'FinalGrade' using the original dataset without feature selection
original_finalgrade_results_knn, original_finalgrade_best_model_knn = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    imputation_strategy='knn',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: filtered_combined_df_FinalGrade
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.86063283 -1.82374364 -1.74222725 -1.74090774 -1.60484651]
Test set scores for Linear Regression: {'MSE': 8.808117552972195e+22, 'R2': -2.923000251017256e+22, 'MAE': 7662948760.182307}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.11804294 -3.05998993 -3.57111891 -3.29472256 -2.79123457]
Test set scores for Decision Tree Regressor: {'MSE': 3.0303878458770197, 'R2': -0.005643303567082869, 'MAE': 1.3067294013065014}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-1.82834421 -1.68097199 -1.72181797 -1.843

In [None]:
# Analyze 'FinalGrade' using the original dataset without feature selection
original_finalgrade_results_zero, original_finalgrade_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: filtered_combined_df_FinalGrade
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.66094957e+00 -3.81845096e+19 -1.53845885e+00 -1.64729813e+00
 -1.43368663e+00]
Test set scores for Linear Regression: {'MSE': 3.081687220586811e+21, 'R2': -1.0226671550599767e+21, 'MAE': 1433338114.2409737}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.05370756 -3.05258986 -3.28384251 -3.17936663 -2.85559656]
Test set scores for Decision Tree Regressor: {'MSE': 3.025924757467669, 'R2': -0.004162214280707399, 'MAE': 1.3010038229034229}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-1.81021229 -1.71477

In [None]:
# Analyze 'FinalGrade' using the original dataset without feature selection
original_finalgrade_results_mean, original_finalgrade_best_model_mean = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    imputation_strategy='mean',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: filtered_combined_df_FinalGrade
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.77900037e+00 -1.64048846e+04 -1.66278585e+00 -1.73825033e+00
 -1.50843758e+00]
Test set scores for Linear Regression: {'MSE': 7.943618886770027e+20, 'R2': -2.6361137735019397e+20, 'MAE': 727718760.1275631}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-2.94311812 -3.10380944 -3.11572559 -3.17414727 -2.93639197]
Test set scores for Decision Tree Regressor: {'MSE': 3.220220415900072, 'R2': -0.06863981178709366, 'MAE': 1.3464011747552749}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-1.78768716 -1.7142406

In [None]:
# RF - More Restricted
rf_more_restricted_finalgrade_results_mean, rf_more_restricted_finalgrade_best_model_mean = analyze_datasets_for_target_column(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    imputation_strategy='mean',
    exclusion_type='more_restricted'
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_RF_more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.85498108 -1.94002198 -2.13435664 -1.82414137 -1.99267997]
Test set scores for Linear Regression: {'MSE': 1.7495952905999652, 'R2': 0.7765690944771519, 'MAE': 0.9961799624231823}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.5434959  -3.1667224  -3.45359465 -3.5237446  -3.11567099]
Test set scores for Decision Tree Regressor: {'MSE': 3.160843254694458, 'R2': 0.5963466097522526, 'MAE': 1.310413963617961}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regre

In [None]:
# RF - More Restricted
rf_more_restricted_finalgrade_results_zero, rf_more_restricted_finalgrade_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    imputation_strategy='zero',
    exclusion_type='more_restricted'
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_RF_more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.84427965 -1.88795566 -2.04206462 -1.81835179 -1.97139711]
Test set scores for Linear Regression: {'MSE': 1.6917523040562292, 'R2': 0.783955894688063, 'MAE': 0.9689118640890197}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.54377761 -3.07398499 -3.63340837 -3.27324677 -3.23467901]
Test set scores for Decision Tree Regressor: {'MSE': 3.1296027555708896, 'R2': 0.6003361569610651, 'MAE': 1.2757103012944102}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regr

#### GPU T4

In [None]:
# Analyze 'FinalGrade' using the original dataset without feature selection
original_finalgrade_results_zero, original_finalgrade_best_model_zero = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    imputation_strategy='zero',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: filtered_combined_df_FinalGrade
Column 'None' not found. Scaling without grouping.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-1.66094957e+00 -3.81845096e+19 -1.53845885e+00 -1.64729813e+00
 -1.43368663e+00]
Test set scores for Linear Regression: {'MSE': 3.081687220586811e+21, 'R2': -1.0226671550599767e+21, 'MAE': 1433338114.2409737}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.05370756 -3.05258986 -3.28384251 -3.17936663 -2.85559656]
Test set scores for Decision Tree Regressor: {'MSE': 3.025924757467669, 'R2': -0.004162214280707399, 'MAE': 1.3010038229034229}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-1.81021229 -1.71477

In [None]:
# Analyze 'FinalGrade' using the original dataset without feature selection
original_finalgrade_results, original_finalgrade_best_model = analyze_datasets_for_target_column(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    exclusion_type='more_restricted'  # When using original dataframe only more_restricted available
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: filtered_combined_df_FinalGrade
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-2.0006911  -1.84215961 -1.9472068  -1.9190993  -1.81520376]
Test set scores for Linear Regression: {'MSE': 1.836522415826088, 'R2': 0.39054452325661215, 'MAE': 1.0485898819811703}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.00810554 -2.77334315 -2.75190128 -3.13746596 -2.64945154]
Test set scores for Decision Tree Regressor: {'MSE': 2.82063259960325, 'R2': 0.06396460457255426, 'MAE': 1.2647467369070156}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-2.08209838 -1.95933419 -1.99900606 -2.16883448 -1.81411228]
Test set scores for Random Forest Regresso

In [None]:
# RF - More Restricted
rf_more_restricted_finalgrade_results, rf_more_restricted_finalgrade_best_model = analyze_datasets_for_target_column(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted'
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_RF_more_restricted
Dropping rows with NaN values in the target variable.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-2.35568686 -2.31094363 -2.615568   -2.25695608 -2.42648909]
Test set scores for Linear Regression: {'MSE': 2.150599460534049, 'R2': 0.7253591230693722, 'MAE': 1.0997911923342618}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.17594436 -3.21441579 -3.25460283 -3.32660802 -3.29355657]
Test set scores for Decision Tree Regressor: {'MSE': 2.6030025911558505, 'R2': 0.6675852814962516, 'MAE': 1.1931070864932396}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-2.3572851  -2.37910687 -2.47996912 -2.4154

In [None]:
# Lasso - More Restricted
lasso_more_restricted_finalgrade_results, lasso_more_restricted_finalgrade_best_model = analyze_datasets_for_target_column(
    datasets_dict=lasso_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='Lasso',
    exclusion_type='more_restricted'
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_Lasso_more_restricted
Dropping rows with NaN values in the target variable.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-2.35568686 -2.31094363 -2.615568   -2.25695608 -2.42648909]
Test set scores for Linear Regression: {'MSE': 2.150599460534049, 'R2': 0.7253591230693722, 'MAE': 1.0997911923342618}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.17594436 -3.21441579 -3.25460283 -3.32660802 -3.29355657]
Test set scores for Decision Tree Regressor: {'MSE': 2.6030025911558505, 'R2': 0.6675852814962516, 'MAE': 1.1931070864932396}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-2.3572851  -2.37910687 -2.47996912 -2.4

In [None]:
# FR - More Restricted
fr_more_restricted_finalgrade_results, fr_more_restricted_finalgrade_best_model = analyze_datasets_for_target_column(
    datasets_dict=fr_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='FR',
    exclusion_type='more_restricted'
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_FR_more_restricted
Dropping rows with NaN values in the target variable.
Performing regression analysis...
Evaluating model: Linear Regression
Cross-validation scores for Linear Regression: [-2.35773592 -2.31601582 -2.61897769 -2.25351371 -2.43895817]
Test set scores for Linear Regression: {'MSE': 2.156141568527234, 'R2': 0.7246513718459462, 'MAE': 1.098872561129167}
Evaluating model: Decision Tree Regressor
Cross-validation scores for Decision Tree Regressor: [-3.02345669 -2.93598351 -3.10919102 -3.05124064 -3.1000556 ]
Test set scores for Decision Tree Regressor: {'MSE': 2.462837109199588, 'R2': 0.6854850213531158, 'MAE': 1.149253198270344}
Evaluating model: Random Forest Regressor
Cross-validation scores for Random Forest Regressor: [-2.3079899  -2.27356729 -2.43943595 -2.3606815

#### Improve results - Fine Tunning
Between Bagging, Boosting and Stacking, with better results on stacking and then boosting.

##### A100 GPU

In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
original_finalgrade_best_model = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    group_col='Year',
    exclusion_type='more_restricted',
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Processing complete for datasets. Total time: 172.39 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7d378afa70a0>, 'dataset': 'filtered_combined_df_FinalGrade', 'test_scores': {'MSE': 1.6792544847288797, 'R2': 0.4427343582933829, 'MAE': 1.006535434283224}}


In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
original_finalgrade_best_model_knn = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    group_col='Year',
    imputation_strategy='knn',
    exclusion_type='more_restricted',
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
No specific columns to scale provided. Scaling all columns except the target.
Processing complete for datasets. Total time: 421.16 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7d3848321060>, 'dataset': 'filtered_combined_df_FinalGrade', 'test_scores': {'MSE': 1.417487463820378, 'R2': 0.5296025299795568, 'MAE': 0.9211656989314588}}


In [None]:
# Apply the optimized analysis for RF with more_restricted
rf_more_restricted_finalgrade_results_mean, rf_more_restricted_finalgrade_best_model_mean = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategy='mean',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 436.34 seconds
Best model: {'model_name': 'predict_finalgrade_RF_more_restricted_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7d378ab743d0>, 'dataset': 'predict_finalgrade_RF_more_restricted', 'test_scores': {'MSE': 1.3925497758755025, 'R2': 0.8221653549931494, 'MAE': 0.8832376288494231}}


In [None]:
# Apply the optimized analysis for RF with more_restricted
rf_more_restricted_finalgrade_results_knn, rf_more_restricted_finalgrade_best_model_knn = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategy='knn',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 467.72 seconds
Best model: {'model_name': 'predict_finalgrade_RF_more_restricted_Stacking', 'params': StackingRegressor(estimators=[('xgb',
                               XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, gamm

In [None]:
# Apply the optimized analysis for RF with more_restricted
rf_more_restricted_finalgrade_results_zero, rf_more_restricted_finalgrade_best_model_zero = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategy='zero',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 585.47 seconds
Best model: {'model_name': 'predict_finalgrade_RF_more_restricted_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7d378ab828f0>, 'dataset': 'predict_finalgrade_RF_more_restricted', 'test_scores': {'MSE': 1.406426058269903, 'R2': 0.8203932935585255, 'MAE': 0.8844594491722937}}


##### L4 GPU

In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
original_finalgrade_best_model = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    exclusion_type='more_restricted',
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Processing complete for datasets. Total time: 169.93 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7d3346791630>, 'dataset': 'filtered_combined_df_FinalGrade', 'test_scores': {'MSE': 1.6792544847288797, 'R2': 0.4427343582933829, 'MAE': 1.006535434283224}}


In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
original_finalgrade_best_model = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    group_col='Year',
    imputation_strategy='mean',
    exclusion_type='more_restricted',
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
No specific columns to scale provided. Scaling all columns except the target.
Processing complete for datasets. Total time: 351.10 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7d3345f8bd90>, 'dataset': 'filtered_combined_df_FinalGrade', 'test_scores': {'MSE': 1.4264121162681838, 'R2': 0.5266408572738632, 'MAE': 0.9178229082371265}}


##### CPU

In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
original_finalgrade_best_model = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    group_col='Year',
    exclusion_type='more_restricted',
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Processing complete for datasets. Total time: 219.13 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7cd6ee2d81c0>, 'dataset': 'filtered_combined_df_FinalGrade', 'test_scores': {'MSE': 1.6792544847288797, 'R2': 0.4427343582933829, 'MAE': 1.006535434283224}}


In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
original_finalgrade_best_model_knn = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    group_col='Year',
    imputation_strategy='knn',
    exclusion_type='more_restricted',
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
No specific columns to scale provided. Scaling all columns except the target.
Processing complete for datasets. Total time: 638.80 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7cd6ee2d8130>, 'dataset': 'filtered_combined_df_FinalGrade', 'test_scores': {'MSE': 1.417487463820378, 'R2': 0.5296025299795568, 'MAE': 0.9211656989314588}}


In [None]:
# Apply the optimized analysis for RF with more_restricted
rf_more_restricted_finalgrade_results_mean, rf_more_restricted_finalgrade_best_model_mean = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategy='mean',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 654.31 seconds
Best model: {'model_name': 'predict_finalgrade_RF_more_restricted_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7cd6f05d8d60>, 'dataset': 'predict_finalgrade_RF_more_restricted', 'test_scores': {'MSE': 1.3925497758755025, 'R2': 0.8221653549931494, 'MAE': 0.8832376288494231}}


In [None]:
# Apply the optimized analysis for RF with more_restricted
rf_more_restricted_finalgrade_results_mean, rf_more_restricted_finalgrade_best_model_mean = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    group_col='Year',
    imputation_strategy='mean',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'Year' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 887.28 seconds
Best model: {'model_name': 'predict_finalgrade_RF_more_restricted_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7cd6ed593f40>, 'dataset': 'predict_finalgrade_RF_more_restricted', 'test_scores': {'MSE': 1.3925497758755025, 'R2': 0.8221653549931494, 'MAE': 0.8832376288494231}}


In [None]:
# Apply the optimized analysis for RF with more_restricted
rf_more_restricted_finalgrade_results_knn, rf_more_restricted_finalgrade_best_model_knn = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategy='knn',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 678.13 seconds
Best model: {'model_name': 'predict_finalgrade_RF_more_restricted_Stacking', 'params': StackingRegressor(estimators=[('xgb',
                               XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, gamm

In [None]:
# Apply the optimized analysis for RF with more_restricted
rf_more_restricted_finalgrade_results_zero, rf_more_restricted_finalgrade_best_model_zero = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategy='zero',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 819.71 seconds
Best model: {'model_name': 'predict_finalgrade_RF_more_restricted_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7cd6ec077d90>, 'dataset': 'predict_finalgrade_RF_more_restricted', 'test_scores': {'MSE': 1.406426058269903, 'R2': 0.8203932935585255, 'MAE': 0.8844594491722937}}


Old results

In [None]:
# Apply the optimized analysis for RF with more_restricted
rf_more_restricted_finalgrade_results_mean, rf_more_restricted_finalgrade_best_model_mean = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategy='mean',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 658.05 seconds
Best model: {'model_name': 'predict_finalgrade_RF_more_restricted_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x79947d02f940>, 'dataset': 'predict_finalgrade_RF_more_restricted', 'test_scores': {'MSE': 1.3925497758755025, 'R2': 0.8221653549931494, 'MAE': 0.8832376288494231}}


In [None]:
# Apply the optimized analysis for Lasso with more_restricted
lasso_more_restricted_finalgrade_results_zero, lasso_more_restricted_finalgrade_best_model_zero = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=lasso_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='Lasso',
    exclusion_type='more_restricted',
    imputation_strategy='zero',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 946.76 seconds
Best model: {'model_name': 'predict_finalgrade_Lasso_more_restricted_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x79947d00e3e0>, 'dataset': 'predict_finalgrade_Lasso_more_restricted', 'test_scores': {'MSE': 1.406491126503044, 'R2': 0.8203849840629928, 'MAE': 0.8903159880405647}}


##### T4 GPU

In [None]:
# Apply the optimized analysis for RF with more_restricted
rf_more_restricted_finalgrade_results, rf_more_restricted_finalgrade_best_model = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategy='zero',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_RF_more_restricted
Column 'None' not found. Scaling without grouping.
Performing regression analysis with advanced ensemble techniques...
Evaluating model: XGBoost
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Cross-validation scores for XGBoost: -1.6715343238650893 ± 0.06155015826305219
Test set scores for XGBoost: {'MSE': 1.4070921377302954, 'R2': 0.8203082323230585, 'MAE': 0.8877477020044002}
Evaluating model: LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001864 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 158
[LightGBM] [Info] Number of data points in the train set: 4897, n

In [None]:
# Apply the optimized analysis for Lasso with more_restricted
lasso_more_restricted_finalgrade_results, lasso_more_restricted_finalgrade_best_model = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=lasso_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='Lasso',
    exclusion_type='more_restricted',
    imputation_strategy='zero',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_Lasso_more_restricted
Column 'None' not found. Scaling without grouping.
Performing regression analysis with advanced ensemble techniques...
Evaluating model: XGBoost
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}
Cross-validation scores for XGBoost: -1.6704152613303211 ± 0.05926084313181137
Test set scores for XGBoost: {'MSE': 1.4821339357127656, 'R2': 0.8107250693108077, 'MAE': 0.9054457450206288}
Evaluating model: LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001715 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 168
[LightGBM] [Info] Number of data points in the train set: 4897

In [None]:
# Apply the optimized analysis for RF with more_restricted
fr_more_restricted_finalgrade_results, fr_more_restricted_finalgrade_best_model = analyze_final_grade_with_advanced_ensembles(
    datasets_dict=fr_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='FR',
    exclusion_type='more_restricted',
    imputation_strategy='zero',  # Use the best strategy found
)


Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_FR_more_restricted
Column 'None' not found. Scaling without grouping.
Performing regression analysis with advanced ensemble techniques...
Evaluating model: XGBoost
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Cross-validation scores for XGBoost: -1.6943884781841532 ± 0.07533710014043599
Test set scores for XGBoost: {'MSE': 1.415285743125347, 'R2': 0.8192618733835089, 'MAE': 0.8956791861572084}
Evaluating model: LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 4897, nu

#### Improve results - Test different imputation techniques
Zero imputation got the best results

##### CPU

In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
rf_more_restricted_finalgrade_best_model = analyze_final_grade_with_imputation_strategies(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategies=['mean', 'median', 'zero', 'knn', 'most_frequent', 'constant'],  # Including all imputation strategies
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Testing imputation strategy: mean
Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 702.98 seconds
Best model: {'model_name': 'predict_finalgrade_RF_more_restricted_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7994a92b8be0>, 'dataset': 'predict_finalgrade_RF_more_restricted', 'test_scores': {'MSE': 1.3925497758755025, 'R2': 0.8221653549931494, 'MAE': 0.8832376288494231}}
Testing imputation strategy: median
Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Dropping rows with NaN values in the target variable.
Processing complete for datasets. Total time: 770.15 seconds
Best model: {'model_name': 'predict_final

##### L4 GPU

In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
original_finalgrade_best_model = analyze_final_grade_with_imputation_strategies(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    exclusion_type='more_restricted',
    imputation_strategies=['mean', 'median', 'zero', 'knn', 'most_frequent', 'constant'],  # Including all imputation strategies
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Testing imputation strategy: mean
Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Processing complete for datasets. Total time: 348.09 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7d3346791f90>, 'dataset': 'filtered_combined_df_FinalGrade', 'test_scores': {'MSE': 1.4264121162681838, 'R2': 0.5266408572738632, 'MAE': 0.9178229082371265}}
Testing imputation strategy: median
Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Column 'None' not found. Scaling without grouping.
Processing complete for datasets. Total time: 429.67 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7d3346790310>, 'dataset': 'filtered

In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
original_finalgrade_best_model_group = analyze_final_grade_with_imputation_strategies(
    datasets_dict=loaded_datasets,
    target_column='FinalGrade',
    group_col='Year',
    exclusion_type='more_restricted',
    imputation_strategies=['mean', 'median', 'zero', 'knn', 'most_frequent', 'constant'],  # Including all imputation strategies
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Testing imputation strategy: mean
Processing datasets for target column: FinalGrade with exclusion type: more_restricted
No specific columns to scale provided. Scaling all columns except the target.
Processing complete for datasets. Total time: 353.78 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostRegressor object at 0x7d3346793f70>, 'dataset': 'filtered_combined_df_FinalGrade', 'test_scores': {'MSE': 1.4264121162681838, 'R2': 0.5266408572738632, 'MAE': 0.9178229082371265}}
Testing imputation strategy: median
Processing datasets for target column: FinalGrade with exclusion type: more_restricted
No specific columns to scale provided. Scaling all columns except the target.
Processing complete for datasets. Total time: 414.02 seconds
Best model: {'model_name': 'filtered_combined_df_FinalGrade_CatBoost', 'params': <catboost.core.CatBoostReg

##### T4 GPU

In [None]:
# Apply the optimized analysis with imputation testing for RF with more_restricted
rf_more_restricted_finalgrade_best_model = analyze_final_grade_with_imputation_strategies(
    datasets_dict=rf_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='RF',
    exclusion_type='more_restricted',
    imputation_strategies=['mean', 'median', 'zero', 'knn', 'most_frequent', 'constant'],  # Including all imputation strategies
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Testing imputation strategy: mean
Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_RF_more_restricted
Column 'None' not found. Scaling without grouping.
Performing regression analysis with advanced ensemble techniques...
Evaluating model: XGBoost
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}
Cross-validation scores for XGBoost: -1.6345593461138677 ± 0.0680983568239383
Test set scores for XGBoost: {'MSE': 1.4279573363519793, 'R2': 0.8176436559796012, 'MAE': 0.8927671727951064}
Evaluating model: LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001867 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `

In [None]:
# Apply the optimized analysis with imputation testing for Lasso with more_restricted
lasso_more_restricted_finalgrade_best_model = analyze_final_grade_with_imputation_strategies(
    datasets_dict=lasso_finalgrade_datasets,
    target_column='FinalGrade',
    feature_set_name='Lasso',
    exclusion_type='more_restricted',
    imputation_strategies=['mean', 'median', 'zero', 'knn', 'most_frequent', 'constant'],  # Including all imputation strategies
)

Processing datasets for target column: FinalGrade with exclusion type: more_restricted
Testing imputation strategy: mean
Processing datasets for target column: FinalGrade with exclusion type: more_restricted
--------------------------------------------------------------------------------
Processing dataset: predict_finalgrade_Lasso_more_restricted
Column 'None' not found. Scaling without grouping.
Performing regression analysis with advanced ensemble techniques...
Evaluating model: XGBoost
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}
Cross-validation scores for XGBoost: -1.6344802654276926 ± 0.07531326397915611
Test set scores for XGBoost: {'MSE': 1.4363215153580373, 'R2': 0.8165755140502537, 'MAE': 0.8970098091916537}
Evaluating model: LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can s

# 5. Save and Display Results

In [None]:
folder_path = 'best_models'
zip_path = 'best_models.zip'

shutil.make_archive(base_name='best_models', format='zip', root_dir=folder_path)

files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
folder_path = 'catboost_info'
zip_path = 'catboost_info.zip'
shutil.make_archive(base_name='catboost_info', format='zip', root_dir=folder_path)

files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save the document at the end
save_document('prediction_analysis_output.docx')


Document saved as prediction_analysis_output.docx
