# Probability to Binary outputs

### Import libraries

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load CSV files

In [6]:
# Logistic Regression Models: one prediction table per CSV, read in the order listed
df_2L, df_7L, df_10L = (
    pd.read_csv(csv_path)
    for csv_path in (
        'predictions_2L.csv',
        'predictions_7L.csv',
        'predictions_10L.csv',
    )
)

# Random Forest Models: same layout of files, with the "R" suffix
df_2R, df_7R, df_10R = (
    pd.read_csv(csv_path)
    for csv_path in (
        'predictions_2R.csv',
        'predictions_7R.csv',
        'predictions_10R.csv',
    )
)

### Computation of Binary outputs

In [10]:
def _to_float_or_nan(cell):
    """Return ``float(cell)``, or NaN when the cell cannot be read as a number."""
    try:
        return float(cell)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: None or other non-scalar objects.
        return np.nan


def convert_values_to_low_high(df: pd.DataFrame, prefix1, prefix2,
                               quantile_level: float = 0.7) -> pd.DataFrame:
    """Convert values to low (0) or high (1) based on a per-column quantile.

    Every column except the first is processed independently: the
    ``quantile_level`` quantile of its numeric cells is used as threshold,
    and each numeric cell becomes 0 if it is ``<=`` the threshold, else 1.
    The first column is copied through unchanged.

    Cells that are not numeric (text, missing values) take no part in the
    threshold computation and are kept as they are in the output; a column
    without any numeric cell is left untouched.

    The result is written to ``predictions_{prefix1}B{prefix2}.csv`` in the
    current working directory (without the index) and is also returned.

    Args:
        df: Input table; it is not modified.
        prefix1: First part of the output file name (placed before ``B``).
        prefix2: Second part of the output file name (placed after ``B``).
        quantile_level: Quantile used as threshold, a fraction in [0, 1].
            It is passed straight to ``np.quantile`` (default linear
            interpolation), so any level is honoured, not only deciles.

    Returns:
        A copy of ``df`` with the processed columns replaced by 0/1 labels.

    Raises:
        ValueError: If ``quantile_level`` is outside [0, 1].
    """
    # Validate up front so that no output file is produced for a bad level.
    if not 0 <= quantile_level <= 1:
        raise ValueError(
            f"quantile_level must be between 0 and 1, got {quantile_level!r}"
        )

    # Work on a copy so the caller's DataFrame stays intact.
    df_copy = df.copy()

    # Skip the first column; every other column gets its own threshold.
    for col_name in df.columns[1:]:
        column = df[col_name]

        # Numeric view of the column, with NaN marking every unusable cell.
        numeric = column.map(_to_float_or_nan).to_numpy(dtype=float)
        is_numeric = ~np.isnan(numeric)
        if not is_numeric.any():
            # Nothing to compute a quantile from: keep the column as it is.
            continue

        threshold = np.quantile(numeric[is_numeric], quantile_level)

        # "<= threshold" -> 0 (low), "> threshold" -> 1 (high).
        with np.errstate(invalid='ignore'):
            labels = (numeric > threshold).astype(np.int64)

        if is_numeric.all():
            df_copy[col_name] = labels
        else:
            # Keep the original cell wherever no label could be computed.
            # Object dtype lets integer labels sit next to text / missing cells.
            df_copy[col_name] = np.where(
                is_numeric,
                labels.astype(object),
                column.to_numpy(dtype=object),
            )

    # Define the output file name
    output_file = f'predictions_{prefix1}B{prefix2}.csv'

    # Save the modified dataset
    df_copy.to_csv(output_file, index=False)

    return df_copy

In [11]:
# convert_values_to_low_high(df_2L, '2','L' )
# convert_values_to_low_high(df_7L, '7', 'L')
# convert_values_to_low_high(df_10L, '10','L')

# convert_values_to_low_high(df_2R, '2', 'R')
# convert_values_to_low_high(df_7R, '7','R' )
# convert_values_to_low_high(df_10R, '10', 'R')