In [None]:
import joblib
# from sklearn.tree import DecisionTreeClassifier
# import sklearn
import sys
import numpy as np
import pandas as pd
# from sklearn.preprocessing import MinMaxScaler
from itertools import product

In [None]:
# Load the counterfactual table from disk (a CSV of counterfactuals, not a trained model).
CFS_PATH = 'cfs.csv'
N_CONDS_COLUMNS = 50  # how many trailing columns of the CSV are set aside in conds_df

cf_df = pd.read_csv(CFS_PATH)

# Snapshot the trailing columns before cf_df is modified by later cells.
# .copy() decouples the snapshot from cf_df, so later in-place column
# assignments on cf_df cannot leak into it (an iloc slice may be a view).
# NOTE(review): presumably these are the per-counterfactual "conditions" columns — confirm against the CSV layout.
conds_df = cf_df.iloc[:, -N_CONDS_COLUMNS:].copy()

use_rank = False # If True, rank features. If False, set weights manually.

In [None]:
# Show every column label of the loaded frame as a plain Python list
print([*cf_df.columns])

In [None]:
# Collect every column whose label mentions 'credit_score' ...
credit_score_columns = []
for column_name in cf_df.columns:
    if 'credit_score' in column_name:
        credit_score_columns.append(column_name)

# ... and cast all of them to float in one assignment
cf_df[credit_score_columns] = cf_df[credit_score_columns].astype(float)

In [None]:
# Column-name prefixes: 'orig' first, then for each leaf 1..leaf_stop the
# unrounded ("prox{n}") prefix followed by the rounded ("round_prox{n}") one.
leaf_stop = 15
prefixes = ['orig'] + [
    f"{tag}prox{leaf_no}"
    for leaf_no in range(1, leaf_stop + 1)
    for tag in ("", "round_")
]
prefixes

In [None]:
# Categorical variables that are stored as dummy (one-hot) columns
cat_vars = ['employment_type', 'education_type']

# One "<prefix>_<variable>" name per (prefix, variable) pair, prefix-major order
cf_cat_vars = [
    f"{prefix}_{cat_var}"
    for prefix in prefixes
    for cat_var in cat_vars
]

In [None]:
columns_to_drop = []

# Collapse each group of dummy columns "<var>_<category>" back into a single
# categorical column named "<var>" that holds the category label.
for var in cf_cat_vars:
    dummy_prefix = f"{var}_"

    # Match on the prefix, not on a substring: "prox1_employment_type" is a
    # substring of "round_prox1_employment_type_...", so a substring test would
    # mix the rounded dummies into the unrounded group.
    dummy_cols = [col for col in cf_df.columns if str(col).startswith(dummy_prefix)]

    # Nothing to collapse (e.g. this cell was already run once): skip so the
    # cell can be re-executed without failing on an empty selection.
    if not dummy_cols:
        continue

    # The label of the column holding the row maximum, minus the
    # "<var>_" prefix, is the category name.
    cf_df[var] = cf_df[dummy_cols].idxmax(axis=1).str[len(dummy_prefix):]

    # Add these dummy columns to the drop list
    columns_to_drop.extend(dummy_cols)

# 'Unnamed: 0' is only dropped when it is actually present.
# NOTE(review): presumably the index column written by an earlier to_csv — confirm.
if 'Unnamed: 0' in cf_df.columns:
    columns_to_drop.append('Unnamed: 0')

# Drop all dummy columns at once after the loop
cf_df = cf_df.drop(columns=columns_to_drop)

In [None]:
# Inspect the column labels that remain in cf_df at this point
cf_df.columns

In [None]:
# Preview the first five rows of cf_df
cf_df.head(5)

In [None]:
# Higher rank means more difficult to change
ft_ranking = {
    'education_type': 5,
    'employment_type': 4,
    'income': 3,
    'credit_score': 1,
    'amount_requested': 2,
}

# Alternative to ranking: set weights manually. Weights are meant to add to 1;
# the values as written add up to 1.0001, so they are rescaled below instead
# of being used verbatim.
manual_weights = {
    'income': 0.1508,
    'credit_score': 0.0497,
    'amount_requested': 0.5467,
    'employment_type': 0.2058,
    'education_type': 0.0471,
}

if use_rank:
    # Weights proportional to each feature's rank
    print("Using ranking")
    raw_weights = ft_ranking
else:
    raw_weights = manual_weights

# Rescale by the actual total so the final weights always sum to 1, whatever
# the number of features or the rank values (no hard-coded 1/15).
weight_total = sum(raw_weights.values())
if weight_total <= 0:
    raise ValueError("Feature weights must have a positive sum")

weights = {ft: value / weight_total for ft, value in raw_weights.items()}

In [None]:
# Row accumulators, one independent list per hypothesis
all_rows_hypothesis_1, all_rows_hypothesis_2 = [], []

# Helper function to check if a value is numeric
def is_numeric(value):
    """Return True if ``value`` can be converted with ``float()``, else False.

    Strings that do not parse as a number (e.g. "na") give False. Values whose
    type ``float()`` rejects outright (e.g. ``None`` or a list) also give
    False instead of raising ``TypeError``.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True

In [None]:
# # Step 1: Define continuous columns to normalize and initialize scaler
# continuous_columns = [
#     'orig_income', 'orig_credit_score', 'orig_amount_requested',
#     'prox1_income', 'prox1_credit_score', 'prox1_amount_requested',
#     'prox2_income', 'prox2_credit_score', 'prox2_amount_requested',
#     'prox3_income', 'prox3_credit_score', 'prox3_amount_requested',
#     'prox4_income', 'prox4_credit_score', 'prox4_amount_requested',
#     'round_prox1_income', 'round_prox1_credit_score', 'round_prox1_amount_requested',
#     'round_prox2_income', 'round_prox2_credit_score', 'round_prox2_amount_requested',
#     'round_prox3_income', 'round_prox3_credit_score', 'round_prox3_amount_requested',
#     'round_prox4_income', 'round_prox4_credit_score', 'round_prox4_amount_requested'
# ]
# scaler = MinMaxScaler()

# # Step 2: Fit and transform on all rows
# normalized_df = cf_df.copy()  # Make a copy to store normalized values
# normalized_df[continuous_columns] = scaler.fit_transform(cf_df[continuous_columns].astype(float))

# # Step 3: Restore "na" values in the normalized copy
# normalized_df[continuous_columns] = normalized_df[continuous_columns].fillna(np.nan)

In [None]:
# Step 1: Build one list of column names per continuous feature
# (every prefix contributes one column to each list)
income_columns, credit_score_columns, amount_requested_columns = (
    [f"{prefix}_{field}" for prefix in prefixes]
    for field in ('income', 'credit_score', 'amount_requested')
)

# Step 2: Min-max scale each feature group with a single min/max shared by
# all columns of that group; the results go into a copy of cf_df
normalized_df = cf_df.copy()

for group_columns in (income_columns, credit_score_columns, amount_requested_columns):
    group_values = cf_df[group_columns]

    # Smallest and largest value over every column of the group
    global_min = group_values.min().min()
    global_max = group_values.max().max()

    # Scale the whole group in one vectorised expression
    normalized_df[group_columns] = (
        (normalized_df[group_columns] - global_min) / (global_max - global_min)
    )

# Step 3: Look at the result
normalized_df.head(5)

In [None]:
# Spot-check: display the 'orig_credit_score' column of the normalized frame
normalized_df['orig_credit_score']

In [None]:
# Spot-check: display the 'prox1_income' column of the normalized frame
normalized_df['prox1_income']

In [None]:
# Function to calculate proximities with normalized values for calculation only
def calculate_proximities(orig_row, prox_row, normalized_row, feature_weights=None):
    """Compute distance metrics between an original row and one counterfactual.

    Parameters
    ----------
    orig_row : mapping
        Un-normalized row; read through the keys ``'orig_<feature>'``.
    prox_row : mapping
        The counterfactual; must hold ``'source'`` (the column prefix of the
        counterfactual, e.g. ``'prox1'``) and one entry per feature name.
    normalized_row : mapping
        Normalized values; read through ``'orig_<feature>'`` and
        ``'<source>_<feature>'`` for the continuous features.
    feature_weights : dict, optional
        Maps feature name to weight. Defaults to the module-level ``weights``.

    Returns
    -------
    tuple
        ``(true_proximity, weighted_proximity, sparsity)``.
        ``true_proximity`` weighs every feature equally (1 / number of
        features), ``weighted_proximity`` uses ``feature_weights`` and
        ``sparsity`` counts the features that differ. All three are
        ``np.nan`` when a normalized continuous value of the counterfactual
        is NaN.
    """
    continuous_features = ('income', 'credit_score', 'amount_requested')
    categorical_features = ('employment_type', 'education_type')

    if feature_weights is None:
        feature_weights = weights  # fall back to the notebook-level weights

    # Non-biased weighing: the same share for every feature. This equals the
    # former hard-coded 0.2 when there are five features.
    true_weight = 1 / len(feature_weights) if feature_weights else 0.0

    true_proximity = 0
    weighted_proximity = 0
    sparsity = 0

    for feature, weight in feature_weights.items():
        if feature in continuous_features:
            orig_value = normalized_row[f'orig_{feature}']
            prox_value = normalized_row[f"{prox_row['source']}_{feature}"]

            # A missing counterfactual value invalidates all three metrics
            if np.isnan(prox_value):
                return np.nan, np.nan, np.nan

            # Distance is measured on normalized values ...
            diff = abs(orig_value - prox_value)
            # ... while "changed or not" is decided on the raw values,
            # compared at 3 decimals.
            if round(orig_row[f'orig_{feature}'], 3) != round(prox_row[feature], 3):
                sparsity += 1
            true_proximity += diff * true_weight
            weighted_proximity += diff * weight

        elif feature in categorical_features:
            # A categorical mismatch counts as a distance of 1
            if orig_row[f'orig_{feature}'] != prox_row[feature]:
                true_proximity += 1 * true_weight
                weighted_proximity += 1 * weight
                sparsity += 1

    return true_proximity, weighted_proximity, sparsity


In [None]:
# Re-attach the columns saved in conds_df, but only those that cf_df no longer
# has. Appending columns that are still present would create duplicate column
# labels (row['col'] then yields a Series instead of a scalar), and would add
# yet another copy each time this cell is re-run.
missing_cond_cols = [col for col in conds_df.columns if col not in cf_df.columns]
cf_df = pd.concat([cf_df, conds_df[missing_cond_cols]], axis=1)

cf_df.columns

In [None]:
# Build the long-format output rows. For every input row:
#   * hypothesis 1 gets the original row followed by the unrounded
#     counterfactuals (prox1 .. prox{leaf_stop});
#   * hypothesis 2 gets the original row followed by each unrounded
#     counterfactual and its rounded variant (prox{i}, round_prox{i});
#   * both get a '---' separator row at the end.
# Counterfactuals whose proximity is NaN (missing values) are left out.

# Start from empty lists so that re-running this cell does not append the
# same rows a second time.
all_rows_hypothesis_1 = []
all_rows_hypothesis_2 = []


def _counterfactual_record(row, normalized_row, label, missing_category=None):
    """Build the output dict for the counterfactual stored under ``label``.

    ``label`` is the column prefix (e.g. ``'prox3'`` or ``'round_prox3'``).
    ``missing_category`` is the value used when a categorical column for that
    prefix is absent from ``row``. Returns ``None`` when
    ``calculate_proximities`` reports NaN, i.e. the counterfactual is missing.
    """
    record = {
        'source': label,
        'income': row[f'{label}_income'],
        'credit_score': row[f'{label}_credit_score'],
        'amount_requested': row[f'{label}_amount_requested'],
        'employment_type': row.get(f'{label}_employment_type', missing_category),
        'education_type': row.get(f'{label}_education_type', missing_category),
    }

    true_proximity, weighted_proximity, sparsity = calculate_proximities(row, record, normalized_row)
    if np.isnan(true_proximity):
        return None

    record['true_proximity'] = true_proximity
    record['weighted_proximity'] = weighted_proximity
    record['sparsity'] = sparsity
    return record


# Iterate over a view with unique column labels (first occurrence wins) so
# that row['<column>'] is always a scalar, even if cf_df carries duplicates.
rows_df = cf_df.loc[:, ~cf_df.columns.duplicated()]

for index, row in rows_df.iterrows():
    normalized_row = normalized_df.loc[index]

    # Original (un-normalized) values; the metric fields stay empty for it
    original_data = {
        'source': 'original',
        'income': row['orig_income'],
        'credit_score': row['orig_credit_score'],
        'amount_requested': row['orig_amount_requested'],
        'employment_type': row['orig_employment_type'],
        'education_type': row['orig_education_type'],
        'true_proximity': None,
        'weighted_proximity': None,
        'sparsity': None,
        'conditions': None
    }
    all_rows_hypothesis_1.append(original_data)
    all_rows_hypothesis_2.append(original_data)

    # leaf_stop is inclusive here, matching how the column prefixes were
    # generated (prox1 .. prox{leaf_stop}); range(1, leaf_stop) would silently
    # skip the last counterfactual.
    for i in range(1, leaf_stop + 1):
        # Unrounded counterfactual: belongs to both hypotheses. It is computed
        # once because the metrics are the same for both outputs.
        unrounded_data = _counterfactual_record(row, normalized_row, f"prox{i}")
        if unrounded_data is not None:
            all_rows_hypothesis_1.append(unrounded_data)
            all_rows_hypothesis_2.append(unrounded_data)

        # Rounded counterfactual: hypothesis 2 only
        rounded_data = _counterfactual_record(
            row, normalized_row, f"round_prox{i}", missing_category="na"
        )
        if rounded_data is not None:
            all_rows_hypothesis_2.append(rounded_data)

    # Add separator row for readability in both hypotheses
    separator_row = {'source': '---', 'income': '---', 'credit_score': '---', 'amount_requested': '---',
                     'employment_type': '---', 'education_type': '---', 'true_proximity': '---',
                     'weighted_proximity': '---', 'sparsity': '---', 'conditions': '---'}
    all_rows_hypothesis_1.append(separator_row)
    all_rows_hypothesis_2.append(separator_row)


In [None]:
# Turn each list of row dicts into a DataFrame
hypothesis_1_df, hypothesis_2_df = (
    pd.DataFrame(rows) for rows in (all_rows_hypothesis_1, all_rows_hypothesis_2)
)

# Write each frame to its own CSV file, without the index column
for frame, out_name in (
    (hypothesis_1_df, 'cfs_formatted.csv'),
    (hypothesis_2_df, 'round_cfs_formatted.csv'),
):
    frame.to_csv(out_name, index=False)