## Training and Evaluation

### Import libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Load Data

In [4]:
# Read compas-scores-two-years.csv (expected in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="compas-scores-two-years.csv")

### Data cleaning

In [5]:
# Keep only the rows that pass every filter below:
#   * days_b_screening_arrest lies in [-30, 30] (NaN fails both comparisons),
#   * is_recid is not -1,
#   * c_charge_degree is not 'O',
#   * score_text is present.
# pd.read_csv converts the literal string 'N/A' to NaN by default, and
# NaN != 'N/A' evaluates to True, so the string comparison on its own would
# let rows with a missing score_text through; notna() closes that gap.
# The string comparison is kept for frames loaded with keep_default_na=False.
df = df[(df["days_b_screening_arrest"] <= 30)
        & (df["days_b_screening_arrest"] >= -30)
        & (df["is_recid"] != -1)
        & (df["c_charge_degree"] != 'O')
        & df["score_text"].notna()
        & (df["score_text"] != 'N/A')].reset_index(drop=True)

### Feature selection

In [12]:
# Feature set 1: two columns.
two_features = df.loc[:, ["age", "priors_count"]]

# Feature set 2: seven columns (categorical ones are still un-encoded here).
seven_features = df.loc[:, [
    "sex",
    "age",
    "c_charge_desc",
    "c_charge_degree",
    "juv_misd_count",
    "juv_fel_count",
    "priors_count",
]]

# Feature set 3 is defined by exclusion: every column named in the groups
# below is removed from df, and whatever remains is the "ten features" set.
unclear_features = [
    'start', 'end', 'event', 'two_year_recid', 'violent_recid',
    'in_custody', 'out_custody',
    'vr_charge_degree', 'vr_charge_desc', 'r_charge_degree', 'r_charge_desc',
]
duplicate_features = [
    'decile_score.1', 'priors_count.1', 'screening_date', 'v_screening_date',
]
compas_output_scores = [
    'decile_score', 'v_decile_score', 'score_text', 'v_score_text',
]
target_related_features = ['is_violent_recid']
temporal_attributes = [
    'compas_screening_date', 'days_b_screening_arrest',
    'c_jail_in', 'c_jail_out', 'c_case_number',
    'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
    'r_days_from_arrest', 'r_offense_date',
    'r_jail_in', 'r_jail_out', 'vr_offense_date',
]
identifying_features = ['id', 'name', 'first', 'last', 'dob', 'vr_case_number']
unclear_additional_features = [
    'type_of_assessment', 'v_type_of_assessment', 'r_case_number',
]
target_variable = ['is_recid']

# Flatten the groups into the single list of columns to remove.
columns_to_drop = [
    *unclear_features,
    *duplicate_features,
    *compas_output_scores,
    *target_related_features,
    *temporal_attributes,
    *identifying_features,
    *unclear_additional_features,
    *target_variable,
]

# Ten features: everything in df that was not listed above.
ten_features = df.drop(columns=columns_to_drop)


### One-hot encoding

In [13]:
# One-hot encode the categorical columns of the seven-feature set.
seven_features = pd.get_dummies(seven_features, columns=["sex", "c_charge_desc", "c_charge_degree"])

# One-hot encode the categorical columns of the ten-feature set.
# Each column must be listed exactly once: a repeated name (previously "race")
# makes get_dummies emit a second, identical set of indicator columns for it,
# which duplicates those features in the model input.
ten_features = pd.get_dummies(ten_features, columns=["race", "sex", "age_cat", "c_charge_desc", "c_charge_degree"])

### Train and evaluate

In [14]:
# Experiment settings; these are the values handed to train_and_evaluate
# under the parameter names of the same spelling.
max_iter = 1_000     # iteration cap given to LogisticRegression
num_iterations = 1   # how many random train/test splits to run
test_size = 0.2      # fraction of rows held out for testing in each split

def train_and_evaluate(features, target, id_column, num_iterations, test_size, max_iter, prefix, model_type='logistic'):
    """Train a classifier on repeated random splits and record its test-set predictions.

    For each iteration ``i`` in ``range(num_iterations)`` the data is split
    with ``train_test_split(..., random_state=i)``, a fresh model is fitted on
    the training part, and the predicted probability of class 1 is stored
    (rounded to 2 decimals) for every ID that landed in the test part.

    Side effects:
        * prints the raw probability array of every iteration,
        * writes ``predictions_{prefix}.csv`` with an ``ID`` column followed by
          one column ``{prefix}M{i}`` per iteration; IDs that were not in that
          iteration's test split hold ``'-'``,
        * appends the mean and standard deviation (``np.std``) of the
          per-iteration accuracies to ``performance_metrics.txt`` (the file is
          always appended to, so repeated runs add repeated lines),
        * prints the same two summary lines.

    Args:
        features: Feature table passed to ``train_test_split`` and the model.
        target: Labels aligned with ``features``.
        id_column: pandas Series of row identifiers; it is looked up with the
            index labels of the test split, so it must share its index with
            ``features``.
        num_iterations: Number of random splits / models.
        test_size: Forwarded to ``train_test_split``.
        max_iter: Forwarded to ``LogisticRegression``; unused for the random
            forest.
        prefix: Tag used in the prediction column names, the CSV file name and
            the metric lines.
        model_type: ``'logistic'`` or ``'random_forest'``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    # Fail fast: reject an unsupported model before any work is done.
    if model_type not in ['logistic', 'random_forest']:
        raise ValueError("Unsupported model type. Choose 'logistic' or 'random_forest'.")

    performance_metrics = []

    # Collect every output column first and build the DataFrame once at the
    # end; inserting columns one by one fragments the frame when
    # num_iterations is large.
    ids = pd.Series(id_column.tolist(), name='ID')
    prediction_columns = {'ID': ids}

    # One random train/test split (seeded by the iteration number) per model.
    for i in range(num_iterations):
        X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size, random_state=i)

        if model_type == 'logistic':
            model = LogisticRegression(max_iter=max_iter)
        else:  # 'random_forest' (validated above)
            model = RandomForestClassifier(n_estimators=100, random_state=i)

        model.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of class 1.
        probabilities = model.predict_proba(X_test)[:, 1]
        print(f"Probabilities: {i} + {probabilities}")
        y_pred = model.predict(X_test)
        performance_metrics.append(accuracy_score(y_test, y_pred))

        # Map each test-row ID to its probability (2 decimals); IDs outside
        # this iteration's test split have no entry and become '-'.
        results = dict(zip(id_column.loc[X_test.index], probabilities.round(2)))
        prediction_columns[f"{prefix}M{i}"] = ids.map(results).fillna('-')

    # Save the per-ID predictions of all iterations to a CSV file.
    new_df = pd.DataFrame(prediction_columns)
    new_df.to_csv(f"predictions_{prefix}.csv", index=False)

    # Summarise the accuracies across iterations.
    mean_accuracy = np.mean(performance_metrics)
    std_accuracy = np.std(performance_metrics)

    # Append the summary to the shared metrics file.
    with open("performance_metrics.txt", "a") as f:
        f.write(f"Mean Accuracy ({prefix}): {mean_accuracy:.4f}\n")
        f.write(f"Standard Deviation of Accuracy ({prefix}): {std_accuracy:.4f}\n")

    print(f"Mean Accuracy ({prefix}): {mean_accuracy:.4f}")
    print(f"Standard Deviation of Accuracy ({prefix}): {std_accuracy:.4f}")


In [20]:
#train_and_evaluate(two_features, df['is_recid'], df['id'], num_iterations, test_size, max_iter, "2L", model_type='logistic')
#train_and_evaluate(seven_features, df['is_recid'], df['id'], num_iterations, test_size, max_iter, "7L", model_type='logistic')
#train_and_evaluate(ten_features, df['is_recid'], df['id'], num_iterations, test_size, max_iter, "10L", model_type='logistic')

# Train random forest model with two, seven, and ten features for all, black, and white
#train_and_evaluate(two_features, df['is_recid'], df['id'], num_iterations, test_size, max_iter, "2R", model_type='random_forest')
#train_and_evaluate(seven_features, df['is_recid'], df['id'], num_iterations, test_size, max_iter, "7R", model_type='random_forest')
#train_and_evaluate(ten_features, df['is_recid'], df['id'], num_iterations, test_size, max_iter, "10R", model_type='random_forest')
