# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from joblib import dump

def load_and_preprocess_data(filename):
    """
    Read the Excel dataset and split it into model features and target labels.

    Rows containing any missing value are discarded first. The location and
    bank identifier columns, together with the per-state 'ST<code>' columns,
    are then removed, and the 'unique' column is moved to the last position.

    Parameters:
    - filename: Path to the Excel dataset.

    Returns:
    - features: All remaining columns except the target, with 'unique' last.
    - labels: The 'ReopenedByMarch29_UR' column.
    """
    state_codes = (
        'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'IA', 'ID', 'IL',
        'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT',
        'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA',
        'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY',
    )
    identifier_columns = ['ST', 'Town', 'county', 'Bank_Name', 'State_Town', 'State_County']
    state_columns = [f'ST{code}' for code in state_codes]
    target_column = 'ReopenedByMarch29_UR'

    raw_table = pd.read_excel(filename)
    complete_rows = raw_table.dropna()
    trimmed = complete_rows.drop(columns=identifier_columns + state_columns)

    # Re-append 'unique' so that it ends up as the final column.
    unique_values = trimmed.pop('unique')
    trimmed['unique'] = unique_values

    labels = trimmed[target_column]
    features = trimmed.drop(columns=target_column)
    return features, labels

def optimize_parameters(model, param_grid, features, labels):
    """
    Search a hyperparameter grid with 5-fold cross-validation.

    Parameters:
    - model: The estimator instance to tune.
    - param_grid: Grid of hyperparameters to evaluate for the estimator.
    - features: The feature data, including the 'unique' column.
    - labels: The label data.

    Returns:
    - best_params_: The parameter combination with the best mean accuracy.
    """
    # The 'unique' column is excluded from the matrix the estimator is fitted on.
    training_matrix = features.drop(columns=['unique'])

    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use every available core
    )
    search.fit(training_matrix, labels)

    return search.best_params_

def run_model(model, best_params, num_iterations, features, labels, model_name):
    """
    Fit a model on repeated 50/50 train/test splits and collect its test predictions.

    Every iteration draws a different but reproducible split (seed ``42 + i``),
    fits a fresh model instance on the training half and predicts the test
    half. The per-iteration predictions are joined on the 'unique' identifier,
    so an identifier that was not in the test half of some iteration has no
    value for that iteration; such gaps are filled with the sentinel 8.

    Args:
        model (function): The model class to instantiate and run.
        best_params (dict): Keyword arguments passed to the model constructor.
        num_iterations (int): Number of split/fit/predict rounds to run.
        features (DataFrame): Feature matrix, including the 'unique' column.
        labels (Series): Target labels.
        model_name (str): Name of the model, used in the prediction column names.

    Returns:
        DataFrame, list: Predictions indexed by 'unique' with one
        'prediction<model_name><i>' column per iteration, and the test accuracy
        of each iteration. With no iterations, an empty DataFrame and an empty
        list are returned.
    """
    # Sentinel stored where an identifier has no prediction for an iteration.
    missing_marker = 8

    prediction_columns = [f'prediction{model_name}{i}' for i in range(num_iterations)]
    combined_results = None
    accuracies = []

    for i, prediction_column in enumerate(prediction_columns):
        # A constant seed would reproduce the very same split in every
        # iteration, making the repetitions redundant; offsetting the seed by
        # the iteration index keeps runs reproducible while varying the split.
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.5, random_state=42 + i
        )

        # Extract 'unique' column and then drop it because it is in a string format
        X_test_unique = X_test['unique']
        X_train = X_train.drop(['unique'], axis=1)
        X_test = X_test.drop(['unique'], axis=1)

        model_instance = model(**best_params)
        model_instance.fit(X_train, y_train)
        predictions = model_instance.predict(X_test)

        # Construct the result dataframe for this iteration
        result_df = pd.DataFrame({
            'unique': X_test_unique,
            prediction_column: predictions
        })

        if combined_results is None:
            combined_results = result_df
        else:
            # Outer join keeps identifiers that were tested in only some iterations.
            combined_results = pd.merge(result_df, combined_results, on="unique", how="outer")

        accuracies.append(accuracy_score(y_test, predictions))

    if combined_results is None:
        # No iterations were requested, so there is nothing to aggregate.
        return pd.DataFrame(), accuracies

    combined_results = combined_results.fillna(missing_marker).groupby('unique').sum()

    # Keep only rows whose summed values stay at or below the sentinel.
    # NOTE(review): presumably this discards identifiers that occur more than
    # once (their sums can exceed the sentinel) - confirm with the data owner.
    within_range = combined_results[prediction_columns].lt(missing_marker + 1).all(axis=1)
    combined_results = combined_results[within_range]

    return combined_results, accuracies


def main(data_path='ConcurrentExecution/RAdatafile_filtered.xlsx', num_iterations=20):
    """
    The main execution function of the script.

    Steps performed by the function:
    1. Load and preprocess the dataset from the provided Excel file.
    2. Define hyperparameter configurations for various models.
    3. For each model:
       a. Determine the best hyperparameters using grid search and cross-validation.
       b. Train the model ``num_iterations`` times using the optimal parameters.
       c. Save the aggregated results to '<model name>Results.csv' in the
          current working directory.
       d. Print the average accuracy achieved across all iterations.

    Models used and optimized:
    - RandomForestClassifier
    - Support Vector Classifier
    - KNeighborsClassifier
    - LogisticRegression

    Parameters:
    - data_path: Path to the Excel dataset. Defaults to the project data file.
    - num_iterations: Number of training/evaluation rounds per model. Defaults to 20.

    Outputs:
    - For each model, a CSV file with aggregated results across all iterations.
    - Printed average accuracy for each model.

    """
    # Data Loading
    features, labels = load_and_preprocess_data(data_path)

    # Model configuration: a tuple of (Model Class, Parameter Grid, Model Name)
    models_config = [
        (RandomForestClassifier, {
            # Maximum depth of each tree. Helps in controlling over-fitting.
            'max_depth': [2, 4, 6, 8, 10, 12, 14, 16],
            # Number of trees in the forest.
            'n_estimators': [50, 100, 150, 200],
            # Fraction of samples used for fitting the individual base learners.
            'max_samples': [0.3, 0.4, 0.5]
        }, "RF"),

        (SVC, {
            # Regularization parameter. Smaller values specify stronger regularization.
            'C': [0.1, 1, 10, 100],
            # Kernel coefficient.
            # NOTE(review): gamma has no effect with the 'linear' kernel, so
            # those candidates repeat the same model - confirm this is acceptable.
            'gamma': [1, 0.1, 0.01, 0.001],
            # Specifies the kernel type to be used in the algorithm.
            'kernel': ['linear', 'rbf']
        }, "SVM"),

        (KNeighborsClassifier, {
            # Number of neighbors to use for kneighbors queries.
            'n_neighbors': list(range(1, 31)),
            # Weight function used in prediction. Uniform means all points are weighted equally.
            'weights': ['uniform', 'distance'],
            # The distance metric used to compare samples.
            'metric': ['euclidean', 'manhattan', 'minkowski']
        }, "KNN"),

        (LogisticRegression, {
            # Inverse of regularization strength. Smaller values cause stronger regularization.
            'C': np.logspace(-3, 3, 7),
            # Used to specify the norm used in the penalization.
            # NOTE(review): not every penalty/solver pair below is supported by
            # LogisticRegression (e.g. 'l1' with 'lbfgs'); verify that failed
            # fits during the grid search are acceptable.
            'penalty': ['l1', 'l2', 'elasticnet'],
            # Algorithm to use in the optimization problem.
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
        }, "LR"),
    ]

    # Iterate over models to optimize, run and save results
    for model, param_grid, model_name in models_config:
        best_params = optimize_parameters(model(), param_grid, features, labels)
        results, accuracies = run_model(model, best_params, num_iterations, features, labels, model_name)
        results.to_csv(f'{model_name}Results.csv')
        print(f"{model_name} Accuracy: {np.mean(accuracies):.2f}")

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


