In [20]:
# Standard libraries
import numpy as np
import pandas as pd

# Scikit-learn utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Scikit-learn models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


In [5]:
def preprocess_data(filename):
    """
    Load the Excel data file, clean it, and publish the modelling inputs as globals.

    Rows containing any missing value are removed, two groups of columns are
    dropped, the 'unique' column is moved to the last position, and the
    feature frame and target series are stored in the module-level names
    `X` and `Y`.

    Parameters:
    - filename (str): Path to the Excel file passed to `pd.read_excel`.

    Globals:
    - X (DataFrame): The feature columns listed below, with 'unique' as the last column.
    - Y (Series): The 'ReopenedByMarch29_UR' column.

    Returns:
    None
    """
    global X, Y

    # Columns named 'ST' + two letters (presumably per-state indicators -- TODO confirm).
    st_prefixed_cols = ['STAL', 'STAR', 'STAZ', 'STCA', 'STCO', 'STCT',
       'STDE', 'STFL', 'STGA', 'STIA', 'STID', 'STIL', 'STIN', 'STKS', 'STKY',
       'STLA', 'STMA', 'STMD', 'STME', 'STMI', 'STMN', 'STMO', 'STMS', 'STMT',
       'STNC', 'STND', 'STNE', 'STNH', 'STNJ', 'STNM', 'STNV', 'STNY', 'STOH',
       'STOK', 'STOR', 'STPA', 'STRI', 'STSC', 'STSD', 'STTN', 'STTX', 'STUT',
       'STVA', 'STVT', 'STWA', 'STWI', 'STWV', 'STWY']
    # Name/location columns that are not used as model inputs.
    label_cols = ["ST", "Town", "county", "Bank_Name", "State_Town", "State_County"]
    # Model inputs; 'unique' is carried along so predictions can be matched back to rows.
    feature_cols = ['lntotalassets', 'LoanAssets', 'CashDeposits_w',
       'EquityAssets', 'lnbankage', 'bank_National', 'bank_StateMem',
       'bankwithbranches', 'CRC', 'RC', 'CH', 'lntotpop', 'fracurbpop',
       'fracilliteratepop', 'fracnonwhite', 'lntotal_banks_county',
       'farmspercap', 'mfgestabpercap100', 'lnretailsales1929',
       'pfarmswithmortgagedebt', 'fracpresdemvote1932', 'fraccongdemvote1930',
       'n_1_bankt', 'ln_rmdr_avg_assetsperbk_town',
       'rmdr_avg_loanassets_by_town', 'rmdr_avg_cashdeposits_by_town_w',
       'rmdr_avg_equityassets_by_town', 'rmr_fracnat_town',
       'rmr_fracstatemem_town','unique']

    # Read, discard incomplete rows, then strip the unused columns in two steps.
    cleaned = pd.read_excel(filename).dropna()
    cleaned = cleaned.drop(columns=st_prefixed_cols).drop(columns=label_cols)

    # Popping and re-assigning moves 'unique' to the end of the frame.
    unique_values = cleaned.pop('unique')
    cleaned['unique'] = unique_values

    X = cleaned[feature_cols]
    Y = cleaned['ReopenedByMarch29_UR']


## Model 1: Random Forest

Random Forest is an ensemble learning method that works by constructing multiple decision trees during training. The predictions of individual trees are then combined (either by averaging or by majority voting) to produce a single output. It is known for its flexibility, ability to model complex interactions, and mitigation of overfitting through the introduction of randomness in the model-building process.


In [10]:
def optimize_parameters_rf():
    """
    Grid-search RandomForestClassifier hyperparameters on the full dataset.

    Reads the module-level `X` (features) and `Y` (target), removes the
    'unique' column from the features, and runs a 5-fold cross-validated
    GridSearchCV scored on accuracy using all available cores (n_jobs=-1).

    Parameters:
    None

    Globals:
    - X (DataFrame): Features dataframe; must contain a 'unique' column.
    - Y (Series): Target variable series.

    Returns:
    dict: Best combination of 'max_depth', 'n_estimators' and 'max_samples'
    found during the grid search.
    """
    search_space = {
       'max_depth': [8, 10, 12, 14, 16],
       'n_estimators': [50, 100, 150, 200],
       'max_samples': [0.3, 0.4, 0.5]
    }

    # 'unique' is only carried for matching rows later; it is not a model input.
    model_inputs = X.drop(columns=['unique'])

    searcher = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    searcher.fit(model_inputs, Y)
    return searcher.best_params_

def run_model_rf(best_params, iter_num):
    """
    Fit a RandomForestClassifier on one random half of the data and predict the other half.

    The module-level `X` and `Y` are split 50/50 with `train_test_split`
    (no seed is passed here). The 'unique' column is excluded from the
    model inputs but kept alongside the predictions so each prediction can
    be matched back to its row.

    Parameters:
    - best_params (dict): Keyword arguments forwarded to RandomForestClassifier.
    - iter_num (int): Iteration number, used to name the prediction column.

    Globals:
    - X (DataFrame): Features dataframe; must contain a 'unique' column.
    - Y (Series): Target variable series.

    Returns:
    list: A list containing:
    - [0]: DataFrame with the 'unique' column and a 'predictionRF{iter_num}' column
           for the test half, indexed 0..n-1.
    - [1]: Accuracy score of the model on the test half.
    """
    prediction_col = f'predictionRF{iter_num}'

    train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.5)

    forest = RandomForestClassifier(**best_params)
    forest.fit(train_X.drop(columns=['unique']).to_numpy(), train_y.to_numpy())
    predicted = forest.predict(test_X.drop(columns=['unique']).to_numpy())

    # Predictions come back in test-row order, so they line up positionally
    # with the re-indexed 'unique' values.
    outcome = test_X[['unique']].reset_index(drop=True)
    outcome[prediction_col] = predicted

    return [outcome, accuracy_score(test_y.to_numpy(), predicted)]

def loop_model_rf(num_iterations):
    """
    Tune a RandomForestClassifier once, then train/evaluate it repeatedly.

    Hyperparameters are obtained from `optimize_parameters_rf`, after which
    `run_model_rf` is called `num_iterations` times. Each run's predictions
    are outer-merged on 'unique', so a 'unique' value that was not in a given
    run's test half has NaN in that run's column.

    Parameters:
    - num_iterations (int): The number of times the model should be run.

    Globals:
    - combined_results (DataFrame): Overwritten with the merged predictions.
    - accuracies (list): Overwritten with the accuracy of each run.

    Returns:
    list: A list containing:
    - [0]: DataFrame with 'unique' and one 'predictionRF{i}' column per iteration.
    - [1]: A list of accuracy scores, one per iteration.
    """
    global combined_results, accuracies

    tuned_params = optimize_parameters_rf()
    combined_results = pd.DataFrame()
    accuracies = []

    for run_idx in range(num_iterations):
        run_frame, run_accuracy = run_model_rf(tuned_params, run_idx)
        # The first run seeds the table; later runs are joined onto it.
        combined_results = (
            run_frame
            if run_idx == 0
            else pd.merge(run_frame, combined_results, on="unique", how="outer")
        )
        accuracies.append(run_accuracy)

    return [combined_results, accuracies]

def save_results_rf(num_iterations=20, output_filename='RandomForestResults.csv'):
    """
    Load the data, run the random forest pipeline repeatedly, and write the
    aggregated predictions to a CSV file.

    Missing predictions (a 'unique' value absent from a run's test half after
    the outer merge) are replaced with 8, rows are summed per 'unique' value,
    and only rows whose every prediction column is below 9 are kept.

    Parameters:
    - num_iterations (int): Number of iterations to run the random forest model.
    - output_filename (str): Path for the output CSV file.

    Returns:
    - DataFrame: The filtered aggregated results, indexed by 'unique'.
    """
    preprocess_data("ConcurrentExecution/RAdatafile.xlsx")

    merged_predictions = loop_model_rf(num_iterations)[0]

    # 8 stands in for "no prediction in this run"; summing collapses repeated
    # 'unique' values into one row each.
    per_unique = merged_predictions.fillna(8).groupby('unique').sum()

    run_columns = [f'predictionRF{run}' for run in range(num_iterations)]
    all_below_nine = (per_unique[run_columns] < 9).all(axis=1)
    kept = per_unique[all_below_nine]

    kept.to_csv(output_filename)
    return kept



## Model 2: Support Vector Machine (SVM)

Support Vector Machines are supervised learning algorithms that aim to find the optimal hyperplane that best separates the dataset into classes. Suitable for both regression and classification tasks, SVMs are particularly well-suited for classification of complex but small- or medium-sized datasets. They work by maximizing the margin between the decision boundary and the closest data points from each class, known as support vectors. The capability to use kernel functions also allows SVMs to solve non-linear problems.


In [None]:
def optimize_parameters_svm():
    """
    Optimize the hyperparameters of a Support Vector Machine (SVM) using GridSearchCV.

    This function performs a 5-fold, accuracy-scored grid search for an SVM
    classifier. The dataset used for this optimization is assumed to be
    global and named as `X` for the features and `Y` for the target variable.
    The function drops the 'unique' column from the feature set before optimization.

    The search space is split per kernel: `gamma` is only searched for the
    RBF kernel because the linear kernel does not use it. Crossing `gamma`
    with the linear kernel would refit every linear candidate once per gamma
    value without changing its score.

    Parameters:
    None

    Globals:
    - X (DataFrame): Features dataframe. Expected to have a column named 'unique' which will be dropped.
    - Y (Series): Target variable series.

    Returns:
    dict: Best parameters found during the grid search. Always contains 'C'
    and 'kernel'; contains 'gamma' only when the RBF kernel wins. Either form
    can be passed directly as `SVC(**best_params)`.
    """
    X_opt = X.drop(['unique'], axis=1)

    c_values = [0.1, 1, 10, 100]
    param_grid = [
        # Linear kernel: only the regularization strength matters.
        {'kernel': ['linear'], 'C': c_values},
        # RBF kernel: search regularization strength and kernel width together.
        {'kernel': ['rbf'], 'C': c_values, 'gamma': [1, 0.1, 0.01, 0.001]},
    ]

    svm_model = SVC()
    grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_opt, Y)
    return grid_search.best_params_

def run_model_svm(best_params, iter_num):
    """
    Fit an SVC on one random half of the data and predict the other half.

    The module-level `X` and `Y` are split 50/50 with `train_test_split`
    (no seed is passed here). The 'unique' column is excluded from the
    model inputs but kept alongside the predictions so each prediction can
    be matched back to its row.

    Parameters:
    - best_params (dict): Keyword arguments forwarded to SVC.
    - iter_num (int): Iteration number, used to name the prediction column.

    Globals:
    - X (DataFrame): Features dataframe; must contain a 'unique' column.
    - Y (Series): Target variable series.

    Returns:
    list: A list containing:
    - [0]: DataFrame with the 'unique' column and a 'predictionSVM{iter_num}' column
           for the test half, indexed 0..n-1.
    - [1]: Accuracy score of the model on the test half.
    """
    prediction_col = f'predictionSVM{iter_num}'

    train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.5)

    classifier = SVC(**best_params)
    classifier.fit(train_X.drop(columns=['unique']).to_numpy(), train_y.to_numpy())
    predicted = classifier.predict(test_X.drop(columns=['unique']).to_numpy())

    # Predictions come back in test-row order, so they line up positionally
    # with the re-indexed 'unique' values.
    outcome = test_X[['unique']].reset_index(drop=True)
    outcome[prediction_col] = predicted

    return [outcome, accuracy_score(test_y.to_numpy(), predicted)]

def loop_model_svm(num_iterations):
    """
    Run the SVM model for a specified number of iterations using optimized parameters.

    This function first optimizes the hyperparameters of an SVM classifier
    using the `optimize_parameters_svm` function. Then, it runs the `run_model_svm`
    function for a given number of iterations. The predictions from each
    iteration are outer-merged on 'unique' into a single DataFrame (a 'unique'
    value missing from a run's test half gets NaN in that run's column), and
    the accuracies from each run are stored in a list. The iteration index is
    printed before each run as a progress indicator.

    Parameters:
    - num_iterations (int): The number of times the SVM model should be run.

    Returns:
    list: A list containing:
    - [0]: DataFrame with 'unique' column and the prediction columns named 'predictionSVM{iter_num}' for each iteration.
    - [1]: A list of accuracy scores of the model for each iteration.

    Note:
    This function relies on the existence and proper functioning of both the
    `optimize_parameters_svm` and `run_model_svm` functions.
    """
    best_params = optimize_parameters_svm()
    combined_results = pd.DataFrame()
    accuracies = []

    for i in range(num_iterations):
        print(i)
        # Function names are case-sensitive: the runner is defined as
        # `run_model_svm`, so the previous `run_model_SVM` raised NameError.
        results, accuracy = run_model_svm(best_params, i)
        if i == 0:
            combined_results = results
        else:
            combined_results = pd.merge(results, combined_results, on="unique", how="outer")
        accuracies.append(accuracy)

    return [combined_results, accuracies]

def save_results_svm(num_iterations=20, output_filename='SVMResults.csv'):
    """
    Load the data, run the SVM pipeline repeatedly, and write the aggregated
    predictions to a CSV file.

    Missing predictions (a 'unique' value absent from a run's test half after
    the outer merge) are replaced with 8, rows are summed per 'unique' value,
    and only rows whose every prediction column is below 9 are kept.

    Parameters:
    - num_iterations (int): Number of iterations to run the SVM model.
    - output_filename (str): Path for the output CSV file.

    Returns:
    - DataFrame: The filtered aggregated results from the SVM model runs,
      indexed by 'unique'.
    """
    preprocess_data("ConcurrentExecution/RAdatafile.xlsx")

    merged_predictions = loop_model_svm(num_iterations)[0]

    # 8 stands in for "no prediction in this run"; summing collapses repeated
    # 'unique' values into one row each.
    per_unique = merged_predictions.fillna(8).groupby('unique').sum()

    run_columns = [f'predictionSVM{run}' for run in range(num_iterations)]
    all_below_nine = (per_unique[run_columns] < 9).all(axis=1)
    kept = per_unique[all_below_nine]

    kept.to_csv(output_filename)
    return kept




## Model 3: K-Nearest Neighbor (KNN)

The K-Nearest Neighbor (KNN) algorithm is a type of instance-based learning where the function is approximated locally and all computation is deferred until classification. It is a non-parametric, lazy learning algorithm. When queried for a classification, KNN does not use any trained model but rather performs the classification based on the majority class among the K most similar instances from the training dataset. The 'distance' between instances defines the similarity. Commonly used distance metrics include Euclidean, Manhattan, and Minkowski distances. The choice of K and the distance metric are critical decisions in the application of this algorithm.


In [None]:
def optimize_parameters_knn():
    """
    Grid-search KNeighborsClassifier hyperparameters on the full dataset.

    Reads the module-level `X` (features) and `Y` (target), removes the
    'unique' column from the features, and runs a 5-fold cross-validated
    GridSearchCV scored on accuracy using all available cores (n_jobs=-1).

    Parameters:
    None

    Globals:
    - X (DataFrame): Features dataframe; must contain a 'unique' column.
    - Y (Series): Target variable series.

    Returns:
    dict: Best combination of 'n_neighbors', 'weights' and 'metric' found
    during the grid search.
    """
    # NOTE(review): with KNeighborsClassifier's default p, 'minkowski' looks
    # equivalent to 'euclidean', which would make a third of this grid
    # redundant -- confirm against the scikit-learn docs before trimming.
    search_space = {
        'n_neighbors': list(range(1, 31)),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    }

    # 'unique' is only carried for matching rows later; it is not a model input.
    model_inputs = X.drop(columns=['unique'])

    searcher = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    searcher.fit(model_inputs, Y)
    return searcher.best_params_

def run_model_knn(best_params, iter_num):
    """
    Fit a KNeighborsClassifier on one random half of the data and predict the other half.

    The module-level `X` and `Y` are split 50/50 with `train_test_split`
    (no seed is passed here). The 'unique' column is excluded from the
    model inputs but kept alongside the predictions so each prediction can
    be matched back to its row.

    Parameters:
    - best_params (dict): Keyword arguments forwarded to KNeighborsClassifier.
    - iter_num (int): Iteration number, used to name the prediction column.

    Globals:
    - X (DataFrame): Features dataframe; must contain a 'unique' column.
    - Y (Series): Target variable series.

    Returns:
    list: A list containing:
    - [0]: DataFrame with the 'unique' column and a 'predictionKNN{iter_num}' column
           for the test half, indexed 0..n-1.
    - [1]: Accuracy score of the classifier on the test half.
    """
    prediction_col = f'predictionKNN{iter_num}'

    train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.5)

    classifier = KNeighborsClassifier(**best_params)
    classifier.fit(train_X.drop(columns=['unique']).to_numpy(), train_y.to_numpy())
    predicted = classifier.predict(test_X.drop(columns=['unique']).to_numpy())

    # Predictions come back in test-row order, so they line up positionally
    # with the re-indexed 'unique' values.
    outcome = test_X[['unique']].reset_index(drop=True)
    outcome[prediction_col] = predicted

    return [outcome, accuracy_score(test_y.to_numpy(), predicted)]

def loop_model_knn(num_iterations):
    """
    Tune a KNN classifier once, then train/evaluate it repeatedly.

    Hyperparameters are obtained from `optimize_parameters_knn`, after which
    `run_model_knn` is called `num_iterations` times. Each run's predictions
    are outer-merged on 'unique', so a 'unique' value that was not in a given
    run's test half has NaN in that run's column. The iteration index is
    printed before each run.

    Parameters:
    - num_iterations (int): The number of times the KNN classifier should be run.

    Returns:
    list: A list containing:
    - [0]: DataFrame with 'unique' and one 'predictionKNN{i}' column per iteration.
    - [1]: A list of accuracy scores, one per iteration.
    """
    tuned_params = optimize_parameters_knn()
    merged = pd.DataFrame()
    scores = []

    for run_idx in range(num_iterations):
        print(run_idx)
        run_frame, run_accuracy = run_model_knn(tuned_params, run_idx)
        # The first run seeds the table; later runs are joined onto it.
        merged = (
            run_frame
            if run_idx == 0
            else pd.merge(run_frame, merged, on="unique", how="outer")
        )
        scores.append(run_accuracy)

    return [merged, scores]

def save_results_knn(num_iterations=20, output_filename='KNNResults.csv'):
    """
    Execute the KNN classifier multiple times, aggregate the results,
    and save them to a CSV file.

    If the global `X` / `Y` have not been created yet, the data file is loaded
    with `preprocess_data` first, so the function can be called on its own.
    When `X` and `Y` already exist they are reused as-is.

    Missing predictions (a 'unique' value absent from a run's test half after
    the outer merge) are replaced with 8, rows are summed per 'unique' value,
    and only rows whose every prediction column is below 9 are kept.

    Parameters:
    - num_iterations (int): Number of iterations to run the KNN classifier.
    - output_filename (str): Path for the output CSV file.

    Returns:
    - DataFrame: The filtered aggregated results from the KNN classifier runs,
      indexed by 'unique'.
    """
    # Load the data only when it is missing: a session that has already run
    # preprocess_data behaves exactly as before, while a fresh session no
    # longer fails with NameError inside optimize_parameters_knn.
    module_namespace = globals()
    if 'X' not in module_namespace or 'Y' not in module_namespace:
        preprocess_data("ConcurrentExecution/RAdatafile.xlsx")

    # Execute the classifier for the given number of iterations
    knn_combined_results = loop_model_knn(num_iterations)

    # 8 stands in for "no prediction in this run"
    aggregated_results = knn_combined_results[0].fillna(8)

    # Group by the 'unique' column and sum the results
    summed_results = aggregated_results.groupby('unique').sum()

    # Keep only rows where every prediction column is below 9
    prediction_columns_knn = [f'predictionKNN{i}' for i in range(num_iterations)]
    filtered_results = summed_results[summed_results[prediction_columns_knn].lt(9).all(axis=1)]

    # Save the final results to a CSV file
    filtered_results.to_csv(output_filename)

    return filtered_results


## Model 4: Logistic Regression (Logit)

Logistic Regression is a widely used statistical method to predict a binary outcome from a linear combination of predictor variables. The central principle of Logistic Regression is the logistic function, which is an S-shaped curve that can take any real-valued number and map it between 0 and 1. This is useful in scenarios where the response variable is categorical, typically with two categories/classes. The coefficients of the logistic regression model are estimated from the training data using maximum likelihood estimation. Regularization techniques, such as L1 (Lasso) and L2 (Ridge), can be applied to prevent overfitting and handle multicollinearity in the data.


In [None]:
def optimize_parameters_logit():
    """
    Grid-search LogisticRegression hyperparameters on the full dataset.

    Reads the module-level `X` (features) and `Y` (target), removes the
    'unique' column from the features, and runs a 5-fold cross-validated
    GridSearchCV scored on accuracy using all available cores (n_jobs=-1).
    The base estimator is created with max_iter=10000.

    Parameters:
    None

    Globals:
    - X (DataFrame): Features dataframe; must contain a 'unique' column.
    - Y (Series): Target variable series.

    Returns:
    dict: Best combination of 'C', 'penalty' and 'solver' found during the
    grid search.
    """
    search_space = {
        # 20 log-spaced regularization strengths from 1e-4 to 1e4.
        'C': np.logspace(-4, 4, 20),
        # L1 (Lasso) and L2 (Ridge) penalties.
        'penalty': ['l1', 'l2'],
        # Single solver, used for both penalty types.
        'solver': ['liblinear']
    }

    # 'unique' is only carried for matching rows later; it is not a model input.
    model_inputs = X.drop(columns=['unique'])

    searcher = GridSearchCV(
        estimator=LogisticRegression(max_iter=10000),
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    searcher.fit(model_inputs, Y)
    return searcher.best_params_

def run_model_logit(best_params, iter_num):
    """
    Fit a LogisticRegression on one random half of the data and predict the other half.

    The module-level `X` and `Y` are split 50/50 with `train_test_split`
    (no seed is passed here). The 'unique' column is excluded from the
    model inputs but kept alongside the predictions so each prediction can
    be matched back to its row. The classifier is always created with
    max_iter=10000 in addition to `best_params`.

    Parameters:
    - best_params (dict): Keyword arguments forwarded to LogisticRegression.
    - iter_num (int): Iteration number, used to name the prediction column.

    Globals:
    - X (DataFrame): Features dataframe; must contain a 'unique' column.
    - Y (Series): Target variable series.

    Returns:
    list: A list containing:
    - [0]: DataFrame with the 'unique' column and a 'predictionLogit{iter_num}' column
           for the test half, indexed 0..n-1.
    - [1]: Accuracy score of the classifier on the test half.
    """
    prediction_col = f'predictionLogit{iter_num}'

    train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.5)

    classifier = LogisticRegression(**best_params, max_iter=10000)
    classifier.fit(train_X.drop(columns=['unique']).to_numpy(), train_y.to_numpy())
    predicted = classifier.predict(test_X.drop(columns=['unique']).to_numpy())

    # Predictions come back in test-row order, so they line up positionally
    # with the re-indexed 'unique' values.
    outcome = test_X[['unique']].reset_index(drop=True)
    outcome[prediction_col] = predicted

    return [outcome, accuracy_score(test_y.to_numpy(), predicted)]

def loop_model_logit(num_iterations):
    """
    Tune a Logistic Regression classifier once, then train/evaluate it repeatedly.

    Hyperparameters are obtained from `optimize_parameters_logit`, after which
    `run_model_logit` is called `num_iterations` times. Each run's predictions
    are outer-merged on 'unique', so a 'unique' value that was not in a given
    run's test half has NaN in that run's column.

    Parameters:
    - num_iterations (int): The number of times the Logistic Regression classifier should be run.

    Returns:
    list: A list containing:
    - [0]: DataFrame with 'unique' and one 'predictionLogit{i}' column per iteration.
    - [1]: A list of accuracy scores, one per iteration.
    """
    tuned_params = optimize_parameters_logit()
    merged = pd.DataFrame()
    scores = []

    for run_idx in range(num_iterations):
        run_frame, run_accuracy = run_model_logit(tuned_params, run_idx)
        # The first run seeds the table; later runs are joined onto it.
        merged = (
            run_frame
            if run_idx == 0
            else pd.merge(run_frame, merged, on="unique", how="outer")
        )
        scores.append(run_accuracy)

    return [merged, scores]

def save_results_logit(num_iterations=20, output_filename='LogitResults.csv'):
    """
    Execute the Logistic Regression classifier multiple times, aggregate the results,
    and save them to a CSV file.

    If the global `X` / `Y` have not been created yet, the data file is loaded
    with `preprocess_data` first, so the function can be called on its own.
    When `X` and `Y` already exist they are reused as-is.

    Missing predictions (a 'unique' value absent from a run's test half after
    the outer merge) are replaced with 8, rows are summed per 'unique' value,
    and only rows whose every prediction column is below 9 are kept.

    Parameters:
    - num_iterations (int): Number of iterations to run the Logistic Regression classifier.
    - output_filename (str): Path for the output CSV file.

    Returns:
    - DataFrame: The filtered aggregated results from the Logistic Regression
      classifier runs, indexed by 'unique'.
    """
    # Load the data only when it is missing: a session that has already run
    # preprocess_data behaves exactly as before, while a fresh session no
    # longer fails with NameError inside optimize_parameters_logit.
    module_namespace = globals()
    if 'X' not in module_namespace or 'Y' not in module_namespace:
        preprocess_data("ConcurrentExecution/RAdatafile.xlsx")

    # Execute the classifier for the given number of iterations
    logit_combined_results = loop_model_logit(num_iterations)

    # 8 stands in for "no prediction in this run"
    aggregated_results = logit_combined_results[0].fillna(8)

    # Group by the 'unique' column and sum the results
    summed_results = aggregated_results.groupby('unique').sum()

    # Keep only rows where every prediction column is below 9
    prediction_columns_logit = [f'predictionLogit{i}' for i in range(num_iterations)]
    filtered_results = summed_results[summed_results[prediction_columns_logit].lt(9).all(axis=1)]

    # Save the final results to a CSV file
    filtered_results.to_csv(output_filename)

    return filtered_results
