# Regression Analysis

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import os

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

## Prepare Data

In [None]:
def reconcile_filename_column(df):
    '''
    Normalise the file-identifier column name to 'Filename'.

    If the frame has a lowercase 'filename' column it is renamed in place;
    otherwise the frame is left untouched. The same DataFrame object that was
    passed in is returned in both cases.
    '''
    if 'filename' not in df.columns:
        return df

    df.rename(columns={'filename': 'Filename'}, inplace=True)
    return df

def drop_txt(txt):
    '''
    Remove the '.txt' marker from a filename value.

    Integer values (built-in ``int`` or any NumPy integer) are returned
    unchanged, so the function can be re-applied to a column that has already
    been converted to numeric IDs. Any other value is expected to be a string
    and has every occurrence of '.txt' removed.
    '''
    # isinstance (rather than an exact type comparison) also accepts NumPy
    # integer scalars, which have no .replace() method.
    if isinstance(txt, (int, np.integer)):
        return txt
    return txt.replace('.txt', '')

def combine_psuedos_with_scores(df, scores_df):
    '''
    Join predictor features to the score table on the integer pseudonym ID.

    'Filename' in `df` (with any '.txt' removed) and 'Psuedos' in `scores_df`
    are both cast to int and used as the join keys. The merge is a right join
    on the score table, after which every row containing a missing value is
    dropped.

    Neither input frame is modified: the key conversion is done on copies so
    that a `scores_df` shared between several calls keeps its original
    contents.

    Returns the merged DataFrame (feature columns followed by score columns).
    '''
    features = df.assign(Filename=df['Filename'].apply(drop_txt).astype('int'))
    scores = scores_df.assign(Psuedos=scores_df['Psuedos'].astype('int'))

    combined_df = features.merge(scores, how='right', left_on='Filename', right_on='Psuedos')
    # Rows from the score table without matching features come back as NaN
    # after the right join and are removed here along with any other
    # incomplete rows.
    return combined_df.dropna()

def categorize_grades(score):
    '''
    Map a fractional score to an integer grade bucket.

    Buckets: >= 0.9 -> 4, >= 0.8 -> 3, >= 0.7 -> 2, >= 0.6 -> 1,
    anything else -> 0.
    '''
    # Checked from the highest cut-off down; the first one met wins.
    cutoffs_to_grade = ((0.9, 4), (0.8, 3), (0.7, 2), (0.6, 1))
    for cutoff, grade in cutoffs_to_grade:
        if score >= cutoff:
            return grade
    return 0

def extract_all_features(df):
    '''
    Return the frame's column names as a list, without the file-ID column.

    'Filename' is removed when present; only if it is absent is a lowercase
    'filename' removed instead. The DataFrame itself is not changed.
    '''
    feature_names = df.columns.tolist()
    for id_column in ('Filename', 'filename'):
        if id_column in feature_names:
            feature_names.remove(id_column)
            break  # at most one identifier column is removed
    return feature_names

def create_X_y(df, scores_df):
    '''
    Split the merged feature/score table into predictors and response.

    Returns a dict with:
      'X' - the merged frame without the ID columns, the score column and
            'Response Word Count'
      'y' - the 'ChatGPT Percent Score' column
    '''
    merged = combine_psuedos_with_scores(df, scores_df)

    non_predictor_columns = ['Filename', 'Psuedos', 'ChatGPT Percent Score', 'Response Word Count']
    target = merged['ChatGPT Percent Score']
    predictors = merged.drop(columns=non_predictor_columns)

    return {'X': predictors, 'y': target}

def split_and_fit(temp_df, scores_df, pred_name, test_size=0.2, random_state=None):
    '''
    Build a standardised train/test split of grade categories for one feature set.

    temp_df      - predictor results for one tool (needs a 'Filename' column)
    scores_df    - score table passed through to create_X_y
    pred_name    - label stored under 'name' in the returned dict
    test_size    - fraction of rows held out for testing (default 0.2)
    random_state - optional seed forwarded to train_test_split; None keeps
                   the split random on every call

    The rows are split first and the StandardScaler is fitted on the training
    rows only, so the held-out rows do not influence the scaling statistics.
    The scaled frames keep the row index of the merged data, which keeps
    X_* and y_* aligned by label as well as by position.

    Returns a dict with keys 'name', 'df', 'X_train', 'X_test', 'y_train'
    and 'y_test'.
    '''
    temp_X_y_dict = create_X_y(temp_df, scores_df)
    X = temp_X_y_dict['X']
    y = temp_X_y_dict['y'].apply(categorize_grades)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit on the training rows only, then apply the same transform to the
    # test rows.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    return {
        'name': pred_name,
        'df': temp_df,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

def create_combined_features_df(base_dir, combined_dict):
    '''
    Read every predictor result file and place the selected columns side by side.

    base_dir      - project root; files are read from
                    <base_dir>/predictor_results/<tool>_results.csv
    combined_dict - maps each tool name ('taaco', 'taaled', 'taales',
                    'taassc') to the list of feature columns to keep

    Each partial frame keeps its own 'Filename' column, so the returned frame
    contains one 'Filename' column per tool. A KeyError is raised if a
    requested column is missing from a result file.
    '''
    list_of_partial_dfs = []

    for pred_name in ['taaco', 'taaled', 'taales', 'taassc']:
        temp_pred_results_file_path = os.path.join(base_dir, 'predictor_results', f'{pred_name}_results.csv')
        temp_df = reconcile_filename_column(pd.read_csv(temp_pred_results_file_path))
        # reconcile_filename_column has already renamed any 'filename' column
        # to 'Filename', so no lowercase fallback is needed; a genuinely
        # missing column surfaces as a KeyError instead of being masked.
        list_of_partial_dfs.append(temp_df[combined_dict[pred_name] + ['Filename']])

    # NOTE(review): concat with axis=1 lines rows up by index (here the row
    # position in each CSV), not by 'Filename'. This presumably relies on all
    # result files listing the same files in the same order - confirm.
    stable_features_df = pd.concat(list_of_partial_dfs, axis=1)
    return stable_features_df

def load_dfs(base_dir, math=0):
    '''
    Load the score table and build one train/test bundle per feature set.

    The score workbook is read from
    <base_dir>/data/combined_responses_scores_added.xlsx and reduced to the
    pseudonym, score and word-count columns. When `math` equals 0 the rows
    whose 'Course' is 'MATH 111' are excluded.

    A bundle (see split_and_fit) is created for each of the four predictor
    result files, followed by one more, named 'all_features', that combines
    the columns of all four.

    Returns a two-element list: [list of bundles, filtered score table].
    '''
    scores_path = os.path.join(base_dir, 'data', 'combined_responses_scores_added.xlsx')
    kept_columns = ['Psuedos', 'ChatGPT Percent Score', 'Response Word Count', 'Course']
    scores_df = pd.read_excel(scores_path, index_col=0)[kept_columns]
    if math == 0:
        is_math_row = scores_df['Course'] == 'MATH 111'
        scores_df = scores_df[~is_math_row]
    # 'Course' is only needed for the filter above.
    scores_df = scores_df.drop(columns=['Course'])

    results_dir = os.path.join(base_dir, 'predictor_results')
    list_of_df_dicts = []
    combined_dict = {}
    for tool in ['taaco', 'taaled', 'taales', 'taassc']:
        tool_df = reconcile_filename_column(
            pd.read_csv(os.path.join(results_dir, f'{tool}_results.csv'))
        )
        list_of_df_dicts.append(split_and_fit(tool_df, scores_df, tool))
        combined_dict[tool] = extract_all_features(tool_df)

    all_features_df = create_combined_features_df(base_dir, combined_dict)
    # Each tool contributes its own "Filename" column; keep only the first
    # occurrence of any repeated column name.
    first_occurrence = ~all_features_df.columns.duplicated()
    all_features_df = all_features_df.loc[:, first_occurrence].copy()
    list_of_df_dicts.append(split_and_fit(all_features_df, scores_df, 'all_features'))

    return [list_of_df_dicts, scores_df]

In [None]:
# Treat the current working directory as the project root that contains the
# 'data' and 'predictor_results' folders read by load_dfs.
base_dir=os.getcwd()
# math=1 skips the filter that removes 'MATH 111' rows, so those responses
# are kept.
list_of_df_dicts, scores_df = load_dfs(base_dir, math=1)

In [None]:
def classification_distribution(scores_df):
    '''
    Return the grade bucket of every row as a Series named 'letter_grades'.

    The bucket is computed from 'ChatGPT Percent Score' with
    categorize_grades; `scores_df` itself is left unchanged.
    '''
    grade_buckets = scores_df['ChatGPT Percent Score'].apply(categorize_grades)
    return grade_buckets.rename('letter_grades')

In [None]:
# Share of responses in each grade bucket (0-4); normalize=True reports
# proportions rather than raw counts.
classification_distribution(scores_df).value_counts(normalize=True)

In [None]:
def grid_search_best_model(df_dict):
    '''
    Grid-search a random forest on the training split and return the winner.

    Every combination of tree depth and forest size below is scored with
    5-fold cross-validated accuracy on df_dict['X_train'] / df_dict['y_train'],
    using balanced class weights and all available cores. The estimator
    refitted with the best combination is returned.
    '''
    hyperparameter_grid = {
        'max_depth': [2, 4, 6, 8],
        'class_weight': ['balanced'],
        'n_estimators': [10, 20, 30, 40, 50],
    }
    grid_search = GridSearchCV(
        RandomForestClassifier(),
        param_grid=hyperparameter_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(df_dict['X_train'], df_dict['y_train'])

    return grid_search.best_estimator_

def run_rf(df_dict, top_n):
    '''
    Fit the grid-searched forest and record its results in `df_dict`.

    Adds these keys to `df_dict` (in place) and returns the same dict:
      'accuracy'                      - score on the held-out test split
      'feature_importances'           - importance of every training column
      'top_{top_n}_feature_names'     - index of the top_n most important columns
      'top_{top_n}_feature_importances' - their importances as a list
    '''
    best_forest = grid_search_best_model(df_dict)
    test_accuracy = best_forest.score(df_dict['X_test'], df_dict['y_test'])
    importances = best_forest.feature_importances_

    ranked = pd.Series(importances, index=df_dict['X_train'].columns)
    strongest = ranked.nlargest(n=top_n)

    df_dict.update({
        'accuracy': test_accuracy,
        'feature_importances': importances,
        f'top_{top_n}_feature_names': strongest.index,
        f'top_{top_n}_feature_importances': strongest.tolist(),
    })
    return df_dict

def graph_feature_importance(df_dict):
    '''
    Show a horizontal bar chart of the feature importances stored in `df_dict`.

    Expects 'feature_importances' (as written by run_rf) and 'X_train', whose
    column names label the bars.
    '''
    feature_names = df_dict['X_train'].columns
    importances = df_dict['feature_importances']

    plt.barh(feature_names, importances)
    plt.title('Feature Importance in Random Forest Classifier')
    plt.xlabel('Feature Importance')
    plt.show()

def run_k_models(df_dict, top_n, k_models):
    '''
    Repeat run_rf `k_models` times on the same train/test split.

    Each repetition works on its own shallow copy of `df_dict`, tagged with
    its 'run_index', so the results of different runs do not overwrite each
    other. Returns the list of result dicts in run order.
    '''
    return [
        run_rf({**df_dict, 'run_index': run_index}, top_n)
        for run_index in range(k_models)
    ]

def check_accuracy_df(list_of_run_df_dicts):
    '''
    Tabulate the accuracy of every run.

    Returns a DataFrame with one row per run and one column per distinct
    'name', holding that run's 'accuracy' in the column of its own name.
    '''
    accuracy_rows = [
        {run['name']: run['accuracy']} for run in list_of_run_df_dicts
    ]
    return pd.DataFrame(accuracy_rows)

def check_feature_stability(list_of_run_df_dicts, top_n, min_fraction=0.3):
    '''
    Report how often each feature appears among the top features across runs.

    list_of_run_df_dicts - run results, each holding the key
                           'top_{top_n}_feature_names' (any iterable of names)
    top_n                - which 'top_{top_n}_feature_names' entry to read
    min_fraction         - only features present in more than this fraction
                           of the runs are returned (default 0.3)

    Returns a Series indexed by feature name whose values are the fraction of
    runs in which the feature was listed, most frequent first. An empty
    Series is returned when there are no runs.
    '''
    names_key = f'top_{top_n}_feature_names'
    n_runs = len(list_of_run_df_dicts)
    if n_runs == 0:
        return pd.Series(dtype=float, name='proportion')

    all_names = pd.Series(
        [name for run in list_of_run_df_dicts for name in run[names_key]],
        dtype=object,
        name=names_key,
    )
    # Divide the raw counts by the number of runs so the result stays a true
    # per-run fraction even when a run lists fewer than top_n features.
    fraction_of_runs = (all_names.value_counts() / n_runs).rename('proportion')
    return fraction_of_runs[fraction_of_runs > min_fraction]

In [None]:
def run_all_models(list_of_df_dicts, top_n=3, k_models=1000):
    '''
    Run the repeated random-forest experiment for every feature set.

    list_of_df_dicts - train/test bundles, one per feature set
    top_n            - number of top features recorded per run (default 3)
    k_models         - number of repeated fits per feature set (default 1000)

    Returns a list with one entry per feature set, each entry being the list
    of run results produced by run_k_models.
    '''
    return [
        run_k_models(feature_set, top_n, k_models)
        for feature_set in list_of_df_dicts
    ]

In [None]:
# Repeatedly grid-search and fit random forests for every feature set; this
# is the expensive cell of the notebook.
list_of_all_models = run_all_models(list_of_df_dicts)

In [None]:
#def report_accuracy(list_of_all_models):
#    for temp_all_runs in list_of_all_models:


def report_feature_stability(list_of_all_models, top_n=3):
    '''
    Print the feature-stability summary of every feature set.

    list_of_all_models - one list of run results per feature set
    top_n              - which 'top_{top_n}_feature_names' entry of the runs
                         to summarise (default 3); it must match the value
                         the runs were produced with

    Each summary is followed by a blank line.
    '''
    for runs_for_feature_set in list_of_all_models:
        print(check_feature_stability(runs_for_feature_set, top_n))
        print()

In [None]:
# Print, for each feature set, the features that most often rank among the
# top features across the repeated runs.
report_feature_stability(list_of_all_models)