# libraries

In [148]:
#libraries and dirrectory 
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.optimize import curve_fit
import patientFunctions as ptfn
import seaborn as sns
import matplotlib.pyplot as plt
import re
import matplotlib.patches as mpatches


# directory

## inputs

In [149]:
#updates to directory management
# machine_directory = 'C:/Users/mcremer' #the C and path to the project folder
machine_directory = 'C:/Users/maega' #when working from home machine
storage_directory = 'UFL Dropbox/Maegan Cremer/research-share/Maegan/Projects' #Local, HPG, or dropbox
project_directory = 'Cardiac-Amyloidosis-Multiple-Myeloma' #project folder
project_lv2_directory = '017_Mistic_noNotes_t-60' #deeper part of project folder

path = os.path.join(machine_directory, storage_directory, 
                    project_directory, project_lv2_directory)

# parent_dir = path
outputDir = path
os.chdir(outputDir)

## outputs

In [150]:
folderNameHeader = "AggFeatures_"
folderNameCommon = ""
folderNameSuffix = "_20250508"

# fileNameHeader_1 = "DF_SelectedFeats_"
# # fileNameHeader_2 = "ALUnknownPts"
# # fileNameCommon = "_Features"
# fileNameSuffix = "_20250416"

folder_data_out = ptfn.makeFolderPathForData(parent_dir=outputDir, folderName_header=folderNameHeader, 
                                        folderName_common=folderNameCommon, folderName_suffix=folderNameSuffix)


folderNameOut = folder_data_out

# get list of models

In [151]:
# featureRed = 0.25 #feature reduction value of interest
modelFolderNames = ['EchoAdj', 'descOnly', 'ParamsOnly', 'descFits']
modelNames = ['EchoAdj', 'DescOnly', 'ParamsOnly', 'DescFits']

#keys are the folder names
#values are the file names
modelNames_dict = {'EchoAdj': 'EchoAdj',
                   'DescOnly': 'DescOnly',
                   'ParamsOnly': 'ParamsOnly',
                   'DescFits': 'DescFits'}

In [152]:

ConditionFolder = os.listdir(outputDir) #only for models with t-60


In [153]:
ConditionFolder

['AggFeatures__20250508',
 'Book1.xlsx',
 'DFsForSVM_noNotes_t-60__20250506_v1',
 'DF_SelectedFeats_20250508',
 'FeatureCorrelation.prism',
 'MannWhitneyFeaturesStats',
 'StatsVisuals__20250508',
 'SVM_linear_DescFits_red01_250506_v1',
 'SVM_linear_DescFits_red025_250506_v1',
 'SVM_linear_DescFits_red02_250506_v1',
 'SVM_linear_DescFits_red03_250506_v1',
 'SVM_linear_DescOnly_red01_250506_v1',
 'SVM_linear_DescOnly_red025_250506_v1',
 'SVM_linear_DescOnly_red02_250506_v1',
 'SVM_linear_DescOnly_red03_250506_v1',
 'SVM_linear_EchoAdj_red01_250506_v1',
 'SVM_linear_EchoAdj_red025_250506_v1',
 'SVM_linear_EchoAdj_red02_250506_v1',
 'SVM_linear_EchoAdj_red03_250506_v1',
 'SVM_linear_ParamsOnly_red01_250506_v1',
 'SVM_linear_ParamsOnly_red025_250506_v1',
 'SVM_linear_ParamsOnly_red02_250506_v1',
 'SVM_linear_ParamsOnly_red03_250506_v1',
 'SVM_model_comparison_noNotes_t-60.xlsx',
 'zips']

In [154]:
#finding model folders
#SVM_linear_descFits is the start of all folders for the one model

modelFolders_dict = {}

for model in modelNames_dict.keys():
    listOfFolders = []
    for name in ConditionFolder:
        if model in name:
            listOfFolders.append(name)
    modelFolders_dict[model] = listOfFolders


In [155]:
modelFolders_dict

{'EchoAdj': ['SVM_linear_EchoAdj_red01_250506_v1',
  'SVM_linear_EchoAdj_red025_250506_v1',
  'SVM_linear_EchoAdj_red02_250506_v1',
  'SVM_linear_EchoAdj_red03_250506_v1'],
 'DescOnly': ['SVM_linear_DescOnly_red01_250506_v1',
  'SVM_linear_DescOnly_red025_250506_v1',
  'SVM_linear_DescOnly_red02_250506_v1',
  'SVM_linear_DescOnly_red03_250506_v1'],
 'ParamsOnly': ['SVM_linear_ParamsOnly_red01_250506_v1',
  'SVM_linear_ParamsOnly_red025_250506_v1',
  'SVM_linear_ParamsOnly_red02_250506_v1',
  'SVM_linear_ParamsOnly_red03_250506_v1'],
 'DescFits': ['SVM_linear_DescFits_red01_250506_v1',
  'SVM_linear_DescFits_red025_250506_v1',
  'SVM_linear_DescFits_red02_250506_v1',
  'SVM_linear_DescFits_red03_250506_v1']}

# get list of features across models

In [156]:
#SVM_linear_descFits is the start of all folders for the one model
#SVM_linear_DescFits is the start of the file name with the feature lists and performance
#sheet concat_features has one column per rank weight with the selected features of that rank weight's best performing model
#the first column is index, the first row is the rank weight

In [157]:
dict_ogfeats_byModel = {}
dict_ogFeats_byFolder = {}
dict_feats_byModel = {}
for model in modelFolders_dict.keys():
    #read in each of the selected features workbooks with sheet "concat_features"
    model_OGdataList_multipleRuns = []
    model_features_multipleRuns = []
    for folder in modelFolders_dict[model]:
        #get the folder contents
        filesInFolder = os.listdir(os.path.join(outputDir,folder))
        #find file name
        fileName = ''.join([file for file in filesInFolder if "SVM_" in file and 'xlsx' in file])
        #make path 
        tempPath = os.path.join(outputDir,folder,fileName)
        dataTable = pd.read_excel(tempPath, sheet_name = 'concat_features')
        dataMelted = pd.melt(dataTable.iloc[:,1:]).dropna()
        model_features_multipleRuns = model_features_multipleRuns + list(dataMelted['value'])
        ogdataList_all = [feature.split("_",1)[0] for feature in dataMelted['value']]
        ogdataList = list(set(ogdataList_all))
        dict_ogFeats_byFolder[folder] = ogdataList
        model_OGdataList_multipleRuns = model_OGdataList_multipleRuns + ogdataList
    dict_ogfeats_byModel[model] = list(set(model_OGdataList_multipleRuns))
    dict_feats_byModel[model]= list(set(model_features_multipleRuns))

In [158]:
len(dict_ogfeats_byModel['ParamsOnly'])

48

# get the original feature frames 

In [159]:
dfFolder = os.listdir('DFsForSVM_noNotes_t-60__20250506_v1')

In [160]:
dict_dataframes = {}
for model in modelNames_dict.values():
    #find the file
    fileName = ''.join([file for file in dfFolder if model in file and 'xlsx' in file])
    features = pd.read_excel(os.path.join('DFsForSVM_noNotes_t-60__20250506_v1', fileName), sheet_name='knownPts')
    dict_dataframes[model] = features

## compare total original data to selected original data across multiple runs

In [161]:
dict_dataframes.keys()

dict_keys(['EchoAdj', 'DescOnly', 'ParamsOnly', 'DescFits'])

In [162]:
modelNames_dict

{'EchoAdj': 'EchoAdj',
 'DescOnly': 'DescOnly',
 'ParamsOnly': 'ParamsOnly',
 'DescFits': 'DescFits'}

In [163]:
print(modelNames_dict.get('DescFits'))

DescFits


In [168]:
dict_model_stats = {}
for model in dict_dataframes.keys():
    dataTable = dict_dataframes[model]
    # model = modelNames_dict[model]
    dataTable=dict_dataframes[model].drop(['DeID','CA_status_yes'], axis = 1)

    fullFeatureList = list(dataTable.columns)
    fullOGData = [feature.split("_",1)[0] for feature in fullFeatureList]



    refFeats = dict_ogfeats_byModel[model]

    stats = { 'length full feature list': len(fullFeatureList),
             'length of selected features across runs': len(dict_feats_byModel[model]),
             'length of original data in full feature list': len(list(set(fullOGData))),
             'length of original data across selected features': len(refFeats)
             }
    dict_model_stats [model] = stats


In [172]:
featureList_OGData_compare = pd.DataFrame.from_dict(dict_model_stats).T

In [173]:
ptfn.outputToExcel(fileName_header = "FeatListOGDataCompare", fileName_Common = "",
                   fileName_suffix = '_2025-05-08', folderName = folderNameOut, parent_dir= outputDir,
                   sheetName = 'all runs', df_data = featureList_OGData_compare)