# import packages

In [1]:
#libraries and dirrectory 
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.optimize import curve_fit
import patientFunctions as ptfn
import seaborn as sns
import matplotlib.pyplot as plt

# directory management

In [2]:
#location of original data
starting_directory = os.getcwd()


machine_directory = 'C:/Users/mcremer' #the C and path to the project folder
# machine_directory = 'C:/Users/maega' #when working from home machine
storage_directory = 'UFL Dropbox/Maegan Cremer/research-share/Maegan/Projects' #Local, HPG, or dropbox
project_directory = 'Cardiac-Amyloidosis-Multiple-Myeloma' #project folder
project_lv2_directory = '012 Processed Data/data-2024-06-05' #deeper part of project folder

path = os.path.join(machine_directory, storage_directory, 
                    project_directory, project_lv2_directory)

new_dirrectory = path

os.chdir(new_dirrectory)

In [3]:
#updates to directory management
machine_directory = 'C:/Users/mcremer' #the C and path to the project folder
# machine_directory = 'C:/Users/maega' #when working from home machine
storage_directory = 'UFL Dropbox/Maegan Cremer/research-share/Maegan/Projects' #Local, HPG, or dropbox
project_directory = 'Cardiac-Amyloidosis-Multiple-Myeloma' #project folder
project_lv2_directory = '015_Mistic2025' #deeper part of project folder

path = os.path.join(machine_directory, storage_directory, 
                    project_directory, project_lv2_directory)

# parent_dir = path
outputDir = path

## naming file outputs

In [4]:
folderNameHeader = "DFsForSVM"
folderNameCommon = ""
folderNameSuffix = "_20250402_v1"

fileNameHeader_1 = "DescFits"
# fileNameHeader_2 = "ALUnknownPts"
fileNameCommon = "_Features"
fileNameSuffix = "_20250402_v1"

folder_data_out = ptfn.makeFolderPathForData(parent_dir=outputDir, folderName_header=folderNameHeader, 
                                        folderName_common=folderNameCommon, folderName_suffix=folderNameSuffix)
# outputToExcel(*, fileName_header=fileNameHeader, fileName_Common= fileNameCommon, 
#               fileName_suffix= fileNameSuffix, folderName = folder_data_out, sheetName= *)

# fig_file_out = outputFiguresPath(fileName_header=fileNameHeader, fileName_mid=**, 
#                                  fileName_suff= fileNameSuffix, parent_dir=parent_dir, folderName=folder_data_out)

folderNameOut = folder_data_out

In [5]:
#file name suffix 
outputSheet_Suffix = fileNameSuffix
outputSheet_common = fileNameCommon

# Penalty scores used throughout for missing data and low time resolution

## length 1

In [6]:
#filler information for fits of 1 point
# one_point_STD = 0
one_point_r2 = -1
# one_point_residual_error = 0

## length 0

In [7]:
penalty_coef= np.nan
penalty_intercept = np.nan
penalty_R2 = np.nan
penalty_residual_error = np.nan
penalty_fillna = np.nan

In [8]:
penalty_echo = -1


# collecting patient information

In [9]:
#importing patient data as dictionaries of dataframes

#build a dictionary of patients and their values 
ptFiles = ptfn.GetListofPTfiles(new_dirrectory, '.xlsx', 'data')

ptDict = {}

#loop through the listOfPatients 
for file in ptFiles: 
    pt = ptfn.patient(new_dirrectory, file, 4)
    ptDict[pt.ptID] = pt

In [10]:
EchoTimeColumn ='time from diagnosis to first echo (months)'
echoDates = {}
for pt in ptDict.keys():
    ptEchoData = ptDict[pt].echoData
    patientEchoDate = ptEchoData[EchoTimeColumn]
    if 'none' not in list(patientEchoDate):
        echoDates[pt] = patientEchoDate

#test to see if this still runs and if the data is correct

# collecting patients of interest

In [11]:
#list of patients by amyloid status 

listPositive = []
listNegative = []
listOther = []


list_pts_remove = ["K-01", "G-01", "J-01", "L-02"]
#remove list_pts_remove from ptDict
for pt in list_pts_remove:
    if pt in ptDict:
        del ptDict[pt]

#get list of positive patients 
for pt in ptDict:
    ptALStatus = ptDict[pt].amyloid_status
    if ptALStatus == 'yes':
        listPositive.append(pt)
    elif ptALStatus == 'no':
        listNegative.append(pt)
    else:
        listOther.append(pt)

# collecting the labs of interest

labs of interest have at least one data point for each patient and aren't tagged as methods or limits of normality

In [12]:
#for patient labs, convert to boolean and find the number of each lab (true)
ptLabsBool = {}
for pt in ptDict.keys():
    ptLabsData = ptDict[pt].labsData
    #convert to boolean if not nan
    ptLabsData = ptLabsData.notna() #not sure this is capturing the null data correctly
    #find the number of each lab (true)
    ptLabsData = ptLabsData.sum(axis=0) #check the axis
    #add to the dictionary describing the number of each lab the patient has
    ptLabsBool[pt] = ptLabsData

#then find frequency of labs across all patients
ptLabsBool_df = pd.DataFrame(ptLabsBool)
#count frequency of value in a column 

dict_value_counts_labs ={}
for lab in ptLabsBool_df.index:
    #call a row and perform value counts, save it to a key as the lab name in the dictionary
    dict_value_counts_labs[lab] = ptLabsBool_df.loc[lab].value_counts()

#convert to dataframe and transpose
df_value_counts_labs = pd.DataFrame(dict_value_counts_labs).T

In [13]:
#removing things that aren't lab values
omit = ['Method',"Lower Limit of Normal", "Upper Limit of Normal"]

#drop the rows whos string contains items from the omit list
df_value_counts_labs_filtered = df_value_counts_labs[~df_value_counts_labs.index.str.contains('|'.join(omit))]

#collect the rows where the column"0" is null
df_value_counts_labs_filtered = df_value_counts_labs_filtered[df_value_counts_labs_filtered[0].isnull()]

#print the index of df_value_counts_labs_filtered
filteredLabs = df_value_counts_labs_filtered.index.tolist()

In [14]:
#remove 'DeID', 'Survival(Mo)', 'RelTime(Days)', 'Unnamed: 0' from the filteredLabs list
filteredLabs = [x for x in filteredLabs if x not in ['DeID', 'Survival(Mo)', 'RelTime(Days)', 'Unnamed: 0']]

In [15]:
filteredLabs

['ALT Result(U/L)',
 'AST Result(U/L)',
 'Albumin Electrophoresis Result',
 'Albumin Result(gm/dL)',
 'BUN Result(mg/dL)',
 'Beta-2-Microglobulin Result',
 'Calcium Result(mg/dL)',
 'Chloride Result(mmol/L)',
 'Creatinine Result(mg/dL)',
 'Glucose Result',
 'HCT Result',
 'Hemoglobin Result',
 'Immature Gran Auto',
 'Kappa Free Light Chains Result (mg/L)',
 'Kappa/Lambda Free Light Chain Ratio Result',
 'LDH Result(U/L)',
 'Lambda Free Light Chains Result (mg/L)',
 'MCH Result(pg)',
 'MCHC Result',
 'MCV Result',
 'Monocyte Result',
 'Neutrophil Result',
 'Nucleated RBC Result',
 'Pct. Immature Gran Auto Result',
 'Pct. Mono Result',
 'Pct. Neutrophil Result',
 'Platelet Count Result(k/uL)',
 'Potassium Result(mmol/L)',
 'RBC Result',
 'Serum Electrophoresis Alpha 1 Result',
 'Serum Electrophoresis Alpha 2 Result',
 'Serum Electrophoresis Beta Result',
 'Serum Electrophoresis Gamma Result',
 'Sodium Result(mmol/L)',
 'Total Serum Protein Result',
 'WBC Result(k/uL)',
 'total CO2 Result

## clean the lab names for working with throughout the document

In [16]:
#further clean the labs list to remove units and other non-lab values

# encoding demographic information

In [64]:
import re

In [68]:
#collect age, sex, and dx and make it lowercase
#make a dictionary of patient demographics
# ptDemographics = {}
ptSex, ptDx, ptALStatus = {}, {}, {}
# ptAge = {}
# ptDx = {}
# ptALStatus = {}
for pt in ptDict.keys():
    ptSex[pt] = ptDict[pt].sex.lower()
    # ptAge[pt] = ptDict[pt].ageDx
    ptDx[pt] = ptDict[pt].dx
    ptDx[pt] = re.sub(r"[\\/\d]", "", ptDx[pt]).lower()
    ptALStatus[pt] = ptDict[pt].amyloid_status.lower()

    
    #Convert "multiple myeloma" to "MM"
    ptDx[pt] = ptDx[pt].replace("multiple myeloma", "mm")
    #still have compound entries
    # if ptDx[pt] contains "mg" replace the entry with "mgus", if contains "mm" replace with "mm"
    if "mg" in ptDx[pt]:
        ptDx[pt] = "mgus"
    elif "mm" in ptDx[pt]:
        ptDx[pt] = "mm"
    elif "mm" or "mg" not in ptDx[pt]:
        ptDx[pt] = 'other'

#convert the dictionaries to dataframes and concatenate them
ptSex_df = pd.DataFrame.from_dict(ptSex, orient='index', columns=['sex'])
ptALStatus_df = pd.DataFrame.from_dict(ptALStatus, orient='index', columns=['CA_status'])
# ptAge_df = pd.DataFrame.from_dict(ptAge, orient='index', columns=['age'])
ptDx_df = pd.DataFrame.from_dict(ptDx, orient='index', columns=['dx'])
#concatenate the dataframes
ptDemographics_df = pd.concat([ptALStatus_df, ptSex_df, ptDx_df], axis=1)

In [69]:
#onehot encoding of the categorical variables
ptDemographics_df_encoded = pd.get_dummies(ptDemographics_df, columns=["CA_status", "sex", "dx",], dtype=int, drop_first=True)

In [75]:
#collecting age and survival data
ptAge, ptsurvival = {}, {}
for pt in ptDict.keys():
    ptAge[pt] = ptDict[pt].ageDx
    ptsurvival[pt] = ptDict[pt].survival

#convert to dataframe
ptAge_df = pd.DataFrame.from_dict(ptAge, orient ='index', columns = ['age'])
ptsurvival_df = pd.DataFrame.from_dict(ptsurvival, orient = 'index', columns = ['survival(mo)'])
#concatenate the dataframes
ptDemographics_df = pd.concat([ptDemographics_df, ptAge_df, ptsurvival_df],
                               axis=1)
#rename the columns


# encoding note information

In [None]:
#dictionaries for conversion
dictECOGtoKPS = {0 : 95,
                 1 : 75, 
                 2 : 55, 
                 3 : 35, 
                 4 : 15, 
                 5 : 0}

dictKPStoECOG = {100: 0, 90: 0, 
                 80: 1, 70: 1, 
                 60: 2, 50: 2, 
                 40: 3 , 30: 3, 
                 20: 4, 10: 4, 0: 5 }

ptDictMDnotesScored = {}
#loop through the listOfPatients
for pt in ptDict.keys():
    #call each patient in the dictionary storing patient note information
    ptNotes = ptDict[pt].MDnotes

    if not isinstance(ptNotes, bool):

        #columns of interest
        listColumnNames = ['time from diagnosis to note (months)', 'ECOG Performance Status', 'Karnofsky Performance Status']
        ptNotesCorrected = ptNotes[listColumnNames]
        #convert the Karnofsky scores to integers
        ptNotesCorrected['Karnofsky Performance Status'] = ptNotesCorrected['Karnofsky Performance Status'].astype(str).str.extract('(\d+)').astype(float).astype('Int64')

        #remove rows where the "ECOG Performance Status" is "OTHER"
        ptNotesCorrected = ptNotesCorrected[ptNotesCorrected['ECOG Performance Status'] != 'OTHER'] 
        #remove rows where the "Karnofsky Performance Status" is "OTHER"
        ptNotesCorrected = ptNotesCorrected[ptNotesCorrected['Karnofsky Performance Status'] != 'OTHER']

        # old methods, we will see if we still need this

        for i in list(ptNotesCorrected.index):
            #get the scores for that row
            ptTime = ptNotesCorrected.at[i, 'time from diagnosis to note (months)']
            ptECOG = ptNotesCorrected.at[i,'ECOG Performance Status']
            ptKPS = ptNotesCorrected.at[i, 'Karnofsky Performance Status']
        
            #if the value of ptECOG was "OTHER", then the row should be dropped.
            if ptECOG == "OTHER":
                # ptNotesCorrected.drop(index = i, inplace=True ) #drops the full row, would need to add time back if interested in clinical impression
                ptECOG = np.nan #replacement is dropped before descriptive stats are run
                ptKPS = np.nan #replacement is dropped before descriptive stats are run

            #infer ECOG from Karnofsky where you now have half points
            elif np.isnan(ptECOG) ==True and pd.isna(ptKPS) == False: 
                ptECOG = dictKPStoECOG[ptKPS]

            #infer Karnofsky from ECOG 
            elif pd.isna(ptKPS) == True and np.isnan(ptECOG) ==False: 
                ptKPS = dictECOGtoKPS[ptECOG]

            #save these values back to the dataframe
            if ptECOG != "OTHER":
                ptNotesCorrected.at[i,'ECOG Performance Status'] = ptECOG
                ptNotesCorrected.at[i, 'Karnofsky Performance Status'] = ptKPS
                                
    #saving the notes to the new dictionary
    ptDictMDnotesScored[pt] = ptNotesCorrected


  ptNotesCorrected['Karnofsky Performance Status'] = ptNotesCorrected['Karnofsky Performance Status'].astype(str).str.extract('(\d+)').astype(float).astype('Int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ptNotesCorrected['Karnofsky Performance Status'] = ptNotesCorrected['Karnofsky Performance Status'].astype(str).str.extract('(\d+)').astype(float).astype('Int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ptNotesCorrected['Karnofsky Performance Status'] = ptNotesCorrected['Karnofsky Performance Status'].astype(str).str.extract('(\d+)').astype(fl

# descriptive stats for each lab by patient
don't forget to handle instances of 1 for values which aren't counts to be the one recorded value

In [None]:
#descriptive statistics for each lab for each patient and assign to a dictionary
ptLabsDescriptiveStats = {}
#loop through the listOfPatients
for pt in ptDict.keys():
    #call each patient in the dictionary storing patient lab information
    ptLabs = ptDict[pt].labsData #instead call the filtered labs situation
    #get the descriptive statistics for the labs
    ptLabsDescriptiveStats[pt] = ptLabs[filteredLabs].describe()
    #hybridize the lab names to include the name of the statistic
    ptLabsDescriptiveStats[pt].columns = [str(pt) + "_" + str(col) for col in ptLabsDescriptiveStats[pt].columns]
#convert to dataframe
ptLabsDescriptiveStats_df = pd.DataFrame(ptLabsDescriptiveStats).T

## descriptive stats of notes information

# treatment information

# fits of data
look in old code to see if there are better ways of writing it.

In [None]:
#definitions of functions 
# def linear_model(x, a, b):
#     return a * x + b 

def linear_model_forceB(x, a):
    return a * x 

def quadratic_model(x, a, b, c):
    return a * x**2 + b*x + c

# Define model systems
def quadratic_model_force(x, a, b):
    return a * x **2 + b*x 
