In [None]:
import os
import pandas as pd
import re

# Merge the chunked "combined-augmented_to<N>new.csv" files and the separately
# generated "missing" rows into one de-duplicated, sorted CSV.

# Directory containing the CSV files
directory = '.'

# Chunk files are named after the index they end at; group 1 captures it.
pattern = re.compile(r'combined-augmented_to(\d+)new\.csv')

# Get all files in the directory
files = os.listdir(directory)

# Keep only the chunk files and order them numerically by their ending index
# (a plain string sort would place "to102" before "to22").
# fullmatch() is used instead of match() so that names with trailing text,
# e.g. "combined-augmented_to22new.csv.bak", are not picked up by accident.
sorted_files = sorted(
    (name for name in files if pattern.fullmatch(name)),
    key=lambda name: int(pattern.fullmatch(name).group(1)),
)

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" error.
if not sorted_files:
    raise FileNotFoundError(
        "No files matching {!r} found in {!r}".format(
            pattern.pattern, os.path.abspath(directory)
        )
    )

# Read every chunk, preserving the numeric order established above
dataframes = [pd.read_csv(os.path.join(directory, name)) for name in sorted_files]

# Concatenate all chunks
combined_df = pd.concat(dataframes, ignore_index=True)

# Read the additional CSV file and append its rows to the combined dataframe.
# The path is resolved against `directory`, like the chunk files.
missing_df = pd.read_csv(os.path.join(directory, 'combined-augmented_missingnew.csv'))
combined_df = pd.concat([combined_df, missing_df], ignore_index=True)

# Drop duplicate rows based on the 'Unnamed: 0' column, keeping the first
# occurrence. Chunk rows come before the "missing" rows, so on a clash the
# chunk row is the one that is kept.
combined_df = combined_df.drop_duplicates(subset='Unnamed: 0', keep='first')

# Sort the final dataframe using the 'Unnamed: 0' column
combined_df = combined_df.sort_values(by='Unnamed: 0').reset_index(drop=True)

# Save the combined dataframe to a new CSV file
combined_df.to_csv('combined_augmented_all.csv', index=False)

print("CSV files combined successfully into 'combined_augmented_all.csv'")

CSV files combined successfully into 'combined_augmented_all.csv'


In [12]:
# Display the merged frame for inspection (a bare last expression is rendered by the notebook).
combined_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,dicom_id,subject_id,stay_id,study_id,split,eye_tracking_data_discarded,gender,...,Augmented_Pleural effusion risk factors (1-5),Augmented_Pleural disease risk factors (1-10),augmenting_query,augmenting_output,Augmented_History of COPD,Augmented_Age,Augmented_Heartrate,Augmented_Oxygen levels (%),Augmented_Importance Score,Augmented_Data
0,0,0,P102R108387,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,train,False,F,...,4.0,8.0,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,...",,,,,,
1,1,1,P102R379837,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,val,False,F,...,4.0,8.0,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,...",,,,,,
2,2,2,P102R558314,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,val,False,F,...,4.0,8.0,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,...",,,,,,
3,3,3,P102R765317,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,train,False,F,...,3.0,5.0,You are a clinical expert. With following extr...,"{""Shortness of breath"": false, ""Fatigue"": fals...",,,,,,
4,4,4,P102R915878,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,train,False,F,...,4.0,8.0,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,...",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,42,898,P300R206615,44572c6e-62135033-45ea19ba-cbea2777-a81fce81,19893236,31661301.0,59220136,train,False,F,...,3.0,5.0,You are a clinical expert. With following extr...,"{""Shortness of breath"": false, ""Fatigue"": fals...",,,,,,
795,43,899,P300R611251,2b20dcdf-4077bc16-48fc8eb5-265ef218-f6552cb0,19906407,33352559.0,57296330,test,False,M,...,3.0,5.0,You are a clinical expert. With following extr...,"{""Shortness of breath"": false, ""Fatigue"": fals...",,,,,,
796,44,900,P300R519683,92134f99-0e73faba-1280ad81-218c68ba-933a85c5,19907884,39112538.0,57427881,train,False,F,...,3.0,5.0,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,...",,,,,,
797,45,901,P300R749452,c6db0413-f3266e66-031e9892-2809b536-c13cf9f2,19907884,31819342.0,59325966,train,False,F,...,3.0,5.0,You are a clinical expert. With following extr...,"{""Shortness of breath"": false, ""Fatigue"": fals...",,,,,,


In [3]:
# Load the reference spreadsheet; its 'Unnamed: 0' column is what the
# generated rows are compared against further down.
sample_df = pd.read_csv("./spreadsheets/reflacx_clinical.csv")

In [5]:
# 'Unnamed: 0' values present in the combined frame, as a plain Python list.
generated_idx = combined_df["Unnamed: 0"].tolist()

In [8]:
# Positions (0-based, within sample_df) of the rows whose 'Unnamed: 0' value
# does not appear among the generated ids.
# NOTE: the result holds positions from enumerate(), not the id values themselves.
# A set gives constant-time membership tests instead of rescanning the list for
# every row, and the loop variable no longer shadows the builtin `id`.
generated_id_set = set(generated_idx)
[
    position
    for position, row_id in enumerate(sample_df["Unnamed: 0"].tolist())
    if row_id not in generated_id_set
]

[30, 31, 32, 33, 34, 414, 431, 445, 446, 447, 464]

In [4]:
# Display the combined frame for inspection.
combined_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,dicom_id,subject_id,stay_id,study_id,split,eye_tracking_data_discarded,gender,...,Augmented_Chest radiography,Augmented_Computed tomography (CT) scans,Augmented_Pulmonary function tests,Augmented_Heart failure,Augmented_Perforation of the intestine,Augmented_Surgical procedure,Augmented_Pleural effusion risk factors (1-5),Augmented_Pleural disease risk factors (1-10),augmenting_query,augmenting_output
0,0,0,P102R108387,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,train,False,F,...,True,True,True,True,True,True,4,8,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,..."
1,1,1,P102R379837,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,val,False,F,...,True,True,True,True,True,True,4,8,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,..."
2,2,2,P102R558314,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,val,False,F,...,True,True,True,True,True,True,4,8,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,..."
3,3,3,P102R765317,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,train,False,F,...,False,False,False,False,False,False,3,5,You are a clinical expert. With following extr...,"{""Shortness of breath"": false, ""Fatigue"": fals..."
4,4,4,P102R915878,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,train,False,F,...,True,True,True,True,True,True,4,8,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,42,898,P300R206615,44572c6e-62135033-45ea19ba-cbea2777-a81fce81,19893236,31661301.0,59220136,train,False,F,...,False,False,False,False,False,False,3,5,You are a clinical expert. With following extr...,"{""Shortness of breath"": false, ""Fatigue"": fals..."
725,43,899,P300R611251,2b20dcdf-4077bc16-48fc8eb5-265ef218-f6552cb0,19906407,33352559.0,57296330,test,False,M,...,False,False,False,False,False,False,3,5,You are a clinical expert. With following extr...,"{""Shortness of breath"": false, ""Fatigue"": fals..."
726,44,900,P300R519683,92134f99-0e73faba-1280ad81-218c68ba-933a85c5,19907884,39112538.0,57427881,train,False,F,...,False,False,False,False,False,False,3,5,You are a clinical expert. With following extr...,"{""Shortness of breath"": true, ""Fatigue"": true,..."
727,45,901,P300R749452,c6db0413-f3266e66-031e9892-2809b536-c13cf9f2,19907884,31819342.0,59325966,train,False,F,...,False,False,False,False,False,False,3,5,You are a clinical expert. With following extr...,"{""Shortness of breath"": false, ""Fatigue"": fals..."


In [3]:
# Show the matched chunk files in the order produced by the numeric sort on
# the index embedded in each file name.
sorted_files

['combined-augmented_to22new.csv',
 'combined-augmented_to41new.csv',
 'combined-augmented_to73new.csv',
 'combined-augmented_to92new.csv',
 'combined-augmented_to102new.csv',
 'combined-augmented_to124new.csv',
 'combined-augmented_to185new.csv',
 'combined-augmented_to192new.csv',
 'combined-augmented_to252new.csv',
 'combined-augmented_to289new.csv',
 'combined-augmented_to303new.csv',
 'combined-augmented_to350new.csv',
 'combined-augmented_to408new.csv',
 'combined-augmented_to414new.csv',
 'combined-augmented_to464new.csv',
 'combined-augmented_to527new.csv',
 'combined-augmented_to658new.csv',
 'combined-augmented_to700new.csv',
 'combined-augmented_to752new.csv',
 'combined-augmented_to799new.csv']

In [1]:
import os
import pandas as pd
import re

# Merge the chunked "combined-augmented_to<N>_nodb.csv" files into one CSV.

# Directory containing the CSV files
directory = '.'

# Chunk files are named after the index they end at; group 1 captures it.
pattern = re.compile(r'combined-augmented_to(\d+)_nodb\.csv')

# Get all files in the directory
files = os.listdir(directory)

# Pair every chunk file with its ending index. fullmatch() (rather than
# match()) rejects names that merely start with the pattern, such as
# "combined-augmented_to22_nodb.csv.bak".
indexed_files = []
for name in files:
    hit = pattern.fullmatch(name)
    if hit is not None:
        indexed_files.append((int(hit.group(1)), name))

# Order numerically by ending index; sorted() is stable, so ties keep their
# listing order.
sorted_files = [name for _, name in sorted(indexed_files, key=lambda pair: pair[0])]

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" error.
if not sorted_files:
    raise FileNotFoundError(
        "No files matching {!r} found in {!r}".format(
            pattern.pattern, os.path.abspath(directory)
        )
    )

# Read each chunk in order
dataframes = [pd.read_csv(os.path.join(directory, name)) for name in sorted_files]

# Concatenate all chunks into a single frame with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a new CSV file
combined_df.to_csv('combined_augmented_all_nodb.csv', index=False)

print("CSV files combined successfully into 'combined_augmented_all_nodb.csv'")

CSV files combined successfully into 'combined_augmented_all.csv'


In [None]:
# for mistral with DB

import os
import pandas as pd
import re

# Merge the chunked "augmented-mistral-to<N>.csv" files into one CSV.

# Directory containing the CSV files
directory = '.'

# Chunk files look like "augmented-mistral-to185.csv"; group 1 captures the
# index the chunk ends at.
pattern = re.compile(r'augmented-mistral-to(\d+)\.csv')

# Single source of truth for the output name, so the success message below
# cannot drift away from the file that is actually written.
output_file = 'combined_augmented_mistral.csv'

# Get all files in the directory
files = os.listdir(directory)

# Filter and sort files numerically by their ending index. fullmatch() is
# used so that names with trailing text (e.g. "...to185.csv.bak") are skipped.
sorted_files = sorted(
    (name for name in files if pattern.fullmatch(name)),
    key=lambda name: int(pattern.fullmatch(name).group(1)),
)

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" error.
if not sorted_files:
    raise FileNotFoundError(
        "No files matching {!r} found in {!r}".format(
            pattern.pattern, os.path.abspath(directory)
        )
    )

# Read each CSV file in order
dataframes = [pd.read_csv(os.path.join(directory, name)) for name in sorted_files]

# Concatenate all dataframes
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a new CSV file
combined_df.to_csv(output_file, index=False)

# The message previously named 'combined_augmented_all_nodb.csv', which is not
# the file this cell writes.
print("CSV files combined successfully into '{}'".format(output_file))

In [9]:
# Reload the combined Mistral output from disk (same file name that the
# "mistral with DB" merge cell writes).
combined_df = pd.read_csv('combined_augmented_mistral.csv')

In [11]:
# Scratch arithmetic: 679 is len(combined_df) for the reloaded Mistral file;
# 799 is presumably the expected total number of rows — TODO confirm.
799-679

120

In [10]:
# Number of rows in combined_df.
len(combined_df)

679

In [4]:
# for mistral without DB

import os
import pandas as pd
import re

# Merge the chunked "augmented-mistral-nodb-to<N>.csv" files into one CSV.

# Directory containing the CSV files
directory = '.'

# Chunk files look like "augmented-mistral-nodb-to185.csv"; group 1 captures
# the index the chunk ends at.
pattern = re.compile(r'augmented-mistral-nodb-to(\d+)\.csv')

# Single source of truth for the output name, so the success message below
# cannot drift away from the file that is actually written.
output_file = 'combined_augmented_mistral_nodb.csv'

# Get all files in the directory
files = os.listdir(directory)

# Filter and sort files numerically by their ending index. fullmatch() is
# used so that names with trailing text (e.g. "...to185.csv.bak") are skipped.
sorted_files = sorted(
    (name for name in files if pattern.fullmatch(name)),
    key=lambda name: int(pattern.fullmatch(name).group(1)),
)

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" error.
if not sorted_files:
    raise FileNotFoundError(
        "No files matching {!r} found in {!r}".format(
            pattern.pattern, os.path.abspath(directory)
        )
    )

# Read each CSV file in order
dataframes = [pd.read_csv(os.path.join(directory, name)) for name in sorted_files]

# Concatenate all dataframes
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a new CSV file
combined_df.to_csv(output_file, index=False)

# The message previously named 'combined_augmented_all_nodb.csv', which is not
# the file this cell writes.
print("CSV files combined successfully into '{}'".format(output_file))

CSV files combined successfully into 'combined_augmented_all_nodb.csv'


In [7]:
# Difference between 799 (presumably the expected total number of rows —
# TODO confirm) and the number of rows actually combined.
799- len(combined_df)

228

In [None]:
# Mistral 7B: 799 minus the number of combined rows (computed in the cells above)
# 120 -> with DB
# 228 -> no db



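Difference between 799 and the number of combined rows, per model (the Mistral values are computed in the cells above):

| Model | Using Storage | Not Using Storage |
|---|---|---|
| Llama 3.1 8B | 0 | 0 |
| Mistral 7B | 120 | 228 |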