In [12]:
import os
import shutil
import pandas as pd
import json

In [14]:
# Folder holding one sub-directory of CSV splits per classification task.
datasets_directory = "als_datasets"
# Destination folder for the generated JSONL files.
output_directory = "../LLM/emg_datasets"

# Create the destination up front; an already existing folder is not an error.
os.makedirs(name=output_directory, exist_ok=True)


In [3]:
def dataset_formatting(dataframe,columns_to_drop,classification_rules):
    """Convert a table of measurements into prompt/completion records.

    Every row of ``dataframe`` (after removing ``columns_to_drop``) becomes a
    dict with two keys:

    * ``"prompt"``: the text ``"Quantitative EMG: "`` followed by
      ``"<column> = <value>"`` pairs, joined with ``", "``, for every
      remaining column except ``"Class"``.
    * ``"completion"``: ``classification_rules[int(row["Class"])]``.

    Args:
        dataframe: pandas DataFrame that contains a ``"Class"`` column.
        columns_to_drop: column labels removed before the prompts are built.
        classification_rules: labels indexed by the integer class id.

    Returns:
        A list of ``{"prompt": ..., "completion": ...}`` dicts, one per row,
        in row order.

    Raises:
        KeyError: if a label in ``columns_to_drop`` or the ``"Class"`` column
            is missing.
        IndexError: if a class id is negative or past the end of
            ``classification_rules``.
    """
    dataset = []
    # iterrows() hands back each row as a Series; values are rendered exactly
    # as that Series holds them, which keeps the generated text stable.
    for _, row in dataframe.drop(columns=columns_to_drop).iterrows():
        class_id = int(row["Class"])
        # A negative id would silently wrap around to a label counted from
        # the end of the list; reject it instead of emitting a wrong label.
        if class_id < 0:
            raise IndexError(
                f"Class id {class_id} is negative; expected a non-negative "
                "index into classification_rules"
            )

        # join() only places separators between pairs, so there is nothing to
        # trim afterwards and the header stays intact when no feature
        # columns remain.
        features = ", ".join(
            f"{key} = {value}" for key, value in row.items() if key != "Class"
        )
        dataset.append(
            {
                "prompt": "Quantitative EMG: " + features,
                "completion": classification_rules[class_id],
            }
        )

    return dataset

In [4]:
def save_jsonl(data, filename):
    """Write ``data`` to ``filename`` as JSON Lines.

    Each element of ``data`` is serialised with ``json.dumps`` and written on
    its own line. The file is opened in write mode with UTF-8 encoding, so any
    existing content is replaced.
    """
    with open(filename, mode='w', encoding='utf-8') as handle:
        handle.writelines(json.dumps(record) + "\n" for record in data)

In [16]:
# "control_vs_als_dataset": read the train/val/test CSV splits, turn every row
# into a prompt/completion record, and write one JSONL file per split.
control_vs_als_dataset = os.path.join(datasets_directory, "control_vs_als_dataset")
control_vs_als_jsonl = os.path.join(output_directory, "control_vs_als_dataset")

# All three splits are read before anything is formatted or written.
train_df, val_df, test_df = [
    pd.read_csv(f"{control_vs_als_dataset}/{_split}.csv")
    for _split in ("train", "val", "test")
]

columns_to_drop = ["Patient"]
# The position in this list is the integer found in the "Class" column.
classification_rules = ["Healthy", "ALS"]

train_json, val_json, test_json = [
    dataset_formatting(_frame, columns_to_drop, classification_rules)
    for _frame in (train_df, val_df, test_df)
]

os.makedirs(control_vs_als_jsonl, exist_ok=True)
for _split, _records in (("train", train_json), ("val", val_json), ("test", test_json)):
    save_jsonl(_records, f"{control_vs_als_jsonl}/{_split}.jsonl")


In [17]:
# "control_vs_myopathy_dataset": read the train/val/test CSV splits, turn every
# row into a prompt/completion record, and write one JSONL file per split.
control_vs_myopathy_dataset = os.path.join(datasets_directory, "control_vs_myopathy_dataset")
control_vs_myopathy_jsonl = os.path.join(output_directory, "control_vs_myopathy_dataset")

# All three splits are read before anything is formatted or written.
train_df, val_df, test_df = [
    pd.read_csv(f"{control_vs_myopathy_dataset}/{_split}.csv")
    for _split in ("train", "val", "test")
]

columns_to_drop = ["Patient"]
# The position in this list is the integer found in the "Class" column.
classification_rules = ["Healthy", "Myopathy"]

train_json, val_json, test_json = [
    dataset_formatting(_frame, columns_to_drop, classification_rules)
    for _frame in (train_df, val_df, test_df)
]

os.makedirs(control_vs_myopathy_jsonl, exist_ok=True)
for _split, _records in (("train", train_json), ("val", val_json), ("test", test_json)):
    save_jsonl(_records, f"{control_vs_myopathy_jsonl}/{_split}.jsonl")


In [19]:
# "als_vs_myopathy_dataset": read the train/val/test CSV splits, turn every row
# into a prompt/completion record, and write one JSONL file per split.
als_vs_myopathy_dataset = os.path.join(datasets_directory, "als_vs_myopathy_dataset")
als_vs_myopathy_jsonl = os.path.join(output_directory, "als_vs_myopathy_dataset")

# All three splits are read before anything is formatted or written.
train_df, val_df, test_df = [
    pd.read_csv(f"{als_vs_myopathy_dataset}/{_split}.csv")
    for _split in ("train", "val", "test")
]

columns_to_drop = ["Patient"]
# The position in this list is the integer found in the "Class" column.
classification_rules = ["Myopathy", "ALS"]

train_json, val_json, test_json = [
    dataset_formatting(_frame, columns_to_drop, classification_rules)
    for _frame in (train_df, val_df, test_df)
]

os.makedirs(als_vs_myopathy_jsonl, exist_ok=True)
for _split, _records in (("train", train_json), ("val", val_json), ("test", test_json)):
    save_jsonl(_records, f"{als_vs_myopathy_jsonl}/{_split}.jsonl")


In [20]:
# "control_vs_als_vs_myopathy_dataset" (three labels): read the train/val/test
# CSV splits, turn every row into a prompt/completion record, and write one
# JSONL file per split.
control_vs_als_vs_myopathy_dataset = os.path.join(
    datasets_directory, "control_vs_als_vs_myopathy_dataset"
)
control_vs_als_vs_myopathy_jsonl = os.path.join(
    output_directory, "control_vs_als_vs_myopathy_dataset"
)

# All three splits are read before anything is formatted or written.
train_df, val_df, test_df = [
    pd.read_csv(f"{control_vs_als_vs_myopathy_dataset}/{_split}.csv")
    for _split in ("train", "val", "test")
]

columns_to_drop = ["Patient"]
# The position in this list is the integer found in the "Class" column.
classification_rules = ["Healthy", "ALS", "Myopathy"]

train_json, val_json, test_json = [
    dataset_formatting(_frame, columns_to_drop, classification_rules)
    for _frame in (train_df, val_df, test_df)
]

os.makedirs(control_vs_als_vs_myopathy_jsonl, exist_ok=True)
for _split, _records in (("train", train_json), ("val", val_json), ("test", test_json)):
    save_jsonl(_records, f"{control_vs_als_vs_myopathy_jsonl}/{_split}.jsonl")




In [21]:
# "als_vs_control_and_myopathy_dataset": read the train/val/test CSV splits,
# turn every row into a prompt/completion record, and write one JSONL file per
# split.
als_vs_control_and_myopathy_dataset = os.path.join(
    datasets_directory, "als_vs_control_and_myopathy_dataset"
)
als_vs_control_and_myopathy_jsonl = os.path.join(
    output_directory, "als_vs_control_and_myopathy_dataset"
)

# All three splits are read before anything is formatted or written.
train_df, val_df, test_df = [
    pd.read_csv(f"{als_vs_control_and_myopathy_dataset}/{_split}.csv")
    for _split in ("train", "val", "test")
]

columns_to_drop = ["Patient"]
# The position in this list is the integer found in the "Class" column.
classification_rules = ["NON-ALS", "ALS"]

train_json, val_json, test_json = [
    dataset_formatting(_frame, columns_to_drop, classification_rules)
    for _frame in (train_df, val_df, test_df)
]

os.makedirs(als_vs_control_and_myopathy_jsonl, exist_ok=True)
for _split, _records in (("train", train_json), ("val", val_json), ("test", test_json)):
    save_jsonl(_records, f"{als_vs_control_and_myopathy_jsonl}/{_split}.jsonl")




In [22]:
# "control_vs_als_and_myopathy_dataset": read the train/val/test CSV splits,
# turn every row into a prompt/completion record, and write one JSONL file per
# split.
control_vs_als_and_myopathy_dataset = os.path.join(
    datasets_directory, "control_vs_als_and_myopathy_dataset"
)
control_vs_als_and_myopathy_jsonl = os.path.join(
    output_directory, "control_vs_als_and_myopathy_dataset"
)

# All three splits are read before anything is formatted or written.
train_df, val_df, test_df = [
    pd.read_csv(f"{control_vs_als_and_myopathy_dataset}/{_split}.csv")
    for _split in ("train", "val", "test")
]

columns_to_drop = ["Patient"]
# The position in this list is the integer found in the "Class" column.
# NOTE(review): here class 1 maps to "Healthy" — confirm against the CSV encoding.
classification_rules = ["NON-Healthy", "Healthy"]

train_json, val_json, test_json = [
    dataset_formatting(_frame, columns_to_drop, classification_rules)
    for _frame in (train_df, val_df, test_df)
]

os.makedirs(control_vs_als_and_myopathy_jsonl, exist_ok=True)
for _split, _records in (("train", train_json), ("val", val_json), ("test", test_json)):
    save_jsonl(_records, f"{control_vs_als_and_myopathy_jsonl}/{_split}.jsonl")



In [23]:
# "myopathy_vs_control_and_als_dataset": read the train/val/test CSV splits,
# turn every row into a prompt/completion record, and write one JSONL file per
# split.
myopathy_vs_control_and_als_dataset = os.path.join(
    datasets_directory, "myopathy_vs_control_and_als_dataset"
)
myopathy_vs_control_and_als_jsonl = os.path.join(
    output_directory, "myopathy_vs_control_and_als_dataset"
)

# All three splits are read before anything is formatted or written.
train_df, val_df, test_df = [
    pd.read_csv(f"{myopathy_vs_control_and_als_dataset}/{_split}.csv")
    for _split in ("train", "val", "test")
]

columns_to_drop = ["Patient"]
# The position in this list is the integer found in the "Class" column.
classification_rules = ["NON-Myopathy", "Myopathy"]

train_json, val_json, test_json = [
    dataset_formatting(_frame, columns_to_drop, classification_rules)
    for _frame in (train_df, val_df, test_df)
]

os.makedirs(myopathy_vs_control_and_als_jsonl, exist_ok=True)
for _split, _records in (("train", train_json), ("val", val_json), ("test", test_json)):
    save_jsonl(_records, f"{myopathy_vs_control_and_als_jsonl}/{_split}.jsonl")

