In [None]:
# Make Google Drive visible to this Colab runtime; the dataset paths used
# by the following cells live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

# Input dataset and destination for the annotated copy.
file_path = '/content/drive/MyDrive/P2/T1.2/Dataset/LLM-train.csv'
output_file_path = '/content/drive/MyDrive/P2/T1.2/Dataset/LLM-train3.csv'

# Columns produced by this cell; they are never treated as symptom columns.
DERIVED_COLUMNS = ["symptom_combination", "conflicting_diseases"]


def annotate_conflicting_diseases(frame, label_column="disease_label"):
    """Return a copy of ``frame`` with two derived columns appended.

    ``symptom_combination``
        Comma-separated names of the symptom columns whose value equals 1
        in that row ("" when none do).
    ``conflicting_diseases``
        For rows whose symptom combination also occurs on a row that is an
        exact duplicate (over all symptom columns) of another row: the
        distinct labels of those duplicated rows, in order of first
        appearance, joined with ' > '.  "" for every other row.

    Every column other than ``label_column`` and the two derived columns is
    treated as a symptom column.  Previously derived columns in ``frame`` are
    discarded and recomputed, so the function is idempotent.

    The result has exactly one row per input row, in the input order and
    with the input index.  (Joining the full duplicate table back with a
    merge would instead emit k*k rows for every group of k duplicates.)
    """
    # drop() returns a new frame, so the caller's frame is never mutated.
    annotated = frame.drop(columns=DERIVED_COLUMNS, errors="ignore")
    symptom_cols = [col for col in annotated.columns if col != label_column]

    # Row-wise list of the symptom columns flagged with 1.
    is_present = annotated[symptom_cols].eq(1).to_numpy()
    annotated["symptom_combination"] = [
        ', '.join(str(col) for col, hit in zip(symptom_cols, flags) if hit)
        for flags in is_present
    ]

    # Rows whose full symptom vector occurs more than once.
    is_duplicate = annotated.duplicated(subset=symptom_cols, keep=False)
    duplicated_rows = annotated.loc[is_duplicate]

    # One entry per combination; str() keeps the join safe for non-string
    # labels (ints, NaN).
    conflicts_by_combination = {
        combination: ' > '.join(map(str, labels.unique()))
        for combination, labels in duplicated_rows.groupby(
            "symptom_combination", sort=False
        )[label_column]
    }

    # A plain lookup keeps the row count fixed and needs no NaN filling.
    annotated["conflicting_diseases"] = [
        conflicts_by_combination.get(combination, "")
        for combination in annotated["symptom_combination"]
    ]
    return annotated


# In a notebook kernel __name__ is "__main__", so this runs as usual; the
# guard only lets the helper above be imported without touching Drive.
if __name__ == "__main__":
    raw_data = pd.read_csv(file_path)
    data = annotate_conflicting_diseases(raw_data)

    # Kept as globals for interactive inspection.
    symptoms_columns = [
        col for col in data.columns
        if col not in ("disease_label", *DERIVED_COLUMNS)
    ]
    duplicates = data[data.duplicated(subset=symptoms_columns, keep=False)].copy()

    assert len(data) == len(raw_data), "annotation must not add or drop rows"

    # Save the updated dataset
    data.to_csv(output_file_path, index=False)

    print(f"Processed dataset with overlapping symptom combinations saved to {output_file_path}.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["conflicting_diseases"].fillna("", inplace=True)


In [None]:
import pandas as pd

# Input dataset and destination for the annotated copy.
# NOTE: the destination is the input file itself, so this cell rewrites its
# own input.  The helper below discards previously derived columns first,
# which keeps a second run from treating them as symptoms.
file_path = '/content/drive/MyDrive/P2/T1.2/Dataset/ML_test.csv'
output_file_path = '/content/drive/MyDrive/P2/T1.2/Dataset/ML_test.csv'

# Columns produced by this cell; they are never treated as symptom columns.
DERIVED_COLUMNS = ["symptom_combination", "conflicting_diseases"]


def annotate_conflicting_diseases(frame, label_column="disease_label"):
    """Return a copy of ``frame`` with two derived columns appended.

    ``symptom_combination``
        Comma-separated names of the symptom columns whose value equals 1
        in that row ("" when none do).
    ``conflicting_diseases``
        For rows whose symptom combination also occurs on a row that is an
        exact duplicate (over all symptom columns) of another row: the
        distinct labels of those duplicated rows, in order of first
        appearance, joined with ' > '.  "" for every other row.

    Every column other than ``label_column`` and the two derived columns is
    treated as a symptom column.  Previously derived columns in ``frame`` are
    discarded and recomputed, so the function is idempotent.

    The result has exactly one row per input row, in the input order and
    with the input index.  (Joining the full duplicate table back with a
    merge would instead emit k*k rows for every group of k duplicates.)
    """
    # drop() returns a new frame, so the caller's frame is never mutated.
    annotated = frame.drop(columns=DERIVED_COLUMNS, errors="ignore")
    symptom_cols = [col for col in annotated.columns if col != label_column]

    # Row-wise list of the symptom columns flagged with 1.
    is_present = annotated[symptom_cols].eq(1).to_numpy()
    annotated["symptom_combination"] = [
        ', '.join(str(col) for col, hit in zip(symptom_cols, flags) if hit)
        for flags in is_present
    ]

    # Rows whose full symptom vector occurs more than once.
    is_duplicate = annotated.duplicated(subset=symptom_cols, keep=False)
    duplicated_rows = annotated.loc[is_duplicate]

    # One entry per combination; str() keeps the join safe for non-string
    # labels (ints, NaN).
    conflicts_by_combination = {
        combination: ' > '.join(map(str, labels.unique()))
        for combination, labels in duplicated_rows.groupby(
            "symptom_combination", sort=False
        )[label_column]
    }

    # A plain lookup keeps the row count fixed and needs no NaN filling.
    annotated["conflicting_diseases"] = [
        conflicts_by_combination.get(combination, "")
        for combination in annotated["symptom_combination"]
    ]
    return annotated


# In a notebook kernel __name__ is "__main__", so this runs as usual; the
# guard only lets the helper above be imported without touching Drive.
if __name__ == "__main__":
    raw_data = pd.read_csv(file_path)
    data = annotate_conflicting_diseases(raw_data)

    # Kept as globals for interactive inspection.
    symptoms_columns = [
        col for col in data.columns
        if col not in ("disease_label", *DERIVED_COLUMNS)
    ]
    duplicates = data[data.duplicated(subset=symptoms_columns, keep=False)].copy()

    assert len(data) == len(raw_data), "annotation must not add or drop rows"

    # Save the updated dataset
    data.to_csv(output_file_path, index=False)

    print(f"Processed dataset with overlapping symptom combinations saved to {output_file_path}.")
