In [56]:
import tensorflow as tf
import pandas as pd


data = pd.read_csv(r'C:\Users\zrzr_\OneDrive\桌面\clean_ind\indiana_reports.csv')


# Check for unique values in the "Problems" column
unique_problems = data['Problems'].nunique()

# Display the unique values in the "Problems" column
unique_values_problems = data['Problems'].unique()
unique_values_problems



array(['normal', 'Cardiomegaly;Pulmonary Artery',
       'Pulmonary Disease, Chronic Obstructive;Bullous Emphysema;Pulmonary Fibrosis;Cicatrix;Opacity;Opacity;Opacity;Opacity',
       ..., 'Cardiomegaly;Pulmonary Congestion;Heart Failure',
       'Lung;Diaphragm;Cicatrix;Pulmonary Atelectasis;Pulmonary Emphysema',
       'Opacity;Granuloma'], dtype=object)

In [57]:


# Display the columns in the dataset to confirm "Problems" column exists
data.columns

# Split the unique values in "Problems" column where multiple words are combined with semicolons
split_problems = [item.split(';') for item in data['Problems'].unique()]

# Flatten the list to get all individual words
flat_list = [item for sublist in split_problems for item in sublist]

# Get the unique values from the flattened list
unique_terms = set(flat_list)
unique_terms_sorted = sorted(unique_terms)

# Define categories
categories = {
    "normal":['normal'
              ],
    "Anatomical Structures": [
        'Heart', 'Cardiac Shadow', 'Pulmonary Artery', 'Aorta', 'Aorta, Thoracic', 
        'Thoracic Vertebrae', 'Diaphragm', 'Trachea', 'Trachea, Carina', 'Ribs', 
        'Lung', 'Thorax', 'Bone and Bones', 'Cervical Vertebrae', 'Lumbar Vertebrae', 
        'Spine', 'Shoulder', 'Osteophyte', 'Spondylosis', 'Adipose Tissue', 'Pleura', 
        'Subcutaneous Emphysema', 'Abdomen', 'Lymph Nodes', 'Blood Vessels',
        'Heart Atria', 'Heart Ventricles'
    ],
    "Pulmonary Diseases": [
        'Pulmonary Disease, Chronic Obstructive', 'Pulmonary Atelectasis', 
        'Pulmonary Congestion', 'Pulmonary Edema', 'Pulmonary Emphysema', 
        'Pulmonary Fibrosis', 'Pneumonia', 'Pneumothorax', 'Bronchiectasis', 
        'Bronchiolitis', 'Bronchitis', 'Cystic Fibrosis', 'Sarcoidosis', 'Lung Diseases, Interstitial',
        'Bullous Emphysema'
    ],
    "Cardiovascular Diseases": [
        'Heart Failure', 'Hypertension, Pulmonary', 'Aortic Aneurysm', 'Atherosclerosis',
        'Cardiomegaly', 'Epicardial Fat'
    ],
    "Musculoskeletal Conditions": [
        'Kyphosis', 'Scoliosis', 'Osteoporosis', 'Spinal Fusion', 'Arthritis', 
        'Spondylosis', 'Dislocations', 'Fractures, Bone', 'Sclerosis'
    ],
    "Neoplasms and Tumors": [
        'Granuloma', 'Granulomatous Disease', 'Nodule', 'Mass', 'Cavitation',
        'Cysts'
    ],
    "Metabolic and Endocrine Disorders": [
        'Bone Diseases, Metabolic', 'Hyperostosis, Diffuse Idiopathic Skeletal'
    ],
    "Infections and Inflammations": [
        'Tuberculosis', 'Infiltrate', 'Hemothorax', 'Hemopneumothorax', 'Fibrosis'
    ],
    "Skin and Subcutaneous Conditions": [
        'Cicatrix', 'Calcinosis', 'Calcified Granuloma', 'Blister', 'Subcutaneous Emphysema'
    ],
    "Hernias": [
        'Hernia, Diaphragmatic', 'Hernia, Hiatal', 'Pectus Carinatum', 'Funnel Chest'
    ],
    "Miscellaneous Conditions": [
        'Emphysema', 'Pneumonectomy', 'Pneumoperitoneum', 'Hydropneumothorax', 
        'Hypovolemia', 'Volume Loss', 'Deformity', 'Airspace Disease', 'Colonic Interposition',
        'Mastectomy'
    ],
    "Medical Devices and Implants": [
        'Implanted Medical Device', 'Breast Implants', 'Stents', 'Surgical Instruments', 
        'Catheters, Indwelling', 'Tube, Inserted', 'Sutures'
    ],
    "Imaging and Contrast": [
        'Contrast Media', 'Nipple Shadow', 'Technical Quality of Image Unsatisfactory'
    ],
    "Radiological and Imaging Terms": [
        'Opacity', 'Lucency', 'Density', 'Costophrenic Angle', 'Sulcus', 'Shift',
        'Pleural Effusion', 'Consolidation', 'Thickening', 'No Indexing', 'Markings'
    ]
}

# Function to label each term
def label_term(term, categories):
    labels = []
    for category, terms in categories.items():
        if term in terms:
            labels.append(category)
    return labels

# Label each term
labeled_terms = {term: label_term(term, categories) for term in flat_list}

# Assign missing terms to their appropriate categories
missing_terms = {term: label_term(term, categories) for term in flat_list if not labeled_terms[term]}

# Create a function to replace terms with their corresponding labels
def replace_with_labels(problems, labeled_terms):
    new_problems = []
    for problem in problems:
        labels = []
        terms = problem.split(';')
        for term in terms:
            if term in labeled_terms:
                if labeled_terms[term]:
                    labels.extend(labeled_terms[term])
                else:
                    labels.append(term)  # If no label, keep the term
        new_problems.append(';'.join(set(labels)))  # Use set to avoid duplicate labels
    return new_problems

# Replace the terms in the "Problems" column with their labels
data['Problems_Labeled'] = replace_with_labels(data['Problems'], labeled_terms)

# Mapping of categories to numerical labels
category_to_number = {
    "normal":0,
    "Anatomical Structures": 1,
    "Pulmonary Diseases": 2,
    "Cardiovascular Diseases": 3,
    "Musculoskeletal Conditions": 4,
    "Neoplasms and Tumors": 5,
    "Metabolic and Endocrine Disorders": 6,
    "Infections and Inflammations": 7,
    "Skin and Subcutaneous Conditions": 8,
    "Hernias": 9,
    "Miscellaneous Conditions": 10,
    "Medical Devices and Implants": 11,
    "Imaging and Contrast": 12,
    "Radiological and Imaging Terms": 13
}

# Function to encode the labels into numbers
def encode_labels(problems_labeled, category_to_number):
    encoded_problems = []
    for labels in problems_labeled:
        encoded = []
        for label in labels.split(';'):
            if label in category_to_number:
                encoded.append(str(category_to_number[label]))
        encoded_problems.append(';'.join(set(encoded)))  # Use set to avoid duplicate labels
    return encoded_problems

# Encode the labeled problems
data['Problems_Encoded'] = encode_labels(data['Problems_Labeled'], category_to_number)

# Display the updated dataframe with the encoded "Problems" column
data[['Problems', 'Problems_Labeled', 'Problems_Encoded']].head()




Unnamed: 0,Problems,Problems_Labeled,Problems_Encoded
0,normal,normal,0
1,Cardiomegaly;Pulmonary Artery,Anatomical Structures;Cardiovascular Diseases,3;1
2,normal,normal,0
3,"Pulmonary Disease, Chronic Obstructive;Bullous...",Radiological and Imaging Terms;Pulmonary Disea...,13;8;2
4,Osteophyte;Thickening;Lung,Anatomical Structures;Radiological and Imaging...,13;1


In [58]:
# Merge the content of the 'findings' and 'impressions' columns
data['notes'] = data['findings'].fillna('') + " " + data['impression'].fillna('')


In [59]:
modified_text = data[['uid', 'notes','Problems_Encoded']]
modified_text.rename(columns={'Problems_Encoded': 'labels'}, inplace=True)

modified_text.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,uid,notes,labels
0,1,The cardiac silhouette and mediastinum size ar...,0
1,2,Borderline cardiomegaly. Midline sternotomy XX...,3;1
2,3,"No displaced rib fractures, pneumothorax, or ...",0
3,4,There are diffuse bilateral interstitial and a...,13;8;2
4,5,The cardiomediastinal silhouette and pulmonary...,13;1


In [65]:
import os

# Define the directory and file path
directory = r'C:\Users\zrzr_\OneDrive\桌面\CLEAN_IND'
file_path = os.path.join(directory, 'modified_text.csv')

# Create the directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)

# Save the file in the specified directory with the name "modified_text.csv"
modified_text.to_csv(file_path, index=False)
