## 1. Importing main libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Extracting the data from files

In [14]:
import os
import pandas as pd
import re

folder_path = r"/content/drive/MyDrive/TDSP/New folder"  # Update your path

# Store extracted data
all_data = []

# Define expected section headers to avoid false column splits
expected_headers = {"EXAMINATION","HISTORY", "COMPARISON", "FINDINGS", "IMPRESSION", "TECHNIQUE"}

# Recursively search for .txt files in all subfolders
for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        if file_name.endswith(".txt"):  # Process only .txt files
            file_path = os.path.join(root, file_name)

            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

            # Process data
            structured_data = {}
            current_key = None

            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines

                # Match first occurrence of `:` but only if the key is in expected headers
                match = re.match(r"([^:]+):\s*(.*)", line)

                if match:
                    key = match.group(1).strip()
                    value = match.group(2).strip()

                    # Only process valid headers, otherwise append to previous key
                    if key in expected_headers:
                        structured_data[key] = value
                        current_key = key  # Set current key for multiline values
                    else:
                        if current_key:  # Append to previous section
                            structured_data[current_key] += " " + line.strip()
                elif current_key:  # Handle multiline values
                    structured_data[current_key] += " " + line.strip()

            # Add filename and folder path as identifiers
            structured_data["Filename"] = file_name
            structured_data["Folder_Path"] = root  # Store folder where file was found

            # Append structured data to list
            all_data.append(structured_data)

# Convert list of dictionaries to a DataFrame
df1 = pd.DataFrame(all_data)

# Ensure all required columns exist (even if missing in some reports)
for col in expected_headers:
    if col not in df1.columns:
        df1[col] = None  # Fill missing columns with None

# Display DataFrame
print(df1.head())

# Optional: Save to CSV
# df1.to_csv("structured_reports_1.csv", index=False)


                                          COMPARISON       Filename  \
0  ___. Normal chest radiograph, no evidence of p...  s55608920.txt   
1                                                NaN  s58054149.txt   
2                                                NaN  s52163036.txt   
3              Chest radiograph ___ and chest CT ___  s54573731.txt   
4                                                NaN  s55969846.txt   

                                        Folder_Path  \
0  /content/drive/MyDrive/TDSP/New folder/p10002013   
1  /content/drive/MyDrive/TDSP/New folder/p10002013   
2  /content/drive/MyDrive/TDSP/New folder/p10002013   
3  /content/drive/MyDrive/TDSP/New folder/p10002013   
4  /content/drive/MyDrive/TDSP/New folder/p10002013   

                                            FINDINGS  \
0                                                NaN   
1  A moderate left pleural effusion is new since ...   
2                                                NaN   
3   Patient is sta

## Importing biobert libraies

In [15]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Use a pre-trained model that is fine-tuned for medical NER (for example, ClinicalBERT)
model_name = "dmis-lab/biobert-large-cased-v1.1"  # Ensure it's the right version fine-tuned for NER
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Initialize a pipeline for Named Entity Recognition (NER)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Example text: You can replace this with your medical text
text = """
The patient was diagnosed with diabetes and hypertension.
He was prescribed Metformin and Lisinopril for treatment.
"""

# Use the NER pipeline to extract medical entities
entities = ner_pipeline(text)

# Print the extracted entities
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Score: {entity['score']}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-large-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entity: the, Label: LABEL_0, Score: 0.5319763422012329
Entity: patient, Label: LABEL_0, Score: 0.5692000985145569
Entity: was, Label: LABEL_1, Score: 0.5456759929656982
Entity: diagnosed, Label: LABEL_1, Score: 0.5377629995346069
Entity: with, Label: LABEL_1, Score: 0.5790398120880127
Entity: diabetes, Label: LABEL_1, Score: 0.5790342688560486
Entity: and, Label: LABEL_0, Score: 0.5814945697784424
Entity: hypertension, Label: LABEL_1, Score: 0.7008669376373291
Entity: ., Label: LABEL_0, Score: 0.5816124677658081
Entity: he, Label: LABEL_1, Score: 0.5568948984146118
Entity: was, Label: LABEL_1, Score: 0.540989339351654
Entity: prescribed, Label: LABEL_0, Score: 0.5666303038597107
Entity: met, Label: LABEL_1, Score: 0.6086208820343018
Entity: ##form, Label: LABEL_1, Score: 0.5528942942619324
Entity: ##in, Label: LABEL_0, Score: 0.5001283884048462
Entity: and, Label: LABEL_0, Score: 0.5909073352813721
Entity: lis, Label: LABEL_0, Score: 0.5693434476852417
Entity: ##ino, Label: LABEL_1, Sc

## Extracting the medical  features from the dataset using the impression column

In [16]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Publicly available general NER model
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Initialize the NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Define medical conditions and associated keywords
conditions = {
    "has_conso": ["consolidation"],
    "has_edema": ["edema"],
    "has_effusion": ["effusion"],
    "has_tube": ["tube", "intubation"],
    "has_collapse": ["collapse", "atelectasis"],
    "has_pneumothorax": ["pneumothorax"]
}

# NER-based condition detection function
def detect_conditions_with_ner(impression_text):
    if pd.isna(impression_text) or impression_text.strip() == "":
        return pd.Series({column: False for column in conditions})

    result = {column: False for column in conditions}

    try:
        entities = ner_pipeline(impression_text)
    except Exception as e:
        print(f"Error processing text: {impression_text}. Error: {e}")
        return pd.Series(result)

    extracted_terms = [entity['word'].lower() for entity in entities]

    for column, keywords in conditions.items():
        result[column] = any(keyword in extracted_terms for keyword in keywords)

    return pd.Series(result)

# Example DataFrame


# Apply detection
df_conditions = df1["IMPRESSION"].apply(detect_conditions_with_ner)

# Merge and display results
df1 = pd.concat([df1, df_conditions], axis=1)
print(df1.head())


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


                                          COMPARISON       Filename  \
0  ___. Normal chest radiograph, no evidence of p...  s55608920.txt   
1                                                NaN  s58054149.txt   
2                                                NaN  s52163036.txt   
3              Chest radiograph ___ and chest CT ___  s54573731.txt   
4                                                NaN  s55969846.txt   

                                        Folder_Path  \
0  /content/drive/MyDrive/TDSP/New folder/p10002013   
1  /content/drive/MyDrive/TDSP/New folder/p10002013   
2  /content/drive/MyDrive/TDSP/New folder/p10002013   
3  /content/drive/MyDrive/TDSP/New folder/p10002013   
4  /content/drive/MyDrive/TDSP/New folder/p10002013   

                                            FINDINGS  \
0                                                NaN   
1  A moderate left pleural effusion is new since ...   
2                                                NaN   
3   Patient is sta

In [17]:
df1.head()

Unnamed: 0,COMPARISON,Filename,Folder_Path,FINDINGS,IMPRESSION,EXAMINATION,TECHNIQUE,HISTORY,has_conso,has_edema,has_effusion,has_tube,has_collapse,has_pneumothorax
0,"___. Normal chest radiograph, no evidence of p...",s55608920.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,,,,,,False,False,False,False,False,False
1,,s58054149.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,A moderate left pleural effusion is new since ...,New moderate left pleural effusion with adjace...,,,,False,False,False,False,False,False
2,,s52163036.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,,,,,,False,False,False,False,False,False
3,Chest radiograph ___ and chest CT ___,s54573731.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,Patient is status post median sternotomy and ...,No acute cardiopulmonary abnormality.,CHEST (PORTABLE AP) INDICATION: ___ year old ...,Upright AP view of the chest,,False,False,False,False,False,False
4,,s55969846.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,In comparison with the study of earlier in thi...,,,,"Chest tube removal, to assess for pneumothorax.",False,False,False,False,False,False


## Extracting the gender from data

In [18]:
import pandas as pd
import numpy as np

# Sample DataFrame
# df = pd.read_csv("your_file.csv")  # if you're reading from a file
# Assuming the DataFrame has 'EXAMINATION' and 'HISTORY' columns

def extract_gender(text):
    if pd.isnull(text):
        return None
    text = str(text).lower()
    if 'f' in text and any(w in text for w in [' f ','__f ','___f ', '_f ', ' f ', '___f ','female','___F','WOMAN','woman']):
        return 'female'
    if 'm' in text and any(w in text for w in [' m ','__m ','___m','___m ',' m ' '_m ', ' m ', '____m ','___M',' male','MAN', 'man ']):
        return 'male'
    return None

def determine_gender(row):
    # Check the 'EXAMINATION' column first
    gender = extract_gender(row['EXAMINATION'])
    if gender:
        return gender

    # Check the 'FINDINGS' column
    gender = extract_gender(row['FINDINGS'])
    if gender:
        return gender

    # Check the 'IMPRESSION' column
    gender = extract_gender(row['IMPRESSION'])
    if gender:
        return gender

    # If no gender is found, return a default message
    return 'could not find the gender'


def history_flag(history_text):
    return bool(pd.notnull(history_text) and str(history_text).strip())

# Assuming df is your DataFrame
df1['gender'] = df1.apply(determine_gender, axis=1)
df1['history_present'] = df1['HISTORY'].apply(history_flag)



In [19]:
df1.head()

Unnamed: 0,COMPARISON,Filename,Folder_Path,FINDINGS,IMPRESSION,EXAMINATION,TECHNIQUE,HISTORY,has_conso,has_edema,has_effusion,has_tube,has_collapse,has_pneumothorax,gender,history_present
0,"___. Normal chest radiograph, no evidence of p...",s55608920.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,,,,,,False,False,False,False,False,False,could not find the gender,False
1,,s58054149.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,A moderate left pleural effusion is new since ...,New moderate left pleural effusion with adjace...,,,,False,False,False,False,False,False,could not find the gender,False
2,,s52163036.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,,,,,,False,False,False,False,False,False,could not find the gender,False
3,Chest radiograph ___ and chest CT ___,s54573731.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,Patient is status post median sternotomy and ...,No acute cardiopulmonary abnormality.,CHEST (PORTABLE AP) INDICATION: ___ year old ...,Upright AP view of the chest,,False,False,False,False,False,False,female,False
4,,s55969846.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,In comparison with the study of earlier in thi...,,,,"Chest tube removal, to assess for pneumothorax.",False,False,False,False,False,False,could not find the gender,True


## Removing unnecessary columsn

In [20]:
df1.dropna(subset=['IMPRESSION'], inplace=True)

df1 = df1[df1['gender'] != 'could not find the gender']

In [21]:
df1.head()

Unnamed: 0,COMPARISON,Filename,Folder_Path,FINDINGS,IMPRESSION,EXAMINATION,TECHNIQUE,HISTORY,has_conso,has_edema,has_effusion,has_tube,has_collapse,has_pneumothorax,gender,history_present
3,Chest radiograph ___ and chest CT ___,s54573731.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,Patient is status post median sternotomy and ...,No acute cardiopulmonary abnormality.,CHEST (PORTABLE AP) INDICATION: ___ year old ...,Upright AP view of the chest,,False,False,False,False,False,False,female,False
7,___.,s55941092.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,PA and lateral views of the chest provided. ...,No acute intrathoracic process. If there is s...,CHEST (PA AND LAT) INDICATION: History: ___F ...,,,False,False,False,False,False,False,female,False
9,Chest radiograph dated ___.,s53781756.txt,/content/drive/MyDrive/TDSP/New folder/p10002221,"The lungs are clear. No focal consolidation,...",No acute intrathoracic process. No significa...,PA and lateral chest radiograph INDICATION: _...,,,False,False,False,False,False,False,male,False
10,None.,s56373033.txt,/content/drive/MyDrive/TDSP/New folder/p10002157,The lungs are clear without focal consolidati...,No acute cardiopulmonary process.,Chest: Frontal and lateral views INDICATION:...,Chest: Frontal and Lateral,,False,False,False,False,False,False,male,False
44,Prior study from ___.,s53254222.txt,/content/drive/MyDrive/TDSP/New folder/p10002430,PA and lateral views of the chest provided. ...,Postsurgical changes in the right hemi thorax...,CHEST (PA AND LAT) INDICATION: ___M with DOE ...,,,False,False,False,False,False,False,male,False


In [22]:
df1['Sentiment'] = df_conditions.any(axis=1).apply(lambda x: 'Negative' if x else 'Positive')


In [23]:
df1.head()

Unnamed: 0,COMPARISON,Filename,Folder_Path,FINDINGS,IMPRESSION,EXAMINATION,TECHNIQUE,HISTORY,has_conso,has_edema,has_effusion,has_tube,has_collapse,has_pneumothorax,gender,history_present,Sentiment
3,Chest radiograph ___ and chest CT ___,s54573731.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,Patient is status post median sternotomy and ...,No acute cardiopulmonary abnormality.,CHEST (PORTABLE AP) INDICATION: ___ year old ...,Upright AP view of the chest,,False,False,False,False,False,False,female,False,Positive
7,___.,s55941092.txt,/content/drive/MyDrive/TDSP/New folder/p10002013,PA and lateral views of the chest provided. ...,No acute intrathoracic process. If there is s...,CHEST (PA AND LAT) INDICATION: History: ___F ...,,,False,False,False,False,False,False,female,False,Positive
9,Chest radiograph dated ___.,s53781756.txt,/content/drive/MyDrive/TDSP/New folder/p10002221,"The lungs are clear. No focal consolidation,...",No acute intrathoracic process. No significa...,PA and lateral chest radiograph INDICATION: _...,,,False,False,False,False,False,False,male,False,Positive
10,None.,s56373033.txt,/content/drive/MyDrive/TDSP/New folder/p10002157,The lungs are clear without focal consolidati...,No acute cardiopulmonary process.,Chest: Frontal and lateral views INDICATION:...,Chest: Frontal and Lateral,,False,False,False,False,False,False,male,False,Positive
44,Prior study from ___.,s53254222.txt,/content/drive/MyDrive/TDSP/New folder/p10002430,PA and lateral views of the chest provided. ...,Postsurgical changes in the right hemi thorax...,CHEST (PA AND LAT) INDICATION: ___M with DOE ...,,,False,False,False,False,False,False,male,False,Positive
