# **Context-Rich Natural Sentence Generation from Metadata and `.txt` Files for the DMID Dataset**


## Import necessary libraries

In [37]:
import os
import re
import numpy as np
import pandas as pd

## Load Metadata and preprocess

In [38]:
df = pd.read_csv('mammography-data.csv')
df.head()

Unnamed: 0,Ref num,View,Tissue,Abnormality type,Class,X,Y,Radius
0,IMG001,MLOLT,G,MISC+CALC,M,1567.0,3644.0,295.0
1,IMG001,MLOLT,G,CIRC,B,1461.0,3102.0,85.0
2,IMG002,MLORT,G,NORM,,,,
3,IMG003,CCLT,F,NORM,,,,
4,IMG004,CCRT,F,NORM,,,,


In [39]:
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
df = df.drop(['X', 'Y', 'Radius', 'View', 'Tissue'], axis=1)
df.head()

Unnamed: 0,Ref num,Abnormality type,Class
0,IMG001,MISC+CALC,M
1,IMG001,CIRC,B
2,IMG002,NORM,
3,IMG003,NORM,
4,IMG004,NORM,


In [40]:
df.isnull().sum()

Ref num               0
Abnormality type      0
Class               200
dtype: int64

In [41]:
df.fillna("B", inplace=True)
df.head()

Unnamed: 0,Ref num,Abnormality type,Class
0,IMG001,MISC+CALC,M
1,IMG001,CIRC,B
2,IMG002,NORM,B
3,IMG003,NORM,B
4,IMG004,NORM,B


In [42]:
df.columns = df.columns.str.strip()
df = df.replace(r'\s+', '', regex=True)
df['Abnormality type'].value_counts()

Abnormality type
NORM         200
CIRC         178
CALC          80
MISC          76
SPIC          43
CIRC+CALC     19
MISC+CALC     13
SPIC+CALC      8
ARCH+CALC      7
ASYM           6
SPIC+ARCH      5
ARCH           4
CALC+MISC      3
CIRC+SPIC      2
ARCH+CLAC      1
CALC+CIRC      1
MISC+SPIC      1
Name: count, dtype: int64

In [43]:
# Sort to prioritize "M" over "B" in duplicates
df.sort_values(by=["Ref num", "Class"], ascending=[True, False], inplace=True)

# Group by "Ref num" and merge "Abnormality type" values
df = df.groupby("Ref num").agg({
    "Abnormality type": lambda x: "+".join(sorted(set("+".join(x).split("+")))),  # Remove duplicates, keep unique
    "Class": "first"  # Take the first (sorted) category
}).reset_index()

## Breast composition extraction from `.txt` file

In [44]:
def extract_breast_composition(txt_dir, output_csv):

    data = []
    
    # Regex to match variations like 'breast' or 'brest'
    pattern = re.compile(
        r'\b(breast|brest)\s*composition[s]?:\s*(.*)',
        re.IGNORECASE
    )

    for filename in os.listdir(txt_dir):
        if filename.lower().endswith(".txt"):
            file_path = os.path.join(txt_dir, filename)
            
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
                
                required_text = None
                for i, line in enumerate(lines):
                    match = pattern.match(line.strip())
                    if match:
                        # Case 1: Same line after colon
                        if match.group(2):
                            required_text = match.group(2).strip()
                        # Case 2: On next line
                        elif i + 1 < len(lines):
                            required_text = lines[i + 1].strip()
                        break

                data.append({
                    "FILE": filename[:-4].upper(),
                    "Breast Composition": required_text or "Not Found"
                })

    # Convert to DataFrame
    df_text = pd.DataFrame(data)

    # Strip spaces from string cells
    df_text = df_text.map(lambda x: x.strip() if isinstance(x, str) else x)

    # Save to CSV
    df_text.to_csv(output_csv, index=False)

    return df_text

In [None]:
df_ebc = extract_breast_composition(
    txt_dir=r"E:\Mammography\DMID\Reports",
    output_csv="extracted_breast_composition_git.csv"
)
df_ebc.head()

Unnamed: 0,FILE,Breast Composition
0,IMG001,predominantly fibro fatty breast parenchyma (A...
1,IMG002,Fibro fatty with scattered glandular breast pa...
2,IMG003,Predominantly fibro fatty breast parenchyma (A...
3,IMG004,Predominantly fibro fatty breast parenchyma (A...
4,IMG005,Predominantly fibro fatty breast parenchyma (A...


## Mapping

In [47]:
# Mapping of abbreviations to full forms
abnormality_mapping = {
    "CALC": "Calcification",
    "CIRC": "Well-defined/circumscribed Masses",
    "SPIC": "Spiculated Masses",
    "MISC": "Other, ill-defined Masses",
    "ARCH": "Architectural Distortion",
    "ASYM": "Asymmetry",
    "NORM": "Normal"
}

# Replace abbreviations in the "Abnormality type" column
df["Abnormality type"] = (
    df["Abnormality type"]
    .str.split("+")  # Split the string by "+" to handle multiple abbreviations
    .apply(lambda x: ", ".join(abnormality_mapping.get(item, item) for item in x))  # Replace and join with ", "
)

# Display the modified DataFrame
df.head()

Unnamed: 0,Ref num,Abnormality type,Class
0,IMG001,"Calcification, Well-defined/circumscribed Mass...",M
1,IMG002,Normal,B
2,IMG003,Normal,B
3,IMG004,Normal,B
4,IMG005,Normal,B


In [48]:
df['Abnormality type'].value_counts()

Abnormality type
Normal                                                                            200
Well-defined/circumscribed Masses                                                 111
Other, ill-defined Masses                                                          55
Spiculated Masses                                                                  33
Calcification                                                                      24
Calcification, Well-defined/circumscribed Masses                                   24
Calcification, Other, ill-defined Masses                                           14
Calcification, Spiculated Masses                                                    8
Architectural Distortion, Calcification                                             7
Well-defined/circumscribed Masses, Spiculated Masses                                7
Asymmetry                                                                           5
Well-defined/circumscribed Masses, Ot

In [49]:
# Rename columns in the second dataframe to align with the first dataframe
df_ebc.rename(columns={"FILE": "Ref num", "Breast Composition": "Tissue"}, inplace=True)

# Merge the two dataframes on the "Ref num" column
merged_df = pd.merge(df, df_ebc, on="Ref num", how="inner")

# Reorder columns to move "Class" to the end
column_order = [col for col in merged_df.columns if col != "Class"] + ["Class"]
merged_df = merged_df[column_order]

# Replace normal class with benign class
merged_df['Class'] = merged_df['Class'].replace("N", "B")

# Display the merged dataframe
merged_df.head()

Unnamed: 0,Ref num,Abnormality type,Tissue,Class
0,IMG001,"Calcification, Well-defined/circumscribed Mass...",predominantly fibro fatty breast parenchyma (A...,M
1,IMG002,Normal,Fibro fatty with scattered glandular breast pa...,B
2,IMG003,Normal,Predominantly fibro fatty breast parenchyma (A...,B
3,IMG004,Normal,Predominantly fibro fatty breast parenchyma (A...,B
4,IMG005,Normal,Predominantly fibro fatty breast parenchyma (A...,B


In [29]:
merged_df['Class'].value_counts()

Class
B    380
M    130
Name: count, dtype: int64

## Sentence generation 

In [33]:
# Create natural sentences using templates
def generate_sentence(row):
    return f"Findings indicate {row['Abnormality type']} in a breast characterized by {row['Tissue']}"

# Apply the function to the DataFrame
merged_df['Generated Sentence'] = merged_df.apply(generate_sentence, axis=1)

# Display the updated DataFrame
merged_df[['Ref num', 'Generated Sentence']].head()

Unnamed: 0,Ref num,Generated Sentence
0,IMG001,"Findings indicate Calcification, Well-defined/..."
1,IMG002,Findings indicate Normal in a breast character...
2,IMG003,Findings indicate Normal in a breast character...
3,IMG004,Findings indicate Normal in a breast character...
4,IMG005,Findings indicate Normal in a breast character...


In [35]:
# Sample
merged_df['Generated Sentence'][89]

'Findings indicate Well-defined/circumscribed Masses in a breast characterized by fibro glandular breast parenchyma (ACR B).'