<a href="https://colab.research.google.com/github/DarthCoder501/GAAP/blob/main/Impressions_Feature_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scispacy

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz

In [None]:
import pandas as pd
import spacy
import re
from collections import defaultdict

# Load clinical model
# `en_core_sci_lg` is the package installed from the release tarball in the cell
# above; the resulting `nlp` pipeline is shared by the detection functions below.
nlp = spacy.load("en_core_sci_lg")

In [4]:
def detect_abbreviations(text):
    """Return the upper-case abbreviations/codes found in ``text``.

    Candidates are runs of 2-6 capital letters/digits (optionally followed by
    a lowercase ``s``) located with a regex. A candidate is kept when it also
    appears as a token produced by the module-level ``nlp`` pipeline, is
    upper-case, and its lower-cased form is not marked as a stop word in the
    pipeline vocabulary.

    Args:
        text: Free text to scan. Non-string values (for example the float NaN
            that pandas uses for empty CSV cells) produce an empty list.

    Returns:
        Sorted list of the unique abbreviation strings found (possibly empty).
    """
    if not isinstance(text, str):  # handle NaN/empty values
        return []

    # Rule-based: a set gives O(1) membership checks in the token loop below.
    abbrev_pattern = r'\b[A-Z0-9]{2,6}s?\b'
    potential_abbrevs = set(re.findall(abbrev_pattern, text))
    if not potential_abbrevs:
        # No candidate can match any token, so skip the NLP pipeline entirely.
        return []

    abbreviations = set()

    # Semantic check
    doc = nlp(text)
    for token in doc:
        # NOTE(review): str.isupper() is False for plural forms such as "CTs",
        # so the optional trailing "s" allowed by the regex never survives this
        # filter. Confirm whether plurals should be kept.
        if token.text in potential_abbrevs and token.text.isupper():
            # Verify against known terms
            if not nlp.vocab[token.text.lower()].is_stop:
                abbreviations.add(token.text)

    # Sorted so the result does not depend on set iteration order.
    return sorted(abbreviations)

In [5]:
# Read CSV file
df = pd.read_csv('/content/Progression Dataset from Colab.csv')  # replace with your filename

# Process all impressions and collect abbreviations:
# maps each abbreviation to the list of spreadsheet rows it was found in.
all_abbreviations = defaultdict(list)

# Iterate over the single column that is needed instead of df.iterrows(), which
# builds a full Series for every row. dropna() skips blank impressions (read as
# NaN) while keeping the original index labels used for the row numbers.
for idx, impression in df['impressions'].dropna().items():
    abbrevs = detect_abbreviations(impression)

    # Store both the abbreviations and their source row
    for abbrev in abbrevs:
        all_abbreviations[abbrev].append(idx+2)  # +2 because CSV rows typically start at 1, and header is row 1

# Create a summary dataframe (one row per abbreviation)
abbrev_df = pd.DataFrame({
    'Abbreviation': list(all_abbreviations.keys()),
    'Count': [len(v) for v in all_abbreviations.values()],
    'Found in Rows': [', '.join(map(str, v)) for v in all_abbreviations.values()]
})

In [9]:
def complex_abbreviations_detection(text):
    """Return abbreviations/codes in ``text``, including mixed-case and alphanumeric forms.

    The regex accepts tokens that either start with a capital and contain a
    second capital (optionally with lowercase letters in between), or consist
    of two or more capitals, each optionally followed by digits and a trailing
    ``s``; it also accepts tokens that begin with digits followed by capitals.
    A candidate is kept when it also appears as a token produced by the
    module-level ``nlp`` pipeline, contains an upper-case letter, and its
    lower-cased form is not marked as a stop word in the pipeline vocabulary.

    Args:
        text: Free text to scan. Non-string values (e.g. NaN) yield ``[]``.

    Returns:
        Sorted list of the unique abbreviation strings found (possibly empty).
    """
    if not isinstance(text, str):  # handle NaN/empty values
        return []

    # Rule-based pattern (updated to capture common medical formats)
    abbrev_pattern = r'\b(?:[A-Z][a-z]*[A-Z]|[A-Z]{2,})[0-9]*s?\b|\b[0-9]+[A-Z]+[0-9A-Z]*\b'
    # A set gives O(1) membership checks in the token loop below.
    potential_abbrevs = set(re.findall(abbrev_pattern, text))
    if not potential_abbrevs:
        # No candidate can match any token, so skip the NLP pipeline entirely.
        return []

    abbreviations = set()

    # Semantic check
    doc = nlp(text)
    for token in doc:
        if token.text not in potential_abbrevs:
            continue
        # Additional checks to exclude common words that slip through
        has_capital = token.text.isupper() or (
            any(c.isupper() for c in token.text) and len(token.text) > 1
        )
        if has_capital and not nlp.vocab[token.text.lower()].is_stop:
            abbreviations.add(token.text)

    # Sorted so the result does not depend on set iteration order.
    return sorted(abbreviations)

In [None]:
# Report how many distinct abbreviations were detected, followed by the
# 20 entries with the highest counts.
print(f"Found {len(abbrev_df)} unique abbreviations/codes:")
print(abbrev_df.sort_values(by='Count', ascending=False).iloc[:20])

In [None]:
# Show the abbreviation summary frame as this cell's output.
abbrev_df

In [13]:
# Write the abbreviation summary to a CSV file, leaving out the index column.
abbrev_df.to_csv(path_or_buf='detected_abbreviations.csv', index=False)

In [10]:
# Read CSV file
med_df = pd.read_csv('/content/Progression Dataset from Colab.csv')  # replace with your filename

# Process all impressions and collect abbreviations:
# maps each abbreviation to the list of spreadsheet rows it was found in.
all_med_abbreviations = defaultdict(list)

# Iterate over the single column that is needed instead of med_df.iterrows(),
# which builds a full Series for every row. Blank impressions (NaN) are skipped
# up front; the original index labels are kept for the row numbers.
for idx, impression in med_df['impressions'].dropna().items():
    med_abbrevs = complex_abbreviations_detection(impression)

    # Store both the abbreviations and their source row
    for abbrev in med_abbrevs:
        all_med_abbreviations[abbrev].append(idx+2)  # +2 because CSV rows typically start at 1, and header is row 1

# Create a summary dataframe (one row per abbreviation)
med_abbrev_df = pd.DataFrame({
    'Abbreviation': list(all_med_abbreviations.keys()),
    'Count': [len(v) for v in all_med_abbreviations.values()],
    'Found in Rows': [', '.join(map(str, v)) for v in all_med_abbreviations.values()]
})

In [None]:
# Report how many distinct abbreviations the extended detector found,
# followed by the 20 entries with the highest counts.
print(f"Found {len(med_abbrev_df)} unique abbreviations/codes:")
print(med_abbrev_df.sort_values(by='Count', ascending=False).iloc[:20])

In [None]:
# Show the extended-detector summary frame as this cell's output.
med_abbrev_df

In [14]:
# Write the extended-detector summary to a CSV file, leaving out the index column.
med_abbrev_df.to_csv(path_or_buf='detected_med_abbreviations.csv', index=False)