In [2]:
import os
import re
import tempfile

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [19]:
def setup_nltk(custom_data_dir=None):
    """Make the NLTK resources used by this notebook available.

    Args:
        custom_data_dir: Optional directory that should be searched first for
            NLTK data. When it is writable it is also used as the download
            target for missing packages. When it is ``None`` (or empty) only
            NLTK's default locations are used.

    Behaviour:
        * A writable ``custom_data_dir`` is created if necessary and placed at
          the front of ``nltk.data.path`` exactly once.
        * A directory that exists but is not writable stays on the search path
          (reading needs no write permission); downloads then go to NLTK's
          default location.
        * Each required package is looked up and downloaded only when missing.
          A warning is printed for every download that reports failure.

    Returns:
        None. The function mutates ``nltk.data.path`` in place and prints
        progress information.
    """
    download_dir = None

    if custom_data_dir:
        # Drop *every* existing occurrence (list.remove would only drop the
        # first) so the directory ends up on the search path exactly once.
        nltk.data.path[:] = [p for p in nltk.data.path if p != custom_data_dir]

        try:
            os.makedirs(custom_data_dir, exist_ok=True)
            # Probe writability with an anonymous temporary file. A fixed
            # name such as 'test.txt' would overwrite and then delete a real
            # file of that name living in the data directory.
            with tempfile.TemporaryFile(dir=custom_data_dir):
                pass
        except OSError as e:  # PermissionError is a subclass of OSError
            print(f"Warning: Cannot write to {custom_data_dir}: {str(e)}")
            print("Falling back to default locations")
            if os.path.isdir(custom_data_dir):
                # Read-only (e.g. shared) data directories are still perfectly
                # usable for lookups, so keep them searchable.
                nltk.data.path.insert(0, custom_data_dir)
                print(f"Keeping {custom_data_dir} on the search path (read-only)")
        else:
            # Add custom path to NLTK data path (only once)
            nltk.data.path.insert(0, custom_data_dir)
            download_dir = custom_data_dir
            print(f"Using custom NLTK data directory: {custom_data_dir}")

    # Show NLTK data paths (unique paths only, original order preserved)
    unique_paths = list(dict.fromkeys(nltk.data.path))
    print("NLTK Data Paths:", unique_paths)

    # Required packages, keyed by the nltk_data sub-directory they live in.
    # Recent NLTK releases load 'punkt_tab' / 'averaged_perceptron_tagger_eng'
    # instead of the older pickled 'punkt' / 'averaged_perceptron_tagger'
    # models; both generations are requested so either NLTK version works.
    packages = {
        'tokenizers': ['punkt', 'punkt_tab'],
        'taggers': ['averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'],
        'corpora': ['wordnet', 'omw-1.4'],
    }

    for subdir, pkgs in packages.items():
        for pkg in pkgs:
            try:
                nltk.data.find(f'{subdir}/{pkg}')
            except LookupError:
                # download_dir=None lets NLTK pick its default location.
                # nltk.download returns a falsy value on failure; with
                # quiet=True that would otherwise go unnoticed until the
                # resource is first used.
                if not nltk.download(pkg, download_dir=download_dir, quiet=True):
                    print(f"Warning: Could not download NLTK package '{pkg}'")

# Directory holding the shared NLTK data. The default is specific to the
# original machine; set the NLTK_CUSTOM_DATA_DIR environment variable to use a
# different directory without editing the notebook (an empty value makes
# setup_nltk use NLTK's default locations only).
custom_nltk_path = os.environ.get("NLTK_CUSTOM_DATA_DIR", "/secure/shared_data/nltk_data")
setup_nltk(custom_nltk_path)

Using custom NLTK data directory: /secure/shared_data/nltk_data
NLTK Data Paths: ['/secure/shared_data/nltk_data', '/home/yl3427/nltk_data', '/home/yl3427/miniconda3/envs/umls_env_py310/nltk_data', '/home/yl3427/miniconda3/envs/umls_env_py310/share/nltk_data', '/home/yl3427/miniconda3/envs/umls_env_py310/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [22]:
class ClinicalLemmatizer:
    """Lemmatize free-text clinical notes and expand common abbreviations.

    The text is lower-cased and tokenized. A token that matches a known
    abbreviation is replaced by its expansion; every other token is
    lemmatized with WordNet using a part-of-speech tag computed for that
    token on its own (without sentence context).
    """

    def __init__(self, abbreviations=None):
        """Create the lemmatizer.

        Args:
            abbreviations: Optional mapping ``abbreviation -> expansion`` that
                extends the built-in table and overrides it on key clashes.
                Keys are lower-cased because input text is lower-cased before
                the lookup.
        """
        self.lemmatizer = WordNetLemmatizer()
        # Common clinical abbreviations dictionary (you can expand this)
        self.clinical_abbrev = {
            'pt': 'patient',
            'dx': 'diagnosis',
            'tx': 'treatment',
            'hx': 'history',
            'temp': 'temperature',
            'hr': 'heart rate',
            'bp': 'blood pressure',
            'htn': 'hypertension',
        }
        if abbreviations:
            self.clinical_abbrev.update(
                (abbr.lower(), expansion) for abbr, expansion in abbreviations.items()
            )
        # word -> WordNet POS constant, filled lazily by get_wordnet_pos
        self._pos_cache = {}

    def get_wordnet_pos(self, word):
        """Return the WordNet POS constant for ``word``.

        The word is tagged in isolation and the first letter of its tag is
        mapped to adjective / noun / verb / adverb; any other tag falls back
        to noun. Because the tag depends only on the word itself, results are
        memoised: ``nltk.pos_tag`` is comparatively expensive per call and
        ``lemmatize_text`` would otherwise invoke it for every repeated token.
        """
        # setdefault keeps this working for instances whose __init__ was
        # bypassed or overridden without creating the cache.
        cache = self.__dict__.setdefault('_pos_cache', {})
        if word not in cache:
            tag = nltk.pos_tag([word])[0][1][0].upper()
            tag_dict = {
                "J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV
            }
            cache[word] = tag_dict.get(tag, wordnet.NOUN)
        return cache[word]

    def lemmatize_text(self, text):
        """Return ``text`` lower-cased, abbreviation-expanded and lemmatized.

        Args:
            text: The clinical note as a string. ``None`` or an empty string
                yields an empty result instead of raising.

        Returns:
            The processed tokens joined by single spaces (punctuation tokens
            are kept as separate, space-delimited items).
        """
        if not text:
            return ''

        # Lower-case first so abbreviation lookup is case-insensitive
        words = word_tokenize(text.lower())

        lemmatized_words = []
        for word in words:
            if word in self.clinical_abbrev:
                # Known abbreviation: use the expansion verbatim
                lemmatized_words.append(self.clinical_abbrev[word])
            else:
                # Lemmatize based on the token's POS
                pos = self.get_wordnet_pos(word)
                lemmatized_words.append(self.lemmatizer.lemmatize(word, pos))

        return ' '.join(lemmatized_words)

# Demo: lemmatize one short clinical sentence and print it next to the input
if __name__ == "__main__":
    sample_text = "Pt presents with elevated BP and complaining of chest pains. Previous hx of HTN."

    # Build the lemmatizer once and run it on the sample sentence
    clinical_lemmatizer = ClinicalLemmatizer()
    lemmatized_text = clinical_lemmatizer.lemmatize_text(sample_text)

    # Show input and result side by side
    print("Original text:", sample_text)
    print("Lemmatized text:", lemmatized_text)


Original text: Pt presents with elevated BP and complaining of chest pains. Previous hx of HTN.
Lemmatized text: patient present with elevate blood pressure and complain of chest pain . previous history of hypertension .


In [24]:
# Second example: a multi-line note fragment with semicolons, colons and a
# slash-joined term. The call is the last expression of the cell, so its
# return value is what the notebook displays as the cell output.
# NOTE: relies on `clinical_lemmatizer` created in the example-usage cell.
raw_text = """NSTEMI; Newly worsened MR; Hypotension: acutely hypotensive, concern was for ischemia/infarct
   versus oversedation; Respiratory distress: repeat CXR showed stable LL opacity. Also CHF; RP bleed"""
clinical_lemmatizer.lemmatize_text(raw_text)

'nstemi ; newly worsen mr ; hypotension : acutely hypotensive , concern be for ischemia/infarct versus oversedation ; respiratory distress : repeat cxr show stable ll opacity . also chf ; rp bleed'