<a href="https://colab.research.google.com/github/AllanApiny/Medication_Info_Extractor_-NLP-/blob/main/Another_copy_of_Medication_Info_Extractor_(NLP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
# Download the NLTK data packages used by this notebook.
# NOTE(review): the visible cells only import `word_tokenize` and
# `sent_tokenize`; the tagger, chunker and words corpus look unused here --
# confirm against the rest of the notebook before trimming this list.
nltk.download('punkt_tab')  # tokenizer data (unzipped under tokenizers/)
nltk.download('averaged_perceptron_tagger_eng')  # tagger data (unzipped under taggers/)
nltk.download('maxent_ne_chunker_tab')  # chunker data (unzipped under chunkers/)
# Last bare expression in the cell: its return value is what the cell displays.
nltk.download('words')  # corpus data (unzipped under corpora/)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from fuzzywuzzy import fuzz, process

# Reference table of known medications. Every record carries the same four
# fields, in this order: brand name, generic name, dosage, category.
source_of_truth = [
    dict(zip(('brand_name', 'generic_name', 'dosage', 'category'), record))
    for record in (
        ('Lipitor', 'atorvastatin', '40mg', 'statin'),
        ('Metformin', 'metformin hydrochloride', '500mg', 'antidiabetic'),
        ('Zestril', 'lisinopril', '10mg', 'Ace Inhibitor'),
    )
]



In [None]:
def extract_medication_info(text, score_cutoff=80):
    """Extract medication names and dosages from free text, one sentence at a time.

    Each sentence is tokenized, every alphanumeric token is fuzzy-matched
    against the brand and generic names listed in the module-level
    ``source_of_truth`` table, and dosage strings such as ``500mg`` or
    ``2.5 mg`` are collected with a regular expression.

    Args:
        text: Free text to scan. ``None`` or an empty string yields ``[]``.
        score_cutoff: Minimum fuzzy-match score (0-100) a token must reach
            against a known name to count as a mention. Defaults to 80.

    Returns:
        A list with one dict per sentence, in sentence order. Every dict has
        a ``'dosage'`` key (list of matched dosage strings, possibly empty)
        and may have ``'brand_name'`` and/or ``'generic_name'`` (title-cased
        strings). If several known names of the same kind match within one
        sentence, only the highest-scoring one is reported; ties go to the
        name that was matched first.
    """
    if not text:
        return []

    brand_names = {item['brand_name'].lower() for item in source_of_truth}
    # Ordered and de-duplicated so a repeated name cannot use up both slots
    # of the `limit=2` result window below.
    known_names = list(dict.fromkeys(
        name
        for item in source_of_truth
        for name in (item['brand_name'].lower(), item['generic_name'].lower())
    ))

    # The optional fractional part keeps "2.5mg" whole; without it the match
    # would start after the decimal point and report "5mg".
    dosage_pattern = re.compile(r'\b(\d+(?:\.\d+)?\s*[mM]?g)\b')
    data_extracted = []

    for sentence in sent_tokenize(text):
        # Lower-cased alphanumeric tokens only; punctuation is dropped.
        tokens = [
            token.lower()
            for token in word_tokenize(sentence)
            if token.isalnum()
        ]

        # Best (score, name) seen so far for each kind of name. Tracking the
        # score makes the choice deterministic when several names match,
        # instead of depending on set iteration order.
        best_match = {}
        for token in dict.fromkeys(tokens):  # skip repeated tokens, keep order
            candidates = process.extractBests(
                token,
                known_names,
                scorer=fuzz.token_set_ratio,
                score_cutoff=score_cutoff,
                limit=2,
            )
            for name, score in candidates:
                # A name listed as both brand and generic counts as a brand.
                kind = 'brand_name' if name in brand_names else 'generic_name'
                if kind not in best_match or score > best_match[kind][0]:
                    best_match[kind] = (score, name)

        medications = {
            kind: best_match[kind][1].title()
            for kind in ('brand_name', 'generic_name')
            if kind in best_match
        }
        medications['dosage'] = dosage_pattern.findall(sentence)

        data_extracted.append(medications)
    return data_extracted

In [None]:
# `text` is assigned in this cell so the call works on a fresh kernel
# (Restart & Run All); the test cell further down assigns the same sample.
text = """
    The patient was prescribed Metformin 500mg by Novo Nordisk. Lipitor 40mg from Pfizer was recommended for cholesterol.
    Also consider lisinopril 10mg if needed.
    """

extract_medication_info(text)

[{'brand_name': 'Metformin',
  'generic_name': 'Metformin Hydrochloride',
  'dosage': ['500mg']},
 {'brand_name': 'Lipitor', 'dosage': ['40mg']},
 {'generic_name': 'Lisinopril', 'dosage': ['10mg']}]

In [None]:
# Smoke test: run the extractor over a short sample and print what was
# found in each sentence, numbering sentences from 1.
text = """
    The patient was prescribed Metformin 500mg by Novo Nordisk. Lipitor 40mg from Pfizer was recommended for cholesterol.
    Also consider lisinopril 10mg if needed.
    """

result = extract_medication_info(text)
for sentence_number, sentence_data in enumerate(result, start=1):
    print(f"\nSentence {sentence_number}:")
    print("Medications:", sentence_data)


Sentence 1:
Medications: {'brand_name': 'Metformin', 'generic_name': 'Metformin Hydrochloride', 'dosage': ['500mg']}

Sentence 2:
Medications: {'brand_name': 'Lipitor', 'dosage': ['40mg']}

Sentence 3:
Medications: {'generic_name': 'Lisinopril', 'dosage': ['10mg']}
