In [123]:
#!/usr/bin/env python
# -*-coding:utf-8 -*-
'''
@File    :   GCAF_Investigate_Related_Condition_Names.ipynb
@Time    :   2024/09/18 13:45:31
@Author  :   Asra Aslam 
@Version :   1.0
@Contact :   a.aslam2@leeds.ac.uk
@License :   (C)Copyright Asra Aslam DynAIRX
@Desc    :   None
'''
import pandas as pd
import re
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Download necessary resources for part-of-speech tagging
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hssaas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hssaas\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [124]:
# Read Excel file
df_comments = pd.read_excel('Clincians_Intervention_Split_Keep_Group.xlsx')
df_comments

Unnamed: 0,eFI2 Deficits,Comments_eFI,Liverpool Diseases,Comments_LW
0,"'Abdominal pain',",Liverpool list doesn't include chronic abdo pa...,Abdominal Aortic Aneurysm,Keep this - not currently covered by eFI list
1,"'Activity limitation',",,Abdominal Hernia,Keep this - not currently covered by eFI list
2,"'Alcohol',",I would prefer to have distinct lists for alco...,Acne,Keep this - not currently covered by eFI list
3,"'Anaemia & haematinic deficiency',",I would prefer to separate these out and have ...,Acoustic Neuroma,benign tumour - we could put all benign tumour...
4,"'Anxiety',",need to compare code lists for consistency.,Actinic keratosis,Keep this - not currently covered by eFI list
...,...,...,...,...
177,,,Urolithiasis,Keep this - not currently covered by eFI list
178,,,Uterovaginal Genital Prolapse,Keep this - not currently covered by eFI list
179,,,Valve disorder non-rheumatic multiple,can merge all the valve disorder lists
180,,,Visual Impairment and Blindness,compare with eFI visual impairment list and me...


In [125]:
# Sample DataFrame (with collections of words in columns)
# data = {
#     'Liverpool Diseases': ['apple and banana', 'orange and grape', 'peach and plum', 'mango and papaya'],
#     'Comments_LW': ['red car and blue bike', 'fast train and slow plane', 'small boat', 'big ship'],
#     'Column3': ['dog and cat', 'bird and fish', 'elephant and giraffe', 'lion and tiger']
# }

# df = pd.DataFrame(data)
# Initial list of keywords to search for
initial_keywords = ['Macular', 'Degeneration']

In [126]:
# Function to extract words from a string (case-insensitive)
def extract_words(text):
    return re.findall(r'\b\w+\b', text.lower())  # Extract words and convert to lowercase

In [127]:
# Function to check if a word is a noun
def is_noun(word):
    # Tokenize and apply POS tagging
    pos_info = pos_tag(word_tokenize(word))
    
    # POS tag for noun starts with 'NN' (singular NN or plural NNS)
    return pos_info[0][1] in ['NN', 'NNS']

# Function to filter out nouns from a list of keywords
def filter_nouns(keywords):
    nouns = [word for word in keywords if is_noun(word)]
    return nouns

# Function to filter out nouns and specific words from a list of keywords
def filter_specific_words(keywords, specific_words):
    non_nouns = [word for word in keywords if word not in specific_words]
    return non_nouns

In [128]:
def search_keywords_recursive(df, keywords, all_found_keywords=None):
    if all_found_keywords is None:
        all_found_keywords = set()  # Track all found keywords to avoid endless loops

    # Convert keywords to lowercase for case-insensitive matching
    keywords_orig = [keyword.lower() for keyword in keywords]

    # Get the keywords that are not nouns
    keywords_nouns = filter_nouns(keywords_orig)

    # Specific words to remove
    specific_words = {'isn', 'm', 's'}
    keywords = filter_specific_words(keywords_nouns, specific_words)
    

    # Dictionary to store matches
    result = {}

    # Track new keywords to keep searching
    new_keywords = set()

    # Iterate through each keyword and search in both columns
    for keyword in keywords:
        # Skip keywords already fully searched
        if keyword in all_found_keywords:
            continue

        all_found_keywords.add(keyword)
        result[keyword] = {'Liverpool Diseases': [], 'Comments_LW': []}

        for col in ['Liverpool Diseases', 'Comments_LW']:
            # Handle NaN values by converting all values to strings before searching
            matching_rows = df[df[col].astype(str).str.lower().str.contains(keyword, na=False)]

            # Collect matching entries for these rows and extract new keywords
            if not matching_rows.empty:
                result[keyword][col] = matching_rows[col].tolist()

                # Extract words from the matching rows in both Liverpool Diseases and Comments_LW
                for row in matching_rows.itertuples():
                    col1_text = str(row[1])  # Convert Liverpool Diseases data to string
                    col2_text = str(row[2])  # Convert Comments_LW data to string

                    # Immediately add new keywords found in matching rows
                    new_keywords.update(extract_words(col1_text))
                    new_keywords.update(extract_words(col2_text))

    # Remove already searched keywords from the new keywords
    new_keywords -= all_found_keywords

    # If there are new keywords, perform the recursive search
    if new_keywords:
        further_results = search_keywords_recursive(df, list(new_keywords), all_found_keywords)

        # Combine results from the current iteration with recursive results
        for keyword, col_dict in further_results.items():
            if keyword in result:
                result[keyword]['Liverpool Diseases'].extend(col_dict['Liverpool Diseases'])
                result[keyword]['Comments_LWs_LW'].extend(col_dict['Comments_LW'])
            else:
                result[keyword] = col_dict

    return result

In [129]:
# Perform the recursive search
result = search_keywords_recursive(df_comments, initial_keywords)
result

{'macular': {'Liverpool Diseases': ['Macular Degeneration'],
  'Comments_LW': []},
 'degeneration': {'Liverpool Diseases': ['Macular Degeneration'],
  'Comments_LW': []},
 'nan': {'Liverpool Diseases': [],
  'Comments_LW': [nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan]},
 'method': {'Liverpool Diseases': [], 'Comments_LW': []},
 'use': {'Liverpool Diseases': ['Psychoactive Substance Misuse'],
  'Comments_LW': ['Keep this - not currently covered by eFI list and should probably be on its own and not lumped with the other benign tumours owing to its affect on neighbouring structures and can cause serious illness']},
 'stage': {'Liverpool Diseases': ['End Stage Renal Disease'],
  'Comments_LW': []},
 'definition': {'Liverpool Diseases': [], 'Comments_LW': []},
 'copd': {'Liverpool Diseases': ['Chronic Obstructive Pulmonary Disease (COPD)'],
  'Comments_LW': []},
 'management': {'Liverpool Diseases': [], 'Comment

In [130]:
# Output the result
for keyword, columns in result.items():
    print(f"Keyword '{keyword}' found in:")
    for col, entries in columns.items():
        print(f"  {col}: {entries}")

Keyword 'macular' found in:
  Liverpool Diseases: ['Macular Degeneration']
  Comments_LW: []
Keyword 'degeneration' found in:
  Liverpool Diseases: ['Macular Degeneration']
  Comments_LW: []
Keyword 'nan' found in:
  Liverpool Diseases: []
  Comments_LW: [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
Keyword 'method' found in:
  Liverpool Diseases: []
  Comments_LW: []
Keyword 'use' found in:
  Liverpool Diseases: ['Psychoactive Substance Misuse']
  Comments_LW: ['Keep this - not currently covered by eFI list and should probably be on its own and not lumped with the other benign tumours owing to its affect on neighbouring structures and can cause serious illness']
Keyword 'stage' found in:
  Liverpool Diseases: ['End Stage Renal Disease']
  Comments_LW: []
Keyword 'definition' found in:
  Liverpool Diseases: []
  Comments_LW: []
Keyword 'copd' found in:
  Liverpool Diseases: ['Chronic Obstructive Pulmonary Disease (COPD)']
  Comments_LW: []
Keyword 'ma