In [48]:
# library Imports 

import pandas as pd
from pandas.io.stata import StataReader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import string
import re
from contractions import fix
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import spacy
import numpy as np

# Loading the data

In [49]:
data_encoded = pd.read_stata('ukcv.dta', convert_categoricals=False)
data_encoded.head()

Unnamed: 0,psid,supportname,age,gender,genderbirth,ethnicity,bmi,smoke,preautoimmunity,preautoimmunitymeds,...,dxwhy_dismiss,dxwhy_help,dxwhy_guide,dxwhy_research,dxwhy_aware,dxwhy_oth,story_open,consented,supportgroup,trigvax
0,907633-907615-96019409,UKCV Covid Vaccine - Long Haul Autoimmune Sup...,2,2,1.0,3,2,1,0,0,...,0,1,1,1,1,0,,10.0,1.0,1.0
1,907633-907615-96026239,Uk cv group,8,1,1.0,3,3,3,0,2,...,0,0,0,0,0,0,,10.0,1.0,1.0
2,907633-907615-96025984,,3,2,1.0,4,3,3,0,0,...,1,1,1,1,1,1,A day after my first (and only) Pfizer vaccine...,10.0,3.0,1.0
3,907633-907615-96027621,Uk Cv family,4,2,1.0,3,2,3,0,0,...,1,1,1,0,1,0,It was quite tricky to fill in the time line o...,10.0,1.0,1.0
4,907633-907615-96027090,Ukcv family Facebook,7,2,1.0,3,4,2,0,0,...,0,1,1,1,1,0,Ive had mixed experiences with my GP practice...,10.0,1.0,1.0


Filtering for the textual data - focusing on text for this small demo

In [50]:
# The textual columns that I'm looking at:
textual_data = data_encoded.select_dtypes(include=['object'])
textual_data = textual_data.drop(['psid', 'supportname', 'batchno', 'story_open'], axis=1)
textual_data.head(2)

Unnamed: 0,opensxtrig_fatigue,opensxtrig_lhead,opensxtrig_vert,opensxtrig_cog,opensxtrig_tin,opensxtrig_naus,opensxtrig_weak,opensxtrig_ins,opensxtrig_breath,opensxtrig_exint,...,opensxtrig_food,opensxtrig_bb,opensxtrig_anx,opensxtrig_dep,opensxtrig_hal,opensxtrig_mood,sx_other_open,othertests,other_diag_open,tx_other_open
0,Periods Post-exertional malaise,Periods,Periods Sensory stimulation Video calls Pho...,Periods,Music Sensory stimulation,Periods Fatigue Hunger,Periods Fatigue,Relapses,Sitting upright Being active (e.g. being at w...,,...,,,,,,,Numb hands and feet,,,Omega 3 Folate
1,,,,,,,,,,,...,,,,,,,,,,


# Generating the medical_terms list

Loading the bio-medical model, this is used to identify medical terms in the text

In [51]:
punctuation = string.punctuation
stopwords_list = stopwords.words('english')
nlp = spacy.load("en_core_sci_lg")

Below is a small demo showing how the terms are found in the sentence in the code below.

In [52]:
# Process biomedical text
doc = nlp("This is a sample sentence containing medical terms like diabetes, hypertension, and cancer, PEM, Post-exertional malaise, light headed")

# Access biomedical entities
for ent in doc.ents:
    print(ent.text, ent.label_)

sample sentence ENTITY
medical ENTITY
diabetes ENTITY
hypertension ENTITY
cancer ENTITY
PEM ENTITY
Post-exertional malaise ENTITY
light headed ENTITY


The code below is doing the majority of the transformations, trying to combine similar words ect.
There are some notes scattered throughout the code below, however below is an overview of what it does:

1. Finds medical terms
2. Expands the contractions - i.e. can't -> cannot (easier to process later)
3. Removed punctuation
4. Lemmatizes - attempts to reduce words into their base-form - i.e. running -> run
5. Lower-cases everything and removes stop-words (words like "the", "and", "but" ect...)
6. Removes any extra spaces, making the words unreadable
7. Contains a transformations dictionary - this contains various codings for transformations that can be applied -> this needs to be added to
8. That transformation dictionary is applied on the data

In [79]:
#Function that I'm working on - still working on this
symptoms_list = []

def find_symptoms(data):
    
    # Step 1: Join
    data = " ".join(data)
    doc = nlp(data)
    
    # Step 2: Find the medical terms
    symptoms = []
    for entity in doc.ents:
        sep = re.split(r'\s{2,}|/', entity.text) # seperating terms with "/" and inputs that have elongated spaces
        symptoms.extend(sep)  # creating a symptom list
    
    # Step 3: Do lemma, punct removal, and lower-case
    symptoms = [fix(x) for x in symptoms] # Expand Contractions
    symptoms = [re.sub(r"[^\w\s]|_", " ", x) for x in symptoms] # Remove Punctuation
    symptoms = [WordNetLemmatizer().lemmatize(x) for x in symptoms] # Lemmatize
    symptoms = [x.lower() for x in symptoms if x.lower() not in stopwords_list] # Lower-case and remove stopwords
    symptoms = [re.sub(' +', ' ', x) for x in symptoms] # Removing uneeded spaces
    
    
    # Step 4: Applying transformations
    # Tranformation dictionary - I'm still adding to this dictionary
    transformations = {
        ("crash","relapses"): 'relapse', # crash and relapses replaced with relapse
        ("bruh",) : 'word',
        ('histamine',) : 'histamine intolerance',
        ('period',) : 'periods',
        ('pace', 'rest') : 'pacing',
        ('exhuasion',) : 'fatigue', 
        ('thinking', 'concentration') : 'cognitive',
        ('anxiety',) : 'anxiety',
        ('c reactive protein') : '',
        ('vaccination', 'vaccine') : 'vaccine',
        ('walking', 'walk', 'activity') : 'fatigue',
    }
    
    # building and applying the patterns set above
    for pattern, replacement in transformations.items():
        if isinstance(pattern, tuple):
            pattern = "|".join([re.escape(subpattern) for subpattern in pattern])
            pattern = rf".*\b({pattern})\b.*"
            symptoms = [re.sub(pattern, replacement, x) for x in symptoms]
    
    #print(symptoms)
    return symptoms

symptoms_list = []

The below is just running the above, and writing the unique entries to a file

In [54]:
def test_code(data):
    filt_data = data
    filt_data = filt_data.applymap(lambda x: x.encode('latin-1').decode('cp1252')) # conversions for some punctuation in the incorrect format
    filt_data['medical_terms'] = filt_data.apply(find_symptoms, axis=1)
    
    mtl = filt_data['medical_terms'].to_list()
    mtf = [term for sublist in mtl for term in sublist]
    mtc = np.unique(mtf)
    
    with open('medical_terms_demo.txt', 'w', encoding='utf-8') as output:
        for term in mtc:
            term = str(term)
            output.write(f"{term}\n")
    
    return filt_data.medical_terms

The code below is running all of the above to produce a word list. I've applied it on a smaller sample so its readable (for 10 records).
Its essentially extracted a bunch of keywords from the text, and I intend to use these to build the model. The method to build the models is shown a little later.

In [55]:
test_code(textual_data[0:10])

['periods', 'post exertional malaise', 'periods', 'periods', 'sensory stimulation', 'video', 'phone', 'periods', 'music', 'sensory stimulation', 'periods', 'fatigue', 'hunger', 'periods', 'fatigue', 'sitting', 'day', 'relapse', 'relapse', 'post exertional malaise', 'relapse', 'relapse', 'relapse', 'sensory stimulation', 'video', 'phone', 'fatigue', 'sensory stimulation', 'video', 'phone', 'fine motor activities', 'standing', '', 'sensory stimulation', 'relapse', 'sensory stimulation', 'fatigue', 'relapse', 'numb hands', 'foot', '', 'folate']
[]
['night', 'fatigue', 'foot', 'hearing test', 'tinnitus', 'c reactive protein', 'esr', 'month', 'vaccine', 'tinnitus', 'iron supplements', 'menthol', 'topical cream']
['periods']
['being active', 'shower', 'house', 'fatigue', 'tired', 'concentrate', 'conversation', 'rash', 'shingle', 'active', 'active i', 'body', 'sun', 'active', 'fatigue', 'day', 'active', 'fatigue', 'day', 'sitting', 'vaccine', 'i am tired', 'conversation', 'resting', 'standing

0    [periods, post exertional malaise, periods, pe...
1                                                   []
2    [night, fatigue, foot, hearing test, tinnitus,...
3                                            [periods]
4    [being active, shower, house, fatigue, tired, ...
5    [symptom, triggered by, eating, histamine into...
6    [seems, histamine intolerance, histamine intol...
7    [worse, exertion, sitting, sun, trigger, stand...
8    [month, fluctuation, , started, measuring, mon...
9                                             [niacin]
Name: medical_terms, dtype: object

# Building the models

The above is mostly the process in which the words are generated. After generating the words, I would need to apply this onto the data itself, to be to do the clusters.
I'm currently thinking of two approaches:

1: I would keep columns like opensxtrig_fatigue, however they would expanded significantly into opensxtrig_word, based on the above generated words. I'm worried that the scale of this will grow too large.

2: The columns like opensxtrig_fatigue, would essentially all be replaced with the symptoms themselves. This would allow for direct clustering of the symptoms.



Below are some small demos of each method, we can discuss these further after - probably moreso after the medical_terms list, is a lot more refined.

## Method 1

Symptoms become the columns

Repost of below function -> needed a small change to run below

In [98]:
#Function that I'm working on - still working on this
symptoms_list = []

def find_symptoms_adj(data):
    
    # Step 1: Join
    # data = " ".join(data)
    doc = nlp(data)
    
    # Step 2: Find the medical terms
    symptoms = []
    for entity in doc.ents:
        sep = re.split(r'\s{2,}|/', entity.text) # seperating terms with "/" and inputs that have elongated spaces
        symptoms.extend(sep)  # creating a symptom list
    
    # Step 3: Do lemma, punct removal, and lower-case
    symptoms = [fix(x) for x in symptoms] # Expand Contractions
    symptoms = [re.sub(r"[^\w\s]|_", " ", x) for x in symptoms] # Remove Punctuation
    symptoms = [WordNetLemmatizer().lemmatize(x) for x in symptoms] # Lemmatize
    symptoms = [x.lower() for x in symptoms if x.lower() not in stopwords_list] # Lower-case and remove stopwords
    symptoms = [re.sub(' +', ' ', x) for x in symptoms] # Removing uneeded spaces
    
    
    # Step 4: Applying transformations
    # Tranformation dictionary - I'm still adding to this dictionary
    transformations = {
        ("crash","relapses"): 'relapse', # crash and relapses replaced with relapse
        ("bruh",) : 'word',
        ('histamine',) : 'histamine intolerance',
        ('period',) : 'periods',
        ('pace', 'rest') : 'pacing',
        ('exhuasion',) : 'fatigue', 
        ('thinking', 'concentration') : 'cognitive',
        ('anxiety',) : 'anxiety',
        ('c reactive protein') : '',
        ('vaccination', 'vaccine') : 'vaccine',
        ('walking', 'walk', 'activity') : 'fatigue',
    }
    
    # building and applying the patterns set above
    for pattern, replacement in transformations.items():
        if isinstance(pattern, tuple):
            pattern = "|".join([re.escape(subpattern) for subpattern in pattern])
            pattern = rf".*\b({pattern})\b.*"
            symptoms = [re.sub(pattern, replacement, x) for x in symptoms]
    
    print(symptoms)
    return symptoms

symptoms_list = []

In [None]:
textual_col_names = textual_data.columns.tolist() # column names

vectorizer = TfidfVectorizer(lowercase=False, tokenizer=find_symptoms_adj, stop_words=None, preprocessor=None) # used to apply tfidf


# Create an empty DataFrame to store the transformed data
tfidf_data = pd.DataFrame()

# Loop through each text column and apply TF-IDF
for column in textual_col_names:
    tfidf_array = vectorizer.fit_transform(textual_data[column]).toarray()
    
    prefixed_feature_names = [f"{column}_{feature_name}" for feature_name in vectorizer.get_feature_names_out()]
    
    tfidf_df = pd.DataFrame(tfidf_array, columns=prefixed_feature_names)
    
    tfidf_data = pd.concat([tfidf_data, tfidf_df], axis=1)

The below data shows how the columns have been adjusted - The values correspond to whether the word appears, and then is scaled by how often that word appeared in that column.
This process was repeated for all textual columns

In [59]:
tfidf_data

Unnamed: 0,opensxtrig_fatigue_afternoon,opensxtrig_fatigue_afternoon fatigue,opensxtrig_fatigue_allergy,opensxtrig_fatigue_anticoagulant,opensxtrig_fatigue_anticoagulants,opensxtrig_fatigue_antihistamine,opensxtrig_fatigue_anything,opensxtrig_fatigue_awake,opensxtrig_fatigue_b12,opensxtrig_fatigue_bacterial products,...,tx_other_open_vitamin b6,tx_other_open_vitamin c,tx_other_open_vitamin e,tx_other_open_week,tx_other_open_worked,tx_other_open_worsened,tx_other_open_xanax,tx_other_open_xolair,tx_other_open_zeolite,tx_other_open_zolfran
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Method 2

In [93]:
textual_data = textual_data.drop('medical_terms', axis=1)
# textual_data['medical_terms'] = textual_data.apply(lambda x: find_symptoms(x), axis=1)
textual_data['combined_text'] = textual_data.apply(lambda row: ' '.join(row), axis=1)

In [94]:
textual_data

Unnamed: 0,opensxtrig_fatigue,opensxtrig_lhead,opensxtrig_vert,opensxtrig_cog,opensxtrig_tin,opensxtrig_naus,opensxtrig_weak,opensxtrig_ins,opensxtrig_breath,opensxtrig_exint,...,opensxtrig_bb,opensxtrig_anx,opensxtrig_dep,opensxtrig_hal,opensxtrig_mood,sx_other_open,othertests,other_diag_open,tx_other_open,combined_text
0,Periods Post-exertional malaise,Periods,Periods Sensory stimulation Video calls Pho...,Periods,Music Sensory stimulation,Periods Fatigue Hunger,Periods Fatigue,Relapses,Sitting upright Being active (e.g. being at w...,,...,,,,,,Numb hands and feet,,,Omega 3 Folate,Periods Post-exertional malaise Periods Perio...
1,,,,,,,,,,,...,,,,,,,,,,...
2,,,,,,,,,,,...,,,,,,,I had a hearing test for the tinnitus. It was ...,Tinnitus,"Iron supplements, didnt help. Menthol topical...",Really bad at night. Also af...
3,,,,,,,,,,,...,,,,,,,,,,My period itself ...
4,"Being active, just taking a shower, going out ...",,,"When tired, trying to concentrate eg having a ...",,,,,,,...,,,,,,,"Ct thorax with contrast MRI lumbar cervical,...",Unexplained neurological symptoms in the after...,,"Being active, just taking a shower, going out ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Fatigue worse after exertion. PEM. But never g...,,,,,,,,Worse on exertion and also suddenly at random ...,,...,,The first two weeks of the reaction were like ...,This one is just a normal reaction to having y...,"Mild olfactory hallucinations when crashing, A...",Again hard to untie from standard response to ...,Loss of feeling in bladder Loss of sexual fee...,Think I've ticked the right box re. urine but ...,,,Fatigue worse after exertion. PEM. But never g...
226,,,,,,,,,,,...,,,,,,,Serum protein electrophoresis,Pericardial effusion Gastroenteritis,Beta blockers PPIs,"Period Period, fo..."
227,before hormonal periods,after walking or trying to do things,,gets worse after eating,gets better when other symptoms abate,,depends on covid reinfections,depends on food eaten,depends on covid reinfection,depends on covid reinfection,...,depends of what I've eaten and covid reinfection,depends of what I've eaten and covid reinfection,depends of what I've eaten and covid reinfection,,depends of what I've eaten and covid reinfection,ear and nose cartilage seems to be breaking down,24 hr sodium,,,before hormonal periods after walking or tryin...
228,,Accompanied by heart palpitations,,,,Accompanied by heart palpitations,,,Accompanied by heart palpitations,"Exercise on an incline (e.g., going up stairs,...",...,,,,,,,Stress echocardiogram,,Fermented cod liver oil,Accompanied by heart palpitations Accompan...


In [100]:
vectorizer = TfidfVectorizer(lowercase=False, tokenizer=find_symptoms_adj, stop_words=None, preprocessor=None) # used to apply tfidf
tfidf_matrix = vectorizer.fit_transform(textual_data['combined_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())



['periods', 'post exertional malaise', 'periods', 'periods', 'sensory stimulation', 'video', 'phone', 'periods', 'music', 'sensory stimulation', 'periods', 'fatigue', 'hunger', 'periods', 'fatigue', 'sitting', 'day', 'relapse', 'relapse', 'post exertional malaise', 'relapse', 'relapse', 'relapse', 'sensory stimulation', 'video', 'phone', 'fatigue', 'sensory stimulation', 'video', 'phone', 'fine motor activities', 'standing', '', 'sensory stimulation', 'relapse', 'sensory stimulation', 'fatigue', 'relapse', 'numb hands', 'foot', '', 'folate']
[]
['night', 'fatigue', 'foot', 'hearing test', 'tinnitus', 'c reactive protein', 'esr', 'month', 'vaccine', 'tinnitus', 'iron supplements', 'didn t help', 'menthol', 'topical cream']
['periods']
['being active', 'shower', 'house', 'fatigue', 'tired', 'concentrate', 'conversation', 'rash', 'shingle', 'active', 'active i', 'body', 'sun', 'active', 'fatigue', 'day', 'active', 'fatigue', 'day', 'sitting', 'vaccine', 'i m tired', 'conversation', 'i m r

This is what the data looks like when it has been processed this way. It would be basically be converted into its symptoms/bio-medical data. The below contains a lot of noise, and will need to be cleaned further.

In [101]:
tfidf_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,insect,laser,once,passed,pem,silent,throat,102lbs,...,yoyo,zap,zap feeling i,zap like feeling,zap of energy,zeolite,zinc,zipping near my heart muscle,zolfran,zyrtec
0,0.066272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.153751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,0.030717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
226,0.240466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
227,0.010564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
228,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
