In [1]:
# Importing libraries

import pandas as pd
from pandas.io.stata import StataReader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import string
import re
from contractions import fix
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import spacy
import numpy as np
from warnings import simplefilter 


simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Original Demo 2

Loading data + reading some libraries

In [2]:
data_encoded = pd.read_stata('ukcv.dta', convert_categoricals=False)
data_encoded.head()

Unnamed: 0,psid,supportname,age,gender,genderbirth,ethnicity,bmi,smoke,preautoimmunity,preautoimmunitymeds,...,dxwhy_dismiss,dxwhy_help,dxwhy_guide,dxwhy_research,dxwhy_aware,dxwhy_oth,story_open,consented,supportgroup,trigvax
0,907633-907615-96019409,UKCV Covid Vaccine - Long Haul Autoimmune Sup...,2,2,1.0,3,2,1,0,0,...,0,1,1,1,1,0,,10.0,1.0,1.0
1,907633-907615-96026239,Uk cv group,8,1,1.0,3,3,3,0,2,...,0,0,0,0,0,0,,10.0,1.0,1.0
2,907633-907615-96025984,,3,2,1.0,4,3,3,0,0,...,1,1,1,1,1,1,A day after my first (and only) Pfizer vaccine...,10.0,3.0,1.0
3,907633-907615-96027621,Uk Cv family,4,2,1.0,3,2,3,0,0,...,1,1,1,0,1,0,It was quite tricky to fill in the time line o...,10.0,1.0,1.0
4,907633-907615-96027090,Ukcv family Facebook,7,2,1.0,3,4,2,0,0,...,0,1,1,1,1,0,Ive had mixed experiences with my GP practice...,10.0,1.0,1.0


In [3]:
textual_data = data_encoded.select_dtypes(include=['object'])
textual_data = textual_data.drop(['psid', 'supportname', 'batchno', 'story_open'], axis=1)
textual_data = textual_data.applymap(lambda x: x.encode('latin-1').decode('cp1252'))

This below is the treatment dictionary. It contains all the terms that will be found/converted in the data.
The format works as the following:

treatment dict = {<br>
    &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;"column_name": ["variations"],<br>
}<br>

- "column_name": The name of the column that will be generated (these contain binary values)
- "variations": If any of these words appear in the text, it will show a binary value 1, for said column

In [4]:
treament_dict = {
    "omega 3 folate": ['omega 3 folate', "omega-3", "omega 3"],
    "iron supplements": ['iron supplements',],
    "menthol tropical cream": ['menthod topical cream',],
    "red light therapy": ["red light therapy",],
    "anti-inflammatory": ["anti-inflammatory supplement", "anti inflammatory supplement"],
    "dark chocolate": ['dark chocolate'],
    "green tea": ['green tea'],
    "niacin": ["niacin"],
    "aciclovir": ["aciclovir"],
    "invabradine": ["ivabradine"],
    "vitamin_b1": ["b1", "vitamin b1"],
    "vitamin_b2": ["b2", "vitamin b2"],
    "vitamin_b3": ["b3", "vitamin b3"],
    "vitamin_c": ["vitamin c", "vit c"],
    "vitamin_k": ["vitamin k", "vit k"],
    "vitamin_b6": ["vitamin b6", "vit b6"],
    "vitamin_e": ["vitamin e", "vit e"],
    "quercitin": ["quercitin"],
    "propanolol": ["propranolol"],
    "candesartan": ['candesartan'],
    "gluten_free": ['gluten_free_diet'],
    "CBD_oil": ["cbd oil", "cbd"],
    "amitriptyline": ["amitriptyline"],
    "florastor": ["florastor"],
    "pepcid": ["pepcid"],
    "tricinimonolone": ["triciniminolone"],
    "beta-blockers": ["beta blockers", "beta-blockers", "beta blocker"],
    "calcium": ["calcium", "k2"],
    "glucosamine": ["glucosamine"],
    "biotin": ["biotin"],
    "quercetin": ["quercetin", "quercatin"],
    "black seed oil": ["black seed oil"],
    "oxycodone": ["oxycodone"],
    "phenylalanine": ["dlpa", "phenylalanine"],
    "probiotic": ["probiotic"],
    "moringa_supplement": ["moringa supplement", "moringa", "moringa-supplement"],
    "tvns": ["transcutaneous vagus nerve stimulation", "tvns", "vagus nerve exercises", "vagal never stimulation"],
    "omalizumab": ["omalizumab"],
    "xolair_injections": ["xolair"],
    "ozone_infusion": ["ozone"],    
    "nad_infusion": ["NAD+"],
    "glutationine": ["glutationine"],
    "aripiprazole": ["aripriprazole"],
    "botox": ["botox"],
    "xanax": ["xanax"],
    "tizanidine": ["tizanidine"],
    "hypnosis": ["hypnosis"],
    "physio": ["physio"],
    "acupuncture": ["acupuncture"],
    "melatonin": ["melatonin"],
    "tumerical_capsules": ["tumeric capsules", "tumeric"],
    "hawthorn_tincture": ["hawthorn tincture"],
    "intestinal_binders": ["intestinal binders", "charcoal", "clay"],
    "fennel_tea": ["fennel tea"],
    "maraviroc": ["maraviroc"],
    "dandelion_tea": ["dandelion tea"],
    "tollovid": ["tollovid"],
    "sudafed": ["sudafed"],
    "clonazapem": ["clonazapen"],
    "blood_supplements": ["blood supplements"],
    "fludrocotisone": ["fludrocortisone"],
    "mestinon": ["mestinon"],
    "IV": ["iv"],
    "blood_donation": ["blood donation"],
    "cold water therapy": ["cold water therapy"],
    "homeopathy": ["homeopathic remedies"],
    "acetaminophen": ["acetaminophen"],
    "pantoprazole": ["pantoprazole"],
    "propionic acid": ["propionsäure", "propionic acid"],
    "herbs": ["herbs"],
    "palmitoylethanolamide": ["palmitoylethanolamide", "pea"],
    "butterburr": ["butterburr", "butter burr"],
    "nortriptylline": ["nortriptylline"],
    "prednisolone": ["prednisolone"],
    "magnesium": ["magnesium"],
    "aminophylline": ["aminophylline"],
    "selenium": ["selenium"],
    "phosphate": ["phosphate"],
    "pravastatin": ["pravastatin"],
    "pine_needle_drops": ["pine needle drops"],
    "prebiotics": ["prebiotics", "prebiotic"],
    "probiotics": ["probiotics", "probiotic"],
    "i-caritine": ["i-carnitine", "i carnitine"],
    "alpha_lipoic_acid": ["alpha lipeic acid", "alpha lipoic acid"],
    "beetroot_extract": ["beet root extract", "beetroot extract", "beets"],
    "lamotrigin": ["lamotrigin"],
    "antihistamines": ["antihistamines", "anti-histamines", "anti histamines", "claritin", "h1", "h2", "h3"],
    "montelukas": ["montelukas"],
    "mertazepine": ["mertazepine"],
    "escitalopram": ["escitalopram"],
    "gabapentin": ["gabapentin"],
    "lumbrokinase": ["lumbrokinase"],
    "cold_showers": ["cold shower", "cold showers"],
    "COQ10": ["COQ10"],
    "anti-vertigo_treatment": ["anti vertigo"],
    "antibiotics": ["antibiotics"],
    "chiropractic_treatment": ["chiropractic treatment"],
    "glutathione": ["glutathione"],
    "baking_soda": ["baking soda"],
    "essential_oils": ["essential oils"],
    "oestrogen": ["oestrogen"],
    "hrt": ["hrt"],
    "l_methylfolate": ["l-methylfolate", "methylfolate"],
    "meclizine": ["meclizine"],
    "midodrine": ["midodrine"],
    "zolfran": ["zolfran"],
    "progesterone": ["progesterone"],
    "diazepam": ["diazepam"],
    "exogenous_nitric_oxide": ["exogenous nitric oxide"],
    "ppq": ["ppq"],
    "bacopa": ["bacopa"],
    "berberine": ["berberine"],
    "ashwaganda": ["ashwaganda"],
    "leeks": ["leeks"],
    "ox_bile": ["ox bile", "ox-bile", "oxbile"],
    "bovine": ["bovine"],
    "silymarin": ["silymarin"],
    "serrapeptase": ["serrapeptase"],
    "pycogenol": ["pycogenol"],
    "triphala": ["triphala"],
    "low_carb_diet": ["low carb high protein diet"],
    "zeolite": ["zeolite"],
    "meditation": ["meditation"],
    "movement": ["exercise", "walking"],
    "fodmap_diet": ["fodmap"],
    "nac": ["nac"],
    "lugols_iodine": ["lugols iodine"],
    "levetiracetem": ["levetiracetam"],
    "manuka honey": ["manuka honey", "manuka"],
    "fish oil": ["fish oil", "fermented cod liver oil"],
    "mms": ["mms"], #ask about this one
    "dmso": ["dmso"],
    "biotoxin_binder": ["biotoxin binders", "biotoxin binder"],
    "molybdenum": ["molybdenum"],
    "pseudoephedrine": ["pseudoephedrine"],
    "phosphatidyl_serine": ["phosphatidyl serine"],
    "phosphatidyl_choline": ["phosphatidyl choline"],
    "ppis": ["ppis"]
}

Small function that will apply that'll make use of the above dictionary, and recode binary-wise

In [5]:
def create_binary_cols(df, col, prefix, word_dict):
    
    df[col] = df[col].str.lower() # lower-case the data
    df[col] = df[col].apply(lambda x: re.sub(r'\s+', ' ', x)) # removing extra spaces

    # loop creating the query and appying the word search
    for word in word_dict:
        
        col_name = f'{prefix}_{word}'
        regex = '|'.join([re.escape(word) for word in word_dict[word]])
        regex = rf".*\b({regex})\b.*"
        
        df[col_name] = np.where(df[col].str.match(regex), 1, 0) # creating binary column/recoding

Below shows the above code being run, and the created binary columns. I've left the original in for reference.

In [6]:
treatments = textual_data['tx_other_open'].to_frame().copy()
create_binary_cols(treatments, 'tx_other_open', 'tx', treament_dict)
treatments

Unnamed: 0,tx_other_open,tx_omega 3 folate,tx_iron supplements,tx_menthol tropical cream,tx_red light therapy,tx_anti-inflammatory,tx_dark chocolate,tx_green tea,tx_niacin,tx_aciclovir,...,tx_manuka honey,tx_fish oil,tx_mms,tx_dmso,tx_biotoxin_binder,tx_molybdenum,tx_pseudoephedrine,tx_phosphatidyl_serine,tx_phosphatidyl_choline,tx_ppis
0,omega 3 folate,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"iron supplements, didn’t help. menthol topical...",0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
226,beta blockers ppis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
227,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
228,fermented cod liver oil,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# Demo 2 - Added Conversions

Some of the columns might already appear in the data - and would need to be recoded - i.e. it wouldn't generate a binary column for those values<br>
Below I'm going to use a smaller demo dataset, to show how it works, after I will apply it to the actual dataset.

The code below is similar to the above, but with adjusted workings:<br>

The dictionary in this case would have the column names from already existing columns, and the words the would be categorised into them i.e.:<br>

dict = {<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;"arb_txhelp": ["candesartan"],<br>
}<br>

The way the recoding words is a little different. If the user already has some answer in that part of the form, no changes are made. However also no new binary column is generated, since it already exists.<br>

Similarly if in the original column the data is shown as missing, but there is a match found in the text columns. It will recode it to 8, which means "unsure" (8 is the labelling given to "unsure" in the given dataset)<br>

The below function applies the logic above.

In [20]:
def repeat_conversions(df, col, word_dict):
    
    # df[col] = df[col].str.lower() # lower-case the data
    # df[col] = df[col].apply(lambda x: re.sub(r'\s+', ' ', x)) # removing extra spaces

    for word in word_dict:
        regex = '|'.join([re.escape(word) for word in word_dict[word]])
        regex = rf".*\b({regex})\b.*"
        
        for index, row in df.iterrows():
            if pd.isna(row[word]) and re.search(regex, row[col]): # If empty + matches search criteria
                df.loc[index, word] = 8 # Replacing the value, with 8
                print(f"Row {index} for column {word} has been adjusted")

Below I've given a small demo with a smaller generated dataset. Its a little difficult to show how it works in the larger/actual dataset

I've generated a small dataset below with two missing values, represented by NaN.

In [21]:
demo_dataset = {
    "fruits_grape?": [1, 3, 2, 1, 4, None],
    "fruits_apple?": [2, None, 2, 5, 4, 5],
    "fruit_type": ["banana", "apple", "pear", "orange", "watermelon", "pink grapes"]
}

demo_df = pd.DataFrame(demo_dataset)
demo_df

Unnamed: 0,fruits_grape?,fruits_apple?,fruit_type
0,1.0,2.0,banana
1,3.0,,apple
2,2.0,2.0,pear
3,1.0,5.0,orange
4,4.0,4.0,watermelon
5,,5.0,pink grapes


Below is the dictionary that will apply transformations to the original columns.

The below dictionary is saying if the word grape appears in the fruit_types column, and the value in the "fruits_grape?" column is empty. It should be coded to 8 (for "unsure"). This applies to the same with apple.
You can see in the resulting table below, how no new binary column was generated.

In [24]:
fruit_duplicates = {
    "fruits_grape?": ["grapes"],
    "fruits_apple?": ["apple"]
}

repeat_conversions(demo_df, 'fruit_type', fruit_duplicates)
demo_df

Unnamed: 0,fruits_grape?,fruits_apple?,fruit_type
0,1.0,2.0,banana
1,3.0,8.0,apple
2,2.0,2.0,pear
3,1.0,5.0,orange
4,4.0,4.0,watermelon
5,8.0,5.0,pink grapes


Likewise I can apply the code that generates binary columns to this example. It'll make new columns based on the dictionary in the code below (and as shown in the original example).

You may notice no column for "orange" was generated, since it wasn't specified.

In [28]:
fruit_dict = {
    "banana?": ["banana"],
    "pear?": ["pear"],
    "watermelon?": ["watermelon"]
}

create_binary_cols(demo_df, 'fruit_type', 'fruits', fruit_dict)
demo_df

Unnamed: 0,fruits_grape?,fruits_apple?,fruit_type,fruits_banana?,fruits_pear?,fruits_watermelon?
0,1.0,2.0,banana,1,0,0
1,3.0,8.0,apple,0,0,0
2,2.0,2.0,pear,0,1,0
3,1.0,5.0,orange,0,0,0
4,4.0,4.0,watermelon,0,0,1
5,8.0,5.0,pink grapes,0,0,0


This logic is now being applied to the actual dataset

In [None]:
treatments_duplicates = {
    "arb_txhelp": ["candesartan"],
    "mc_txhelp": ["omalizumab", "xolair", "montelukas"],
    "opiates_txhelp": ["oxycodone"],
    "antidep_txhelp": ["xanax", "mertazepine", "escitalopram", "diazepam"],
    "vene_txhelp": ["blood donation"],
    "steroids_txhelp": ["prednisolone"],
    "statins_txhelp": ["pravastatin"],
    "aisupp_txhelp": ["antihistamines", "anti-histamines", "anti histamines", "claritin", "h1", "h2", "h3", "pepcid"],
    # "fibrinolytic_supplement": ["lumbrokinase", "serrapeptase"],
    "coq10_txhelp": ["COQ10"],
    "folate_txhelp": ["l-methylfolate", "methylfolate", "folate"],
    "keto_txhelp": ["low carb high protein diet"],
    "nac_txhelp": ["nac"],
    "aisupp_txhelp": ["anti-inflammatory supplement", "anti inflammatory supplement", "tumeric capsules", "tumeric", "blood supplements", "glutathione"],
    "mg_txhelp": ["magnesium"],
}

BEFORE:

The person in record 119 had a missing value for arb_txhelp. However mentioned "candesartan" in "tx_other_open". This would need to be recoded.

In [34]:
data_encoded[["arb_txhelp", "tx_other_open"]].iloc[119].to_frame()

Unnamed: 0,119
arb_txhelp,
tx_other_open,"Nortriptylline, candesartan Every time I go..."


The below is apply the recodings based on the dictionary above. 

In [9]:
copy_treatments = textual_data['tx_other_open'].to_frame().copy()
data_copy = data_encoded.copy()

repeat_conversions(data_copy, 'tx_other_open', treatments_duplicates)

Row 119 for column arb_txhelp has been adjusted
Row 141 for column mc_txhelp has been adjusted
Row 52 for column antidep_txhelp has been adjusted
Row 154 for column coq10_txhelp has been adjusted
Row 174 for column mg_txhelp has been adjusted
Row 221 for column mg_txhelp has been adjusted


AFTER:

Now has the label for "unsure" which is 8.

In [35]:
data_copy[["arb_txhelp", "tx_other_open"]].iloc[119].to_frame()

Unnamed: 0,119
arb_txhelp,8.0
tx_other_open,"Nortriptylline, candesartan Every time I go..."
