In [11]:
# Importing libraries

import pandas as pd
from pandas.io.stata import StataReader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import string
import re
from contractions import fix
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import spacy
import numpy as np
from warnings import simplefilter 


simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

Loading data + reading some libraries

In [12]:
data_encoded = pd.read_stata('ukcv.dta', convert_categoricals=False)
data_encoded.head()

Unnamed: 0,psid,supportname,age,gender,genderbirth,ethnicity,bmi,smoke,preautoimmunity,preautoimmunitymeds,...,dxwhy_dismiss,dxwhy_help,dxwhy_guide,dxwhy_research,dxwhy_aware,dxwhy_oth,story_open,consented,supportgroup,trigvax
0,907633-907615-96019409,UKCV Covid Vaccine - Long Haul Autoimmune Sup...,2,2,1.0,3,2,1,0,0,...,0,1,1,1,1,0,,10.0,1.0,1.0
1,907633-907615-96026239,Uk cv group,8,1,1.0,3,3,3,0,2,...,0,0,0,0,0,0,,10.0,1.0,1.0
2,907633-907615-96025984,,3,2,1.0,4,3,3,0,0,...,1,1,1,1,1,1,A day after my first (and only) Pfizer vaccine...,10.0,3.0,1.0
3,907633-907615-96027621,Uk Cv family,4,2,1.0,3,2,3,0,0,...,1,1,1,0,1,0,It was quite tricky to fill in the time line o...,10.0,1.0,1.0
4,907633-907615-96027090,Ukcv family Facebook,7,2,1.0,3,4,2,0,0,...,0,1,1,1,1,0,Ive had mixed experiences with my GP practice...,10.0,1.0,1.0


In [13]:
textual_data = data_encoded.select_dtypes(include=['object'])
textual_data = textual_data.drop(['psid', 'supportname', 'batchno', 'story_open'], axis=1)
textual_data = textual_data.applymap(lambda x: x.encode('latin-1').decode('cp1252'))

This below is the treatment dictionary. It contains all the terms that will be found/converted in the data.
The format works as the following:

treatment dict = {<br>
    &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;"column_name": ["variations"],<br>
}<br>

- "column_name": The name of the column that will be generated (these contain binary values)
- "variations": If any of these words appear in the text, it will show a binary value 1, for said column

In [14]:
treament_dict = {
    "omega 3 folate": ['omega 3 folate', "omega-3", "omega 3"],
    "iron supplements": ['iron supplements',],
    "menthol tropical cream": ['menthod topical cream',],
    "red light therapy": ["red light therapy",],
    "anti-inflammatory": ["anti-inflammatory supplement", "anti inflammatory supplement"],
    "dark chocolate": ['dark chocolate'],
    "green tea": ['green tea'],
    "niacin": ["niacin"],
    "aciclovir": ["aciclovir"],
    "invabradine": ["ivabradine"],
    "vitamin_b1": ["b1", "vitamin b1"],
    "vitamin_b2": ["b2", "vitamin b2"],
    "vitamin_b3": ["b3", "vitamin b3"],
    "vitamin_c": ["vitamin c", "vit c"],
    "vitamin_k": ["vitamin k", "vit k"],
    "vitamin_b6": ["vitamin b6", "vit b6"],
    "vitamin_e": ["vitamin e", "vit e"],
    "quercitin": ["quercitin"],
    "propanolol": ["propranolol"],
    "candesartan": ['candesartan'],
    "gluten_free": ['gluten_free_diet'],
    "CBD_oil": ["cbd oil", "cbd"],
    "amitriptyline": ["amitriptyline"],
    "florastor": ["florastor"],
    "pepcid": ["pepcid"],
    "tricinimonolone": ["triciniminolone"],
    "beta-blockers": ["beta blockers", "beta-blockers", "beta blocker"],
    "calcium": ["calcium", "k2"],
    "glucosamine": ["glucosamine"],
    "biotin": ["biotin"],
    "quercetin": ["quercetin", "quercatin"],
    "black seed oil": ["black seed oil"],
    "oxycodone": ["oxycodone"],
    "phenylalanine": ["dlpa", "phenylalanine"],
    "probiotic": ["probiotic"],
    "moringa_supplement": ["moringa supplement", "moringa", "moringa-supplement"],
    "tvns": ["transcutaneous vagus nerve stimulation", "tvns", "vagus nerve exercises", "vagal never stimulation"],
    "omalizumab": ["omalizumab"],
    "xolair_injections": ["xolair"],
    "ozone_infusion": ["ozone"],    
    "nad_infusion": ["NAD+"],
    "glutationine": ["glutationine"],
    "aripiprazole": ["aripriprazole"],
    "botox": ["botox"],
    "xanax": ["xanax"],
    "tizanidine": ["tizanidine"],
    "hypnosis": ["hypnosis"],
    "physio": ["physio"],
    "acupuncture": ["acupuncture"],
    "melatonin": ["melatonin"],
    "tumerical_capsules": ["tumeric capsules", "tumeric"],
    "hawthorn_tincture": ["hawthorn tincture"],
    "intestinal_binders": ["intestinal binders", "charcoal", "clay"],
    "fennel_tea": ["fennel tea"],
    "maraviroc": ["maraviroc"],
    "dandelion_tea": ["dandelion tea"],
    "tollovid": ["tollovid"],
    "sudafed": ["sudafed"],
    "clonazapem": ["clonazapen"],
    "blood_supplements": ["blood supplements"],
    "fludrocotisone": ["fludrocortisone"],
    "mestinon": ["mestinon"],
    "IV": ["iv"],
    "blood_donation": ["blood donation"],
    "cold water therapy": ["cold water therapy"],
    "homeopathy": ["homeopathic remedies"],
    "acetaminophen": ["acetaminophen"],
    "pantoprazole": ["pantoprazole"],
    "propionic acid": ["propionsäure", "propionic acid"],
    "herbs": ["herbs"],
    "palmitoylethanolamide": ["palmitoylethanolamide", "pea"],
    "butterburr": ["butterburr", "butter burr"],
    "nortriptylline": ["nortriptylline"],
    "prednisolone": ["prednisolone"],
    "magnesium": ["magnesium"],
    "aminophylline": ["aminophylline"],
    "selenium": ["selenium"],
    "phosphate": ["phosphate"],
    "pravastatin": ["pravastatin"],
    "pine_needle_drops": ["pine needle drops"],
    "prebiotics": ["prebiotics", "prebiotic"],
    "probiotics": ["probiotics", "probiotic"],
    "i-caritine": ["i-carnitine", "i carnitine"],
    "alpha_lipoic_acid": ["alpha lipeic acid", "alpha lipoic acid"],
    "beetroot_extract": ["beet root extract", "beetroot extract", "beets"],
    "lamotrigin": ["lamotrigin"],
    "antihistamines": ["antihistamines", "anti-histamines", "anti histamines", "claritin", "h1", "h2", "h3"],
    "montelukas": ["montelukas"],
    "mertazepine": ["mertazepine"],
    "escitalopram": ["escitalopram"],
    "gabapentin": ["gabapentin"],
    "lumbrokinase": ["lumbrokinase"],
    "cold_showers": ["cold shower", "cold showers"],
    "COQ10": ["COQ10"],
    "anti-vertigo_treatment": ["anti vertigo"],
    "antibiotics": ["antibiotics"],
    "chiropractic_treatment": ["chiropractic treatment"],
    "glutathione": ["glutathione"],
    "baking_soda": ["baking soda"],
    "essential_oils": ["essential oils"],
    "oestrogen": ["oestrogen"],
    "hrt": ["hrt"],
    "l_methylfolate": ["l-methylfolate", "methylfolate"],
    "meclizine": ["meclizine"],
    "midodrine": ["midodrine"],
    "zolfran": ["zolfran"],
    "progesterone": ["progesterone"],
    "diazepam": ["diazepam"],
    "exogenous_nitric_oxide": ["exogenous nitric oxide"],
    "ppq": ["ppq"],
    "bacopa": ["bacopa"],
    "berberine": ["berberine"],
    "ashwaganda": ["ashwaganda"],
    "leeks": ["leeks"],
    "ox_bile": ["ox bile", "ox-bile", "oxbile"],
    "bovine": ["bovine"],
    "silymarin": ["silymarin"],
    "serrapeptase": ["serrapeptase"],
    "pycogenol": ["pycogenol"],
    "triphala": ["triphala"],
    "low_carb_diet": ["low carb high protein diet"],
    "zeolite": ["zeolite"],
    "meditation": ["meditation"],
    "movement": ["exercise", "walking"],
    "fodmap_diet": ["fodmap"],
    "nac": ["nac"],
    "lugols_iodine": ["lugols iodine"],
    "levetiracetem": ["levetiracetam"],
    "manuka honey": ["manuka honey", "manuka"],
    "fish oil": ["fish oil", "fermented cod liver oil"],
    "mms": ["mms"], #ask about this one
    "dmso": ["dmso"],
    "biotoxin_binder": ["biotoxin binders", "biotoxin binder"],
    "molybdenum": ["molybdenum"],
    "pseudoephedrine": ["pseudoephedrine"],
    "phosphatidyl_serine": ["phosphatidyl serine"],
    "phosphatidyl_choline": ["phosphatidyl choline"],
    "ppis": ["ppis"]
}

Small function that will apply that'll make use of the above dictionary, and recode binary-wise

In [15]:
def create_binary_cols(df, col, prefix, word_dict):
    
    df[col] = df[col].str.lower() # lower-case the data
    df[col] = df[col].apply(lambda x: re.sub(r'\s+', ' ', x)) # removing extra spaces

    # loop creating the query and appying the word search
    for word in word_dict:
        
        col_name = f'{prefix}_{word}'
        regex = '|'.join([re.escape(word) for word in word_dict[word]])
        regex = rf".*\b({regex})\b.*"
        
        df[col_name] = np.where(df[col].str.match(regex), 1, 0) # creating binary column/recoding

Below shows the above code being run, and the created binary columns. I've left the original in for reference.

In [16]:
treatments = textual_data['tx_other_open'].to_frame().copy()
create_binary_cols(treatments, 'tx_other_open', 'tx', treament_dict)
treatments

Unnamed: 0,tx_other_open,tx_omega 3 folate,tx_iron supplements,tx_menthol tropical cream,tx_red light therapy,tx_anti-inflammatory,tx_dark chocolate,tx_green tea,tx_niacin,tx_aciclovir,...,tx_manuka honey,tx_fish oil,tx_mms,tx_dmso,tx_biotoxin_binder,tx_molybdenum,tx_pseudoephedrine,tx_phosphatidyl_serine,tx_phosphatidyl_choline,tx_ppis
0,omega 3 folate,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"iron supplements, didn’t help. menthol topical...",0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
226,beta blockers ppis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
227,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
228,fermented cod liver oil,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
