In [47]:
import csv
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Mute pandas performance warning
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)


In [48]:
df = pd.read_csv('WeInspectB.csv')

Manually Parsed Column Division

In [49]:
### Declaring known value sets ### 

#Empty Columns & Non Valueble columns 
    #(Dropping health concerns because all entries are yes)
emptyColumns = ['Unnamed: 42', 'Unnamed: 43','Health Concerns?'] 

#Location Columns
locationColumns = ['City','State','Zip']

#Syptom Columns (Columns that need to be one-hot encoded)
symptomColumns = ['Diagnoses', 
                'Eyes, Ears, Nose, & Throat',
                'Resipiratory', 'Digestive',
                'Circulatory', 
                'Skin', 
                'Brain', 
                'Nervous', 
                'Urinary', 
                'Immune', 
                'Reproductive']

#Health Information 
healthInfoColumns = [
        'Sum of the Logs (Group I)',
        'Sum of the Logs (Group II)',
        'ERMI Score (Group I - Group II)']

#Molds 
moldColumns = ['Aspergillus flavus/oryzae',
       'Aspergillus fumigatus', 'Aspergillus niger', 'Aspergillus ochraceus',
       'Aspergillus penicillioides', 'Aspergillus restrictus*',
       'Aspergillus sclerotiorum', 'Aspergillus sydowii', 'Aspergillus unguis',
       'Aspergillus versicolor', 'Aureobasidium pullulans',
       'Chaetomium globosum', 'Cladosporium sphaerospermum',
       'Eurotium (Asp.) amstelodami*', 'Paecilomyces variotii',
       'Penicillium brevicompactum', 'Penicillium corylophilum',
       'Penicillium crustosum*', 'Penicillium purpurogenum',
       'Penicillium spinulosum*', 'Penicillium variabile',
       'Scopulariopsis brevicaulis/fusca', 'Scopulariopsis chartarum',
       'Stachybotrys chartarum', 'Trichoderma viride*', 'Wallemia sebi','Acremonium strictum',
       'Alternaria alternata', 'Aspergillus ustus',
       'Cladosporium cladosporioides 1', 'Cladosporium cladosporioides 2',
       'Cladosporium herbarum', 'Epicoccum nigrum', 'Mucor amphibiorum*',
       'Penicillium chrysogenum', 'Rhizopus stolonifer']

# 0 Values 
zeroValues = ['ND','<1','nd','N/D','N D','Nd', 'Hhh', 'nan', 
               'Hh', 'ND, <1','NDNDND', 'na','Ne','NF','<','n/d',
               'o', 'BD','NaN', 'N d','ND,<2','1<']


Dataframe Parsing

In [50]:
# Creating a dataframe only handling symptom columns 
symptomDF = df[symptomColumns]
print("Symptom Dataframe Columns: ", len(symptomDF.columns))
print(symptomDF.columns)

Symptom Dataframe Columns:  11
Index(['Diagnoses', 'Eyes, Ears, Nose, & Throat', 'Resipiratory', 'Digestive',
       'Circulatory', 'Skin', 'Brain', 'Nervous', 'Urinary', 'Immune',
       'Reproductive'],
      dtype='object')


In [51]:
# Drop all rows where Zip , City, and State are missing
print("Inital Columns Count: ", len(df.columns))
df = df.drop(emptyColumns,axis=1)
print("Count after dropping useless Columns: ", len(df.columns))

Inital Columns Count:  56
Count after dropping useless Columns:  53


In [52]:
# Creating a dataframe only handling symptom columns 
print("Initial Rows before City/State/Zip Drop: ", len(df))
df = df.dropna(subset=['Zip', 'City', 'State'])
print("Rows after Dropping:" ,len(df))

Initial Rows before City/State/Zip Drop:  581
Rows after Dropping: 553


In [53]:

# Creating dataframe handling location Data 
locationDF = df[locationColumns]
print("Location Dataframe Columns: ", len(locationDF.columns) )
print(locationDF.columns)

Location Dataframe Columns:  3
Index(['City', 'State', 'Zip'], dtype='object')


In [54]:
# Creating dataframe handling health information columns 
healthInfoDF = df[healthInfoColumns]
print("Health Information Dataframe Columns: ", len(healthInfoDF.columns))
print(healthInfoDF.columns)

Health Information Dataframe Columns:  3
Index(['Sum of the Logs (Group I)', 'Sum of the Logs (Group II)',
       'ERMI Score (Group I - Group II)'],
      dtype='object')


In [55]:
# Creating dataframe handling mold
moldDF = df[moldColumns]
print("Mold Dataframe Columns: ", len(moldDF.columns))
print(moldDF.columns)

Mold Dataframe Columns:  36
Index(['Aspergillus flavus/oryzae', 'Aspergillus fumigatus',
       'Aspergillus niger', 'Aspergillus ochraceus',
       'Aspergillus penicillioides', 'Aspergillus restrictus*',
       'Aspergillus sclerotiorum', 'Aspergillus sydowii', 'Aspergillus unguis',
       'Aspergillus versicolor', 'Aureobasidium pullulans',
       'Chaetomium globosum', 'Cladosporium sphaerospermum',
       'Eurotium (Asp.) amstelodami*', 'Paecilomyces variotii',
       'Penicillium brevicompactum', 'Penicillium corylophilum',
       'Penicillium crustosum*', 'Penicillium purpurogenum',
       'Penicillium spinulosum*', 'Penicillium variabile',
       'Scopulariopsis brevicaulis/fusca', 'Scopulariopsis chartarum',
       'Stachybotrys chartarum', 'Trichoderma viride*', 'Wallemia sebi',
       'Acremonium strictum', 'Alternaria alternata', 'Aspergillus ustus',
       'Cladosporium cladosporioides 1', 'Cladosporium cladosporioides 2',
       'Cladosporium herbarum', 'Epicoccum nigrum'

One Hot Encode Symptoms

In [56]:
def oneHotEncodeSymptoms(symptomDF):
    # For every column in symptom DF columns, looking at each row value find all unique comma separated symptoms and add them to a list 
    for column in symptomDF.columns:
        systemSymptomList = (df[column]
                        .dropna()             # Drop any remaining NaNs (though there shouldn't be any after fillna)
                        .str.lower()          # Convert to lowercase
                        .str.replace(" ", "") # Remove spaces
                        .str.split(',')       # Split by commas
                        .explode()            # Explode lists to rows
                        .unique())            # Get unique values
        
        for symptom in systemSymptomList:
            newColumn = f"{column}_{symptom}"
            symptomDF[newColumn] = df[column].apply(lambda x: 1 if symptom in str(x).lower().replace(" ", "") else 0)
     
        systemSymptomList = []
        #print(systemSymptomList)
    return symptomDF

In [57]:
# One Hot Encoding Symptoms 
print("One Hot Encoding Symptom DF, Inital Columns: ", len(symptomDF.columns))

symptomDF = oneHotEncodeSymptoms(symptomDF)
print("One Hot Encoded Columns: ", len(symptomDF.columns))
symptomDF.astype(int)
symptomDF.head(5)

#One hot encoding complete 11 columns turned to 118 with binary values 

One Hot Encoding Symptom DF, Inital Columns:  11
One Hot Encoded Columns:  118


Unnamed: 0,Diagnoses,"Eyes, Ears, Nose, & Throat",Resipiratory,Digestive,Circulatory,Skin,Brain,Nervous,Urinary,Immune,...,Immune_chronicmonoorepstein-barrvirus,Immune_frequentherpesoutbreaks,Immune_viralinfectionsbecomebacterial,Immune_increasedsusceptibilitytocancer,Reproductive_changesinmenstrualcycle,Reproductive_inconsistentmenstrualcycle,Reproductive_vaginalyeastorbacterialinfections,Reproductive_jockitch,Reproductive_infertilityinbothgenders,Reproductive_hormoneimbalances
0,"Hyperthyroid/Hypothyroid, IBS, PCOS","Sneezing/Allergies, Dry/Irritated eyes, Chroni...",,"Diarrhea/Constipation, Bloating, Abdominal pai...","Raynaud's phenomenon, Low or reactive blood pr...","Itchy/Peeling skin, Skin rash, Fungal infections",Brain fog,"Anxiousness, Fatigue",Overactive bladder,,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
1,,"Sneezing/Allergies, Runny nose/Post-nasal drip...","Shortness of breath, Chronic dry cough, Chroni...","Significant weight gain/loss, Diarrhea/Constip...","Cherry angiomas, Easy bruising/bleeding","Itchy/Peeling skin, Skin rash, Eczema","Brain fog, Memory loss, Anger/Aggressiveness","Anxiousness, Depression, Headache, Migraine, I...",,Long-lasting colds,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,"CIRS, MCAS","Sneezing/Allergies, Runny nose/Post-nasal drip...","Shortness of breath, Heaviness in the chest","Significant weight gain/loss, Bloating, Sweet ...","Spider veins, Easy bruising/bleeding, Iron def...","Itchy/Peeling skin, Flushing, Fungal infections","Brain fog, Confusion, Memory loss, Trouble fin...","Anxiousness, Depression, Headache, Insomnia, F...",Overactive bladder,"Increased susceptibility to infection, Chronic...",...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,"Auto-Immune Disease, Lyme Disease, Hyperthyroi...",,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CIRS, Hyperthyroid/Hypothyroid, PCOS, Candidia...","Runny nose/Post-nasal drip, Ear popping/ringin...",,Significant weight gain/loss,"Spider veins, Cherry angiomas, Raynaud's pheno...",Eczema,"Brain fog, Trouble finding the right word, Ang...","Anxiousness, Fatigue",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
