In [246]:
import csv
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [247]:
df = pd.read_csv('WeInspectB.csv')

Manually Parsed Column Division

In [248]:
### Declaring known value sets ### 

#Empty Columns & Non Valueble columns 
    #(Dropping health concerns because all entries are yes)
emptyColumns = ['Unnamed: 42', 'Unnamed: 43','Health Concerns?'] 

#Location Columns
locationColumns = ['City','State','Zip']

#Syptom Columns (Columns that need to be one-hot encoded)
symptomColumns = ['Diagnoses', 
                'Eyes, Ears, Nose, & Throat',
                'Resipiratory', 'Digestive',
                'Circulatory', 
                'Skin', 
                'Brain', 
                'Nervous', 
                'Urinary', 
                'Immune', 
                'Reproductive']

#Health Information 
healthInfoColumns = [
        'Sum of the Logs (Group I)',
        'Sum of the Logs (Group II)',
        'ERMI Score (Group I - Group II)']

#Molds 
moldColumns = ['Aspergillus flavus/oryzae',
       'Aspergillus fumigatus', 'Aspergillus niger', 'Aspergillus ochraceus',
       'Aspergillus penicillioides', 'Aspergillus restrictus*',
       'Aspergillus sclerotiorum', 'Aspergillus sydowii', 'Aspergillus unguis',
       'Aspergillus versicolor', 'Aureobasidium pullulans',
       'Chaetomium globosum', 'Cladosporium sphaerospermum',
       'Eurotium (Asp.) amstelodami*', 'Paecilomyces variotii',
       'Penicillium brevicompactum', 'Penicillium corylophilum',
       'Penicillium crustosum*', 'Penicillium purpurogenum',
       'Penicillium spinulosum*', 'Penicillium variabile',
       'Scopulariopsis brevicaulis/fusca', 'Scopulariopsis chartarum',
       'Stachybotrys chartarum', 'Trichoderma viride*', 'Wallemia sebi','Acremonium strictum',
       'Alternaria alternata', 'Aspergillus ustus',
       'Cladosporium cladosporioides 1', 'Cladosporium cladosporioides 2',
       'Cladosporium herbarum', 'Epicoccum nigrum', 'Mucor amphibiorum*',
       'Penicillium chrysogenum', 'Rhizopus stolonifer']

# 0 Values 
zeroValues = ['ND','<1','nd','N/D','N D','Nd', 'Hhh', 'nan', 
               'Hh', 'ND, <1','NDNDND', 'na','Ne','NF','<','n/d',
               'o', 'BD','NaN', 'N d','ND,<2','1<']


Dataframe Parsing

In [249]:
# Creating a dataframe only handling symptom columns 
symptomDF = df[symptomColumns]
print("Symptom Dataframe Columns: ", len(symptomDF.columns))
print(symptomDF.columns)

Symptom Dataframe Columns:  11
Index(['Diagnoses', 'Eyes, Ears, Nose, & Throat', 'Resipiratory', 'Digestive',
       'Circulatory', 'Skin', 'Brain', 'Nervous', 'Urinary', 'Immune',
       'Reproductive'],
      dtype='object')


In [250]:
# Drop all rows where Zip , City, and State are missing
print("Inital Columns Count: ", len(df.columns))
df = df.drop(emptyColumns,axis=1)
print("Count after dropping useless Columns: ", len(df.columns))

Inital Columns Count:  56
Count after dropping useless Columns:  53


In [251]:
# Creating a dataframe only handling symptom columns 
print("Initial Rows before City/State/Zip Drop: ", len(df))
df = df.dropna(subset=['Zip', 'City', 'State'])
print("Rows after Dropping:" ,len(df))

Initial Rows before City/State/Zip Drop:  581
Rows after Dropping: 553


In [252]:

# Creating dataframe handling location Data 
locationDF = df[locationColumns]
print("Location Dataframe Columns: ", len(locationDF.columns) )
print(locationDF.columns)

Location Dataframe Columns:  3
Index(['City', 'State', 'Zip'], dtype='object')


In [253]:
# Creating dataframe handling health information columns 
healthInfoDF = df[healthInfoColumns]
print("Health Information Dataframe Columns: ", len(healthInfoDF.columns))
print(healthInfoDF.columns)

Health Information Dataframe Columns:  3
Index(['Sum of the Logs (Group I)', 'Sum of the Logs (Group II)',
       'ERMI Score (Group I - Group II)'],
      dtype='object')


In [254]:
# Creating dataframe handling mold
moldDF = df[moldColumns]
print("Mold Dataframe Columns: ", len(moldDF.columns))
print(moldDF.columns)

Mold Dataframe Columns:  36
Index(['Aspergillus flavus/oryzae', 'Aspergillus fumigatus',
       'Aspergillus niger', 'Aspergillus ochraceus',
       'Aspergillus penicillioides', 'Aspergillus restrictus*',
       'Aspergillus sclerotiorum', 'Aspergillus sydowii', 'Aspergillus unguis',
       'Aspergillus versicolor', 'Aureobasidium pullulans',
       'Chaetomium globosum', 'Cladosporium sphaerospermum',
       'Eurotium (Asp.) amstelodami*', 'Paecilomyces variotii',
       'Penicillium brevicompactum', 'Penicillium corylophilum',
       'Penicillium crustosum*', 'Penicillium purpurogenum',
       'Penicillium spinulosum*', 'Penicillium variabile',
       'Scopulariopsis brevicaulis/fusca', 'Scopulariopsis chartarum',
       'Stachybotrys chartarum', 'Trichoderma viride*', 'Wallemia sebi',
       'Acremonium strictum', 'Alternaria alternata', 'Aspergillus ustus',
       'Cladosporium cladosporioides 1', 'Cladosporium cladosporioides 2',
       'Cladosporium herbarum', 'Epicoccum nigrum'

One Hot Encode

In [255]:
def oneHotEncodeSymptoms(symptomDF_input):
    for i in symptomDF_input.columns:
        systemSymptomList = (df[i].dropna()   # Drop any remaining NaNs (though there shouldn't be any after fillna)
                        .str.lower()         # Convert to lowercase
                        .str.replace(" ", "") # Remove spaces
                        .str.split(',')      # Split by commas
                        .explode()           # Explode lists to rows
                        .unique())           # Get unique values
        
        for j in systemSymptomList:
            newColumn = str(i) + "_" + str(j)
            df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)
        systemSymptomList = []
    return df
    


In [256]:
# One Hot Encoding Symptoms 
print("One Hot Encoding Symptom DF, Inital Columns: ", len(symptomDF.columns))

symptomDF = oneHotEncodeSymptoms(symptomDF)
pd.set_option('display.max_columns', None)  # Display all columns
print(symptomDF.columns)

#Fixing split entry 'candidiasis(i.e. mouth, genitals, skin, internal)'



#split_symptomDF = oneHotEncodeSymptoms(symptomDF)
#print("One Hot Encoded Columns: ", len(split_symptomDF.columns))

One Hot Encoding Symptom DF, Inital Columns:  11


  df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)
  df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)
  df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)
  df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)
  df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)
  df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)
  df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)
  df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)
  df[newColumn] = df[i].apply(lambda x: 1 if j in str(x).lower().replace(" ", "") else 0)


Index(['City', 'State', 'Zip', 'Aspergillus flavus/oryzae',
       'Aspergillus fumigatus', 'Aspergillus niger', 'Aspergillus ochraceus',
       'Aspergillus penicillioides', 'Aspergillus restrictus*',
       'Aspergillus sclerotiorum',
       ...
       'Immune_chronicmonoorepstein-barrvirus',
       'Immune_frequentherpesoutbreaks',
       'Immune_viralinfectionsbecomebacterial',
       'Immune_increasedsusceptibilitytocancer',
       'Reproductive_changesinmenstrualcycle',
       'Reproductive_inconsistentmenstrualcycle',
       'Reproductive_vaginalyeastorbacterialinfections',
       'Reproductive_jockitch', 'Reproductive_infertilityinbothgenders',
       'Reproductive_hormoneimbalances'],
      dtype='object', length=160)