In [469]:
import csv
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Mute pandas performance warning
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
warnings.filterwarnings("ignore", category=pd.core.generic.SettingWithCopyWarning)




In [470]:
df = pd.read_csv('WeInspectB.csv')
df.shape

(581, 56)

In [471]:

nan_count = np.sum(df.isnull(), axis = 0)
print(nan_count)

City                                  5
State                                23
Zip                                   0
Aspergillus flavus/oryzae             0
Aspergillus fumigatus                 0
Aspergillus niger                     0
Aspergillus ochraceus                 0
Aspergillus penicillioides          350
Aspergillus restrictus*               0
Aspergillus sclerotiorum              0
Aspergillus sydowii                   0
Aspergillus unguis                    0
Aspergillus versicolor                0
Aureobasidium pullulans               0
Chaetomium globosum                   0
Cladosporium sphaerospermum           0
Eurotium (Asp.) amstelodami*          0
Paecilomyces variotii                 0
Penicillium brevicompactum            0
Penicillium corylophilum              0
Penicillium crustosum*                0
Penicillium purpurogenum              0
Penicillium spinulosum*               0
Penicillium variabile                 0
Scopulariopsis brevicaulis/fusca      0


Manually Parsed Column Division

In [472]:
### Declaring known value sets ### 

#Empty Columns & Non Valueble columns 
    #(Dropping health concerns because all entries are yes)
emptyColumns = ['Unnamed: 42', 'Unnamed: 43','Health Concerns?'] 

#Location Columns
locationColumns = ['City','State','Zip']

#Syptom Columns (Columns that need to be one-hot encoded)
symptomColumns = ['Diagnoses', 
                'Eyes, Ears, Nose, & Throat',
                'Resipiratory', 'Digestive',
                'Circulatory', 
                'Skin', 
                'Brain', 
                'Nervous', 
                'Urinary', 
                'Immune', 
                'Reproductive']

#Health Information 
healthInfoColumns = [
        'Sum of the Logs (Group I)',
        'Sum of the Logs (Group II)',
        'ERMI Score (Group I - Group II)']

#Molds 
moldColumns = ['Aspergillus flavus/oryzae',
       'Aspergillus fumigatus', 'Aspergillus niger', 'Aspergillus ochraceus',
       'Aspergillus penicillioides', 'Aspergillus restrictus*',
       'Aspergillus sclerotiorum', 'Aspergillus sydowii', 'Aspergillus unguis',
       'Aspergillus versicolor', 'Aureobasidium pullulans',
       'Chaetomium globosum', 'Cladosporium sphaerospermum',
       'Eurotium (Asp.) amstelodami*', 'Paecilomyces variotii',
       'Penicillium brevicompactum', 'Penicillium corylophilum',
       'Penicillium crustosum*', 'Penicillium purpurogenum',
       'Penicillium spinulosum*', 'Penicillium variabile',
       'Scopulariopsis brevicaulis/fusca', 'Scopulariopsis chartarum',
       'Stachybotrys chartarum', 'Trichoderma viride*', 'Wallemia sebi','Acremonium strictum',
       'Alternaria alternata', 'Aspergillus ustus',
       'Cladosporium cladosporioides 1', 'Cladosporium cladosporioides 2',
       'Cladosporium herbarum', 'Epicoccum nigrum', 'Mucor amphibiorum*',
       'Penicillium chrysogenum', 'Rhizopus stolonifer']

Dataframe Parsing

In [473]:
# Drop all rows where Zip , City, and State are missing
print("Inital Columns Count: ", len(df.columns))
df = df.drop(emptyColumns,axis=1)
print("Count after dropping useless Columns: ", len(df.columns))

Inital Columns Count:  56
Count after dropping useless Columns:  53


In [474]:
# Creating a dataframe only handling symptom columns 
print("Initial Rows before City/State/Zip Drop: ", len(df))
df = df.dropna(subset=['Zip', 'City', 'State'])
print("Rows after Dropping:" ,len(df))

Initial Rows before City/State/Zip Drop:  581
Rows after Dropping: 553


In [475]:

# Creating dataframe handling location Data 
locationDF = df[locationColumns]
print("Location Dataframe Columns: ", len(locationDF.columns) )
print(locationDF.shape)

Location Dataframe Columns:  3
(553, 3)


In [476]:
# Creating dataframe handling health information columns 
healthInfoDF = df[healthInfoColumns]
print("Health Information Dataframe Columns: ", len(healthInfoDF.columns))
print(healthInfoDF.shape)

Health Information Dataframe Columns:  3
(553, 3)


In [477]:
# Creating dataframe handling mold
moldDF = df[moldColumns]
print("Mold Dataframe Columns: ", len(moldDF.columns))
print(moldDF.columns)
nan_count = np.sum(df.isnull(), axis = 0)

Mold Dataframe Columns:  36
Index(['Aspergillus flavus/oryzae', 'Aspergillus fumigatus',
       'Aspergillus niger', 'Aspergillus ochraceus',
       'Aspergillus penicillioides', 'Aspergillus restrictus*',
       'Aspergillus sclerotiorum', 'Aspergillus sydowii', 'Aspergillus unguis',
       'Aspergillus versicolor', 'Aureobasidium pullulans',
       'Chaetomium globosum', 'Cladosporium sphaerospermum',
       'Eurotium (Asp.) amstelodami*', 'Paecilomyces variotii',
       'Penicillium brevicompactum', 'Penicillium corylophilum',
       'Penicillium crustosum*', 'Penicillium purpurogenum',
       'Penicillium spinulosum*', 'Penicillium variabile',
       'Scopulariopsis brevicaulis/fusca', 'Scopulariopsis chartarum',
       'Stachybotrys chartarum', 'Trichoderma viride*', 'Wallemia sebi',
       'Acremonium strictum', 'Alternaria alternata', 'Aspergillus ustus',
       'Cladosporium cladosporioides 1', 'Cladosporium cladosporioides 2',
       'Cladosporium herbarum', 'Epicoccum nigrum'

In [478]:
# Creating a dataframe only handling symptom columns 
symptomDF = df[symptomColumns]
print("Symptom Dataframe Columns: ", len(symptomDF.columns))
print(symptomDF.columns)

Symptom Dataframe Columns:  11
Index(['Diagnoses', 'Eyes, Ears, Nose, & Throat', 'Resipiratory', 'Digestive',
       'Circulatory', 'Skin', 'Brain', 'Nervous', 'Urinary', 'Immune',
       'Reproductive'],
      dtype='object')


One Hot Encode Symptoms

In [479]:
def oneHotEncodeSymptoms(symptomDF):
    # For every column(body system) in symptom DF columns
    for column in symptomDF.columns:
        systemSymptomList = (df[column]
                        .dropna()             # Drop any remaining NaNs (though there shouldn't be any after fillna)
                        .str.lower()          # Convert to lowercase
                        .str.replace(" ", "") # Remove spaces
                        .str.split(',')       # Split by commas
                        .explode()            # Explode lists to rows
                        .unique())            # Get unique values
        # For every symptom in the list, create [body_system]_[symptom]
        # and for every row enter 1 if symptom string exists in element and 0 if not 
        for symptom in systemSymptomList:
            newColumn = f"{column}_{symptom}"
            symptomDF[newColumn] = df[column].apply(lambda x: 1 if symptom in str(x).lower().replace(" ", "") else 0)
        systemSymptomList = []
    return symptomDF

In [480]:
# One Hot Encoding Symptoms 
print("One Hot Encoding Symptom DF, Inital Columns: ", symptomDF.shape)

symptomDF = oneHotEncodeSymptoms(symptomDF)
print("One Hot Encoded Columns: ", symptomDF.shape)
columns = ['Diagnoses', 'Eyes, Ears, Nose, & Throat', 'Resipiratory', 'Digestive', 'Circulatory', 'Skin', 'Brain', 
           'Nervous', 'Urinary', 'Immune', 'Reproductive']

drop = ["Diagnoses_mouth",
"Diagnoses_genitals",
"Diagnoses_skin",
"Diagnoses_internal)"]

#Drop all original and extra columns 
columns = columns + drop
symptomDF.drop(columns=columns,inplace=True)

print("One Hot Encoded Columns, dropped: ", symptomDF.shape)

symptomDF.shape

One Hot Encoding Symptom DF, Inital Columns:  (553, 11)
One Hot Encoded Columns:  (553, 118)
One Hot Encoded Columns, dropped:  (553, 103)


(553, 103)

In [481]:
symptomDF.head(5)

Unnamed: 0,Diagnoses_hyperthyroid/hypothyroid,Diagnoses_ibs,Diagnoses_pcos,Diagnoses_cirs,Diagnoses_mcas,Diagnoses_auto-immunedisease,Diagnoses_lymedisease,Diagnoses_pots,Diagnoses_chronicfatiguesyndrome,Diagnoses_candidiasis(i.e.,...,Immune_chronicmonoorepstein-barrvirus,Immune_frequentherpesoutbreaks,Immune_viralinfectionsbecomebacterial,Immune_increasedsusceptibilitytocancer,Reproductive_changesinmenstrualcycle,Reproductive_inconsistentmenstrualcycle,Reproductive_vaginalyeastorbacterialinfections,Reproductive_jockitch,Reproductive_infertilityinbothgenders,Reproductive_hormoneimbalances
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,0,0,0,1,1,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,1
3,1,1,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Cleaning Mold Columns

In [482]:
values_to_replace=['ND','<1','nd','N/D','N D','Nd', 'Hhh', 'nan', 'Hh', 'ND, <1','NDNDND', 'na','Ne','NF','<','n/d','o', 'BD','NaN', 'N d', 'ND<1','ND<2', 'ND<3', 'Donotseelisted','NE', '0`', '<!', 'I9', '1<', 'NE', '4/40', '1/10', 'O', '>1','NS', '3/30', 'ND ', 'ND']

for i in moldColumns:
    for j in values_to_replace:
        moldDF[i] = moldDF[i].replace(j, '0')
        moldDF[i] = moldDF[i].fillna(0)
        moldDF[i] = moldDF[i].astype(str)
        moldDF[i] = moldDF[i].str.replace('*', '')
        moldDF[i] = moldDF[i].str.replace(' ', '')
        moldDF[i] = moldDF[i].str.replace(',', '')
        moldDF[i] = moldDF[i].str.replace('.', '')

def f(data):
    print(data)
    return data

for column in moldDF.columns: 
    #print(column)
    for row in moldDF[column].items():
        if not row[1].isnumeric() :
            print(row[1])
            
#Function above prints nothing, which means everything is numeric now 



moldDF.shape

(553, 36)

One hot encoding City and State Columns, Not accounting zipcode 

In [483]:
locationDF.shape

(553, 3)

In [484]:
#Before 
locationDF.head(5)

Unnamed: 0,City,State,Zip
0,Mountain Brook,Alabama,35223
1,Wasilla,Alaska,99623
2,Soldotna,Alaska,99669-8654
3,Soldotna,Alaska,99669
4,Anchorage,Alaska,99508-4014


In [485]:

from sklearn.preprocessing import OneHotEncoder
# Assuming df is your DataFrame and 'Category' is the categorical column
encoder = OneHotEncoder(sparse=False)
encoded_array = encoder.fit_transform(locationDF[['City']])
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(['City']))
encoded_df.shape




(553, 433)

In [486]:
encoded_df.head(5)

Unnamed: 0,City_A,City_Abbeville,City_Akron,City_Alameda,City_Albany,City_Albuquerque,City_Aldie,City_Aledo,City_Alhambra,City_Allen Park,...,City_lake St Louis,City_los angeles,City_oFallon,City_oceanside,City_pasadena,City_philadelphia,City_st charles,City_st marys,City_towson,City_west linn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [487]:
#Doing States now 
locationDF['State'] = df['State'].str.replace('\n','') #fixed one of the extra ones, there was a \n issue 

encoder = OneHotEncoder(sparse=False)
encoded_array = encoder.fit_transform(locationDF[['State']])
encoded_df_states = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(['State']))
encoded_df_states.shape




(553, 53)

In [488]:
encoded_df_states.head()

Unnamed: 0,State_Alabama,State_Alaska,State_Alberta,State_Arizona,State_Arkansas,State_British Columbia,State_California,State_Colorado,State_Connecticut,State_Delaware,...,State_Saskatchewan,State_South Carolina,State_Tennessee,State_Texas,State_Utah,State_Vermont,State_Virginia,State_Washington,State_Wisconsin,State_Wyoming
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [489]:
#Combining two edited one hot encoded columns 
#encoded_df.shape
locationDF = pd.concat([encoded_df,encoded_df_states],axis=1)
locationDF.shape

(553, 486)

In [490]:


nan_count = np.sum(locationDF.isnull(), axis = 0)
print(nan_count)

City_A              0
City_Abbeville      0
City_Akron          0
City_Alameda        0
City_Albany         0
                   ..
State_Vermont       0
State_Virginia      0
State_Washington    0
State_Wisconsin     0
State_Wyoming       0
Length: 486, dtype: int64


Combining Data Frames and Writing to csv

In [491]:
print(locationDF.shape)
print(moldDF.shape)
print(healthInfoDF.shape)
print(symptomDF.shape)
locationDF = locationDF.reset_index(drop=True)
moldDF = moldDF.reset_index(drop=True)
healthInfoDF = healthInfoDF.reset_index(drop=True)
symptomDF = symptomDF.reset_index(drop=True)

df_final = pd.concat([locationDF,moldDF,healthInfoDF,symptomDF],axis =1)
df_final.shape

(553, 486)
(553, 36)
(553, 3)
(553, 103)


(553, 628)

In [492]:
df_final.to_csv("cleanedData.csv",index=False)
df_final.shape

(553, 628)