In [1]:
import csv
import os
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [2]:
# Load the inspection data set that sits in the current working directory.
# header=0: the first row of the file supplies the column names.
filename = os.path.join(os.getcwd(), "WeInspectB.csv")
df = pd.read_csv(filename, header=0)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,City,State,Zip,Aspergillus flavus/oryzae,Aspergillus fumigatus,Aspergillus niger,Aspergillus ochraceus,Aspergillus penicillioides,Aspergillus restrictus*,Aspergillus sclerotiorum,...,"Eyes, Ears, Nose, & Throat",Resipiratory,Digestive,Circulatory,Skin,Brain,Nervous,Urinary,Immune,Reproductive
0,Mountain Brook,Alabama,35223,ND,Nd,14,Nd,,11,Nd,...,"Sneezing/Allergies, Dry/Irritated eyes, Chroni...",,"Diarrhea/Constipation, Bloating, Abdominal pai...","Raynaud's phenomenon, Low or reactive blood pr...","Itchy/Peeling skin, Skin rash, Fungal infections",Brain fog,"Anxiousness, Fatigue",Overactive bladder,,"Changes in menstrual cycle, Inconsistent menst..."
1,Wasilla,Alaska,99623,ND,ND,12,ND,5.0,ND,ND,...,"Sneezing/Allergies, Runny nose/Post-nasal drip...","Shortness of breath, Chronic dry cough, Chroni...","Significant weight gain/loss, Diarrhea/Constip...","Cherry angiomas, Easy bruising/bleeding","Itchy/Peeling skin, Skin rash, Eczema","Brain fog, Memory loss, Anger/Aggressiveness","Anxiousness, Depression, Headache, Migraine, I...",,Long-lasting colds,"Vaginal yeast or bacterial infections, Hormone..."
2,Soldotna,Alaska,99669-8654,ND,3,3,234,,6,ND,...,"Sneezing/Allergies, Runny nose/Post-nasal drip...","Shortness of breath, Heaviness in the chest","Significant weight gain/loss, Bloating, Sweet ...","Spider veins, Easy bruising/bleeding, Iron def...","Itchy/Peeling skin, Flushing, Fungal infections","Brain fog, Confusion, Memory loss, Trouble fin...","Anxiousness, Depression, Headache, Insomnia, F...",Overactive bladder,"Increased susceptibility to infection, Chronic...","Changes in menstrual cycle, Inconsistent menst..."
3,Soldotna,Alaska,99669,ND,ND,ND,ND,,ND,ND,...,,,,,,,,,,
4,Anchorage,Alaska,99508-4014,64,5,32,ND,,6,ND,...,"Runny nose/Post-nasal drip, Ear popping/ringin...",,Significant weight gain/loss,"Spider veins, Cherry angiomas, Raynaud's pheno...",Eczema,"Brain fog, Trouble finding the right word, Ang...","Anxiousness, Fatigue",,,


In [3]:
# Remember how many columns the raw frame has so the effect of the
# column-dropping steps below can be reported afterwards.
cols = df.shape[1]

In [4]:
# Remove every column in which all values are missing.
df = df.dropna(axis='columns', how='all')

In [5]:
# Remove columns that carry a single distinct value.
# nunique() ignores NaN by default, so a column holding one value plus
# missing entries is treated as constant as well.
uniqueCounts = df.nunique()
constCols = uniqueCounts.index[uniqueCounts == 1]
df = df.drop(columns=constCols)

In [6]:
# Remove the 'Zip' column (`columns=` already implies the column axis).
df = df.drop(columns=['Zip'])

In [7]:
# Removing '*' from column names.
# The asterisk must be matched literally: as a regular expression a bare '*'
# is invalid ("nothing to repeat"), and pandas >= 2.0 no longer special-cases
# single-character patterns when regex=True.
df.columns = df.columns.str.replace('*', '', regex=False)

In [8]:
# Report how many columns were removed by the steps above.
cols2 = df.shape[1]

print(f'Number of columns before: \n{cols}')
print(f'Number of columns after: \n{cols2}')

Number of columns before: 
56
Number of columns after: 
52


In [9]:
# Drop rows that have a missing value in either of the first two columns.
# NOTE(review): the columns are chosen by position; presumably these are the
# location columns (City, State) -- confirm against the loaded frame.
print(f'Number of rows before: \n{len(df.index)}')

locCols = df.columns[:2]
df = df.dropna(subset=locCols)

print(f'Number of rows after: \n{len(df.index)}')

Number of rows before: 
581
Number of rows after: 
553


In [10]:
# Replace missing values with 'No symptoms' in every column from
# 'Diagnoses' through 'Reproductive' (both ends included).

symptomsStart = df.columns.get_loc('Diagnoses')
symptomsEnd   = df.columns.get_loc('Reproductive')

# +1 because the positional slice excludes its upper bound.
symptomCols = df.columns[symptomsStart:symptomsEnd + 1]
df[symptomCols] = df[symptomCols].fillna('No symptoms')

In [11]:
# Special Functions

# Function to remove special chars from values:
# Characters that are stripped from raw string values.
specialChars = ['*', ',', '<', '>', ' ']

def removeChars(s):
    """Return `s` with every entry of `specialChars` removed.

    Parameters
    ----------
    s : any
        A raw cell value. Only `str` values are modified.

    Returns
    -------
    The cleaned string, or `s` itself when it is not a string (for example
    the float NaN used for missing cells, or an already numeric value).

    Notes
    -----
    The earlier version tested whether the literal text '*|,|<|>| ' occurred
    in `s` and handed a list to `str.replace`, so in practice nothing was
    ever removed and values such as '<1' or '1,234' stayed non-numeric.
    """
    if not isinstance(s, str):
        return s
    for ch in specialChars:
        s = s.replace(ch, '')
    return s

# Function to convert non-numeric values to float or 0:
def convertValues(value):
    """Convert `value` to a float, falling back to 0.0 when that fails.

    Parameters
    ----------
    value : any
        A cell value, typically a string or a number.

    Returns
    -------
    float
        `float(value)` when the conversion succeeds; 0.0 when `value` is
        text that is not a number (e.g. 'ND') or an object `float()` cannot
        handle (e.g. None). A float NaN converts successfully and is
        therefore returned as NaN, not 0.
    """
    try:
        return float(value)
    # TypeError covers None and other non-string, non-numeric objects,
    # which float() rejects with a different exception than bad text.
    except (TypeError, ValueError):
        return 0.0

In [12]:
# Clean every column from 'Aspergillus flavus/oryzae' through
# 'ERMI Score (Group I - Group II)' (both ends included).

moldStart = df.columns.get_loc('Aspergillus flavus/oryzae')
moldEnd   = df.columns.get_loc('ERMI Score (Group I - Group II)')

for moldColumn in df.columns[moldStart:moldEnd + 1]:
    # Per value: run removeChars, then convertValues, and finally replace
    # any remaining missing values with 0.
    df[moldColumn] = (
        df[moldColumn]
        .apply(removeChars)
        .apply(convertValues)
        .fillna(0)
    )


In [13]:
# Standardize the mold columns with scikit-learn's StandardScaler.
# NOTE(review): this slice stops *before* moldEnd, so the
# 'ERMI Score (Group I - Group II)' column is not scaled, whereas the
# clean-up loop uses moldEnd + 1 -- confirm the exclusion is intended.
moldCols = df.columns[moldStart:moldEnd]
moldValues = df[moldCols]

scaler = StandardScaler()
scaler.fit(moldValues)
df[moldCols] = scaler.transform(moldValues)

### Prepared Data Frame 
- No One-Hot Encoding

In [14]:
# Uncomment the next line to write the prepared frame to
# 'Prepared_Data.csv' without the index column.
# df.to_csv('Prepared_Data.csv', index=False)

# Display the prepared frame as the cell output.
df

Unnamed: 0,City,State,Aspergillus flavus/oryzae,Aspergillus fumigatus,Aspergillus niger,Aspergillus ochraceus,Aspergillus penicillioides,Aspergillus restrictus,Aspergillus sclerotiorum,Aspergillus sydowii,...,"Eyes, Ears, Nose, & Throat",Resipiratory,Digestive,Circulatory,Skin,Brain,Nervous,Urinary,Immune,Reproductive
0,Mountain Brook,Alabama,-0.103755,-0.074702,-0.322177,-0.056391,-0.068051,-0.148075,-0.095310,-0.084083,...,"Sneezing/Allergies, Dry/Irritated eyes, Chroni...",No symptoms,"Diarrhea/Constipation, Bloating, Abdominal pai...","Raynaud's phenomenon, Low or reactive blood pr...","Itchy/Peeling skin, Skin rash, Fungal infections",Brain fog,"Anxiousness, Fatigue",Overactive bladder,No symptoms,"Changes in menstrual cycle, Inconsistent menst..."
1,Wasilla,Alaska,-0.103755,-0.074702,-0.324278,-0.056391,-0.067824,-0.203413,-0.095310,-0.084083,...,"Sneezing/Allergies, Runny nose/Post-nasal drip...","Shortness of breath, Chronic dry cough, Chroni...","Significant weight gain/loss, Diarrhea/Constip...","Cherry angiomas, Easy bruising/bleeding","Itchy/Peeling skin, Skin rash, Eczema","Brain fog, Memory loss, Anger/Aggressiveness","Anxiousness, Depression, Headache, Migraine, I...",No symptoms,Long-lasting colds,"Vaginal yeast or bacterial infections, Hormone..."
2,Soldotna,Alaska,-0.103755,-0.073301,-0.333732,0.048455,-0.068051,-0.173229,-0.095310,-0.078359,...,"Sneezing/Allergies, Runny nose/Post-nasal drip...","Shortness of breath, Heaviness in the chest","Significant weight gain/loss, Bloating, Sweet ...","Spider veins, Easy bruising/bleeding, Iron def...","Itchy/Peeling skin, Flushing, Fungal infections","Brain fog, Confusion, Memory loss, Trouble fin...","Anxiousness, Depression, Headache, Insomnia, F...",Overactive bladder,"Increased susceptibility to infection, Chronic...","Changes in menstrual cycle, Inconsistent menst..."
3,Soldotna,Alaska,-0.103755,-0.074702,-0.336884,-0.056391,-0.068051,-0.203413,-0.095310,-0.084083,...,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms
4,Anchorage,Alaska,0.380883,-0.072366,-0.303268,-0.056391,-0.068051,-0.173229,-0.095310,-0.084083,...,"Runny nose/Post-nasal drip, Ear popping/ringin...",No symptoms,Significant weight gain/loss,"Spider veins, Cherry angiomas, Raynaud's pheno...",Eczema,"Brain fog, Trouble finding the right word, Ang...","Anxiousness, Fatigue",No symptoms,No symptoms,No symptoms
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576,Greenfield,Wisconsin,-0.103755,-0.074235,-0.305369,-0.043397,-0.068051,-0.132983,-0.083153,-0.084083,...,No symptoms,No symptoms,"Significant weight gain/loss, Nausea/Vomiting,...",Cherry angiomas,"Itchy/Peeling skin, Burning sensation, Flushin...",No symptoms,Dysautonomia,No symptoms,No symptoms,"Changes in menstrual cycle, Vaginal yeast or b..."
577,Shorewood,Wisconsin,-0.020458,-0.067228,-0.158299,-0.053702,-0.068051,0.068247,-0.086193,0.167752,...,No symptoms,No symptoms,No symptoms,No symptoms,Itchy/Peeling skin,Brain fog,No symptoms,No symptoms,No symptoms,No symptoms
578,Waukesha,Wisconsin,-0.065893,-0.067228,-0.010178,-0.044293,-0.068051,-0.193352,-0.067957,-0.084083,...,"Dry/Irritated eyes, Chronic sinusitis",Asthma/Wheezing,Parasites,Raynaud's phenomenon,Itchy/Peeling skin,No symptoms,Fatigue,Bladder infection symptoms with no identifiabl...,Increased susceptibility to infection,No symptoms
579,Laramie,Wyoming,-0.081038,-0.071432,-0.180359,-0.056391,-0.064865,-0.203413,-0.095310,-0.084083,...,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms,No symptoms
