In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error

In [20]:
#Read the data
df = pd.read_csv("food-allergy-analysis-Zenodo.csv")
df.head()

Unnamed: 0,SUBJECT_ID,BIRTH_YEAR,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS,SHELLFISH_ALG_START,...,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX
0,1,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P1 - Medicaid,False,0.093087,3.164956,,...,,,,,,,,,,
1,2,1994,S1 - Female,R0 - White,E0 - Non-Hispanic,P0 - Non-Medicaid,False,12.232717,18.880219,,...,,,,,,,,12.262834,18.880219,2.0
2,3,2006,S0 - Male,R0 - White,E1 - Hispanic,P0 - Non-Medicaid,True,0.010951,6.726899,,...,,4.884326,,3.917864,6.157426,5.127995,,1.404517,6.157426,4.0
3,4,2004,S0 - Male,R4 - Unknown,E1 - Hispanic,P0 - Non-Medicaid,False,2.398357,9.111567,,...,,,,,,,,,,
4,5,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P0 - Non-Medicaid,False,0.013689,6.193018,,...,,,,,,,,,,


In [21]:
#Displaying the missing data
df.isna()

Unnamed: 0,SUBJECT_ID,BIRTH_YEAR,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS,SHELLFISH_ALG_START,...,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX
0,False,False,False,False,False,False,False,False,False,True,...,True,True,True,True,True,True,True,True,True,True
1,False,False,False,False,False,False,False,False,False,True,...,True,True,True,True,True,True,True,False,False,False
2,False,False,False,False,False,False,False,False,False,True,...,True,False,True,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,True,...,True,True,True,True,True,True,True,True,True,True
4,False,False,False,False,False,False,False,False,False,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333195,False,False,False,False,False,False,False,False,False,True,...,True,False,True,True,True,False,False,True,True,True
333196,False,False,False,False,False,False,False,False,False,True,...,True,True,True,True,True,True,True,True,True,True
333197,False,False,False,False,False,False,False,False,False,True,...,True,True,True,True,True,False,True,False,False,False
333198,False,False,False,False,False,False,False,False,False,True,...,True,True,True,True,True,True,True,True,True,True


In [22]:
#Displaying the sum of missing values within the dataset
df.isnull().sum()

SUBJECT_ID                      0
BIRTH_YEAR                      0
GENDER_FACTOR                   0
RACE_FACTOR                     0
ETHNICITY_FACTOR                0
PAYER_FACTOR                    0
ATOPIC_MARCH_COHORT             0
AGE_START_YEARS                 0
AGE_END_YEARS                   0
SHELLFISH_ALG_START        327954
SHELLFISH_ALG_END          332149
FISH_ALG_START             331404
FISH_ALG_END               332673
MILK_ALG_START             325911
MILK_ALG_END               328620
SOY_ALG_START              330781
SOY_ALG_END                331769
EGG_ALG_START              327135
EGG_ALG_END                329907
WHEAT_ALG_START            332054
WHEAT_ALG_END              332512
PEANUT_ALG_START           324547
PEANUT_ALG_END             331108
SESAME_ALG_START           332434
SESAME_ALG_END             333022
TREENUT_ALG_START          333199
TREENUT_ALG_END            333200
WALNUT_ALG_START           332496
WALNUT_ALG_END             333034
PECAN_ALG_STAR

In [23]:
#Check data types
df.dtypes

SUBJECT_ID                   int64
BIRTH_YEAR                   int64
GENDER_FACTOR               object
RACE_FACTOR                 object
ETHNICITY_FACTOR            object
PAYER_FACTOR                object
ATOPIC_MARCH_COHORT           bool
AGE_START_YEARS            float64
AGE_END_YEARS              float64
SHELLFISH_ALG_START        float64
SHELLFISH_ALG_END          float64
FISH_ALG_START             float64
FISH_ALG_END               float64
MILK_ALG_START             float64
MILK_ALG_END               float64
SOY_ALG_START              float64
SOY_ALG_END                float64
EGG_ALG_START              float64
EGG_ALG_END                float64
WHEAT_ALG_START            float64
WHEAT_ALG_END              float64
PEANUT_ALG_START           float64
PEANUT_ALG_END             float64
SESAME_ALG_START           float64
SESAME_ALG_END             float64
TREENUT_ALG_START          float64
TREENUT_ALG_END            float64
WALNUT_ALG_START           float64
WALNUT_ALG_END      

In [24]:
#Change columns type to numeric ones
df['ATOPIC_MARCH_COHORT'] = df['ATOPIC_MARCH_COHORT'].astype('int')
df['GENDER_FACTOR'] = pd.to_numeric(df['GENDER_FACTOR'], errors='coerce')
df['RACE_FACTOR'] = pd.to_numeric(df['RACE_FACTOR'], errors='coerce')
df['ETHNICITY_FACTOR'] = pd.to_numeric(df['ETHNICITY_FACTOR'], errors='coerce')
df['PAYER_FACTOR'] = pd.to_numeric(df['PAYER_FACTOR'], errors='coerce')

In [25]:
#Drop missing values
df.dropna(axis=1, inplace=True)
df

Unnamed: 0,SUBJECT_ID,BIRTH_YEAR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS
0,1,2006,0,0.093087,3.164956
1,2,1994,0,12.232717,18.880219
2,3,2006,1,0.010951,6.726899
3,4,2004,0,2.398357,9.111567
4,5,2006,0,0.013689,6.193018
...,...,...,...,...,...
333195,333196,2006,0,0.736482,7.449692
333196,333197,2006,1,0.019165,6.984257
333197,333198,2006,0,0.443532,7.405886
333198,333199,2006,1,0.013689,7.017112


In [26]:
#Dataset splitting
y = df.ATOPIC_MARCH_COHORT
features = ['BIRTH_YEAR', 'AGE_START_YEARS', 'AGE_END_YEARS']
X = df[features].copy()

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [27]:
X_train.head()

Unnamed: 0,BIRTH_YEAR,AGE_START_YEARS,AGE_END_YEARS
250838,2003,0.060233,6.124572
51194,2000,1.779603,12.996578
163454,1989,11.759069,18.877481
261153,2003,1.201916,10.064339
285627,2003,1.352498,9.590691


In [36]:
#Create linear regression model, training phase
model = LogisticRegression()

model.fit(X_train, y_train)

LogisticRegression()

In [37]:
#Validation phase
validation_predictions = model.predict(X_valid)

mean_absolute_error(y_valid, validation_predictions)

0.10496698679471789