## Importing the data

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [47]:
def get_accuracy(y_pred,y_test):
    """Print and return the percentage of predictions that match the true labels.

    Both inputs are flattened before the comparison, so a column vector of
    shape (n, 1) and a flat array of shape (n,) are compared element by
    element instead of relying on one-element-array truthiness.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_test : array-like
        True labels; must contain the same number of elements as ``y_pred``.

    Returns
    -------
    float
        Accuracy as a percentage between 0 and 100.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    ZeroDivisionError
        If ``y_pred`` is empty.
    """
    predicted = np.asarray(y_pred).ravel()
    actual = np.asarray(y_test).ravel()
    # A size mismatch means predictions and labels are not aligned; comparing
    # only a prefix would silently report a meaningless score.
    if predicted.size != actual.size:
        raise ValueError(
            "y_pred and y_test must have the same number of elements "
            "(got {} and {})".format(predicted.size, actual.size)
        )
    # Vectorised comparison replaces the per-element Python loop.
    correctly_classified = int(np.count_nonzero(predicted == actual))
    acc = (correctly_classified / predicted.size) * 100
    print("Accuracy on test set is {:.2f}".format(acc))
    return acc

### Importing the dataset

In [34]:
# Read the CSV file into a DataFrame, then print a summary of its columns,
# non-null counts and dtypes.
DATASET_CSV = "food-allergy-analysis-Zenodo.csv"
df = pd.read_csv(DATASET_CSV)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333200 entries, 0 to 333199
Data columns (total 50 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   SUBJECT_ID               333200 non-null  int64  
 1   BIRTH_YEAR               333200 non-null  int64  
 2   GENDER_FACTOR            333200 non-null  object 
 3   RACE_FACTOR              333200 non-null  object 
 4   ETHNICITY_FACTOR         333200 non-null  object 
 5   PAYER_FACTOR             333200 non-null  object 
 6   ATOPIC_MARCH_COHORT      333200 non-null  bool   
 7   AGE_START_YEARS          333200 non-null  float64
 8   AGE_END_YEARS            333200 non-null  float64
 9   SHELLFISH_ALG_START      5246 non-null    float64
 10  SHELLFISH_ALG_END        1051 non-null    float64
 11  FISH_ALG_START           1796 non-null    float64
 12  FISH_ALG_END             527 non-null     float64
 13  MILK_ALG_START           7289 non-null    float64
 14  MILK

In [35]:
# Display the first rows of the raw frame to get a feel for the columns.
df.head()

Unnamed: 0,SUBJECT_ID,BIRTH_YEAR,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS,SHELLFISH_ALG_START,...,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX
0,1,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P1 - Medicaid,False,0.093087,3.164956,,...,,,,,,,,,,
1,2,1994,S1 - Female,R0 - White,E0 - Non-Hispanic,P0 - Non-Medicaid,False,12.232717,18.880219,,...,,,,,,,,12.262834,18.880219,2.0
2,3,2006,S0 - Male,R0 - White,E1 - Hispanic,P0 - Non-Medicaid,True,0.010951,6.726899,,...,,4.884326,,3.917864,6.157426,5.127995,,1.404517,6.157426,4.0
3,4,2004,S0 - Male,R4 - Unknown,E1 - Hispanic,P0 - Non-Medicaid,False,2.398357,9.111567,,...,,,,,,,,,,
4,5,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P0 - Non-Medicaid,False,0.013689,6.193018,,...,,,,,,,,,,


### Preparing the dataset

We can see that some variables are not numerical, so we need to encode them as numbers:

In [36]:
# One-hot encode the non-numeric columns with the get_dummies function.
# NOTE(review): ATOPIC_MARCH_COHORT is listed last; the encoded columns appear
# to be appended in list order, and the split cell further down takes the final
# column of the frame as the target - confirm before reordering this list.
categorical_columns = [
    "GENDER_FACTOR",
    "ETHNICITY_FACTOR",
    "RACE_FACTOR",
    "PAYER_FACTOR",
    "ATOPIC_MARCH_COHORT",
]
# For the columns that had only two options, we only keep one of the two
# resulting dummy columns.
redundant_dummies = [
    "GENDER_FACTOR_S1 - Female",
    "ETHNICITY_FACTOR_E0 - Non-Hispanic",
    "PAYER_FACTOR_P0 - Non-Medicaid",
    "ATOPIC_MARCH_COHORT_False",
]
df = pd.get_dummies(df, columns=categorical_columns, dtype=int).drop(columns=redundant_dummies)

In [37]:
# Re-inspect the column list and dtypes after the categorical encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333200 entries, 0 to 333199
Data columns (total 54 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   SUBJECT_ID                                  333200 non-null  int64  
 1   BIRTH_YEAR                                  333200 non-null  int64  
 2   AGE_START_YEARS                             333200 non-null  float64
 3   AGE_END_YEARS                               333200 non-null  float64
 4   SHELLFISH_ALG_START                         5246 non-null    float64
 5   SHELLFISH_ALG_END                           1051 non-null    float64
 6   FISH_ALG_START                              1796 non-null    float64
 7   FISH_ALG_END                                527 non-null     float64
 8   MILK_ALG_START                              7289 non-null    float64
 9   MILK_ALG_END                                4580 non-null    float64
 

Lastly, we can also see that there are many NaN values, so we will replace them with zeros:

In [40]:
# Replace every NaN in the frame with 0.
# NOTE(review): the *_START / *_END columns look like ages in years, so a 0
# could also be read as "event at birth" - confirm this encoding is intended.
df=df.fillna(0)

### Splitting the data

In [41]:
# Features: every column except the last one.
# NOTE(review): this still includes SUBJECT_ID, which looks like a row
# identifier rather than a predictor - consider dropping it.
x = df.iloc[:, :-1].values
# Target: the last column. A scalar index (-1, not the slice -1:) returns a
# 1-D array of shape (n_samples,), which is what scikit-learn estimators expect;
# a column vector of shape (n_samples, 1) triggers a warning on fit.
y = df.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

### Building the logistic regression model

In [43]:
# Logistic regression classifier created with no arguments, i.e. library defaults.
sk_model = LogisticRegression()

In [44]:
# Fit the classifier on the training split.
sk_model.fit(x_train,y_train)

  return f(*args, **kwargs)


LogisticRegression()

In [45]:
# Predict a label for every row of the held-out test split.
y_pred=sk_model.predict(x_test)

In [49]:
# Percentage of test rows whose predicted label equals the true label
# (get_accuracy also prints the value).
acc=get_accuracy(y_pred,y_test)

Accuracy on test set is 90.93


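We got an accuracy of about 91%, which looks like an excellent result. Note, however, that accuracy alone can be misleading when the target classes are imbalanced, so this figure should be compared with the accuracy of always predicting the majority class before drawing conclusions.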