In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 

In [2]:
# calculate the accuracy 
def get_accuracy(y_pred,y_test):
    correctly_classified = 0
    for count in range(np.size(y_pred)) :
        if y_test[count] == y_pred[count]:
            correctly_classified = correctly_classified +1
    acc = (correctly_classified / len(y_pred))*100
    print("Accuracy on testset = {:.2f}".format(acc))
    return acc 

In [3]:
df = pd.read_csv("food-allergy-analysis-Zenodo.csv")

In [4]:
df.head()

Unnamed: 0,SUBJECT_ID,BIRTH_YEAR,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS,SHELLFISH_ALG_START,...,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX
0,1,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P1 - Medicaid,False,0.093087,3.164956,,...,,,,,,,,,,
1,2,1994,S1 - Female,R0 - White,E0 - Non-Hispanic,P0 - Non-Medicaid,False,12.232717,18.880219,,...,,,,,,,,12.262834,18.880219,2.0
2,3,2006,S0 - Male,R0 - White,E1 - Hispanic,P0 - Non-Medicaid,True,0.010951,6.726899,,...,,4.884326,,3.917864,6.157426,5.127995,,1.404517,6.157426,4.0
3,4,2004,S0 - Male,R4 - Unknown,E1 - Hispanic,P0 - Non-Medicaid,False,2.398357,9.111567,,...,,,,,,,,,,
4,5,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P0 - Non-Medicaid,False,0.013689,6.193018,,...,,,,,,,,,,


In [5]:
# deal with the data types 
df.dtypes # from index 9 the Food alergies start  

SUBJECT_ID                   int64
BIRTH_YEAR                   int64
GENDER_FACTOR               object
RACE_FACTOR                 object
ETHNICITY_FACTOR            object
PAYER_FACTOR                object
ATOPIC_MARCH_COHORT           bool
AGE_START_YEARS            float64
AGE_END_YEARS              float64
SHELLFISH_ALG_START        float64
SHELLFISH_ALG_END          float64
FISH_ALG_START             float64
FISH_ALG_END               float64
MILK_ALG_START             float64
MILK_ALG_END               float64
SOY_ALG_START              float64
SOY_ALG_END                float64
EGG_ALG_START              float64
EGG_ALG_END                float64
WHEAT_ALG_START            float64
WHEAT_ALG_END              float64
PEANUT_ALG_START           float64
PEANUT_ALG_END             float64
SESAME_ALG_START           float64
SESAME_ALG_END             float64
TREENUT_ALG_START          float64
TREENUT_ALG_END            float64
WALNUT_ALG_START           float64
WALNUT_ALG_END      

In [6]:
# Convert the non-numerical data 

In [7]:
convert_type = df.iloc[:,2:6] # except the boolean variable 

In [8]:
for column in convert_type.columns : 
    df[column] = df[column].astype(str).str[1] 

In [9]:
# add the boolean var to change it values to int 
convert_type['ATOPIC_MARCH_COHORT'] = df['ATOPIC_MARCH_COHORT'] 
for column in convert_type.columns :
    df[column] = df[column].astype("int")

In [10]:
df.head() # great ! 

Unnamed: 0,SUBJECT_ID,BIRTH_YEAR,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS,SHELLFISH_ALG_START,...,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX
0,1,2006,1,1,0,1,0,0.093087,3.164956,,...,,,,,,,,,,
1,2,1994,1,0,0,0,0,12.232717,18.880219,,...,,,,,,,,12.262834,18.880219,2.0
2,3,2006,0,0,1,0,1,0.010951,6.726899,,...,,4.884326,,3.917864,6.157426,5.127995,,1.404517,6.157426,4.0
3,4,2004,0,4,1,0,0,2.398357,9.111567,,...,,,,,,,,,,
4,5,2006,1,1,0,0,0,0.013689,6.193018,,...,,,,,,,,,,


In [11]:
# deal with duplicate data if they exist 
df.duplicated().value_counts()  # no duplicate data 

False    333200
dtype: int64

In [12]:
# split the data 
# to classify the patients whether they got any type of food allergies
# extract x ( The first variables ) and y ( outcome : has alergie /no )  
df.columns.get_loc('CASHEW_ALG_START') # to know the index of last var we need  

39

In [13]:
df_y = df.iloc[:,9:40] # make in df_y the variables that tell if the patient has an allergie
i=9 ; 
for col in df_y.columns :
    if (i % 2) == 0:  # delete column 10,12,...,38 (ALG_END)
        df_y.drop(col, inplace=True, axis=1)
    i = i+1 
# keep only the ALG_START columns 

In [14]:
df_y

Unnamed: 0,SHELLFISH_ALG_START,FISH_ALG_START,MILK_ALG_START,SOY_ALG_START,EGG_ALG_START,WHEAT_ALG_START,PEANUT_ALG_START,SESAME_ALG_START,TREENUT_ALG_START,WALNUT_ALG_START,PECAN_ALG_START,PISTACH_ALG_START,ALMOND_ALG_START,BRAZIL_ALG_START,HAZELNUT_ALG_START,CASHEW_ALG_START
0,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,
2,,,1.002053,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333195,,,,,,,,,,,,,,,,
333196,,,,,,,,,,,,,,,,
333197,,,,,,,,,,,,,,,,
333198,,,,,,,,,,,,,,,,


In [15]:
# create a column 'Outcome' that has 1 if the patient have any type of alergie or 0 in the opposite case
previous = False # false | true (in one not null) = true   
for col in df_y.columns :
   List = df_y[col].notnull() | previous 
   previous = List 
df['Outcome'] = List

In [16]:
df['Outcome'].value_counts()

False    310589
True      22611
Name: Outcome, dtype: int64

In [17]:
# if there is one different from NaN => its value is True (converted to int == 1) 
# => the patient has at least one type of allergies 
df['Outcome'] = df['Outcome'].astype("int")

In [18]:
df['Outcome'].value_counts()

0    310589
1     22611
Name: Outcome, dtype: int64

In [19]:
# extract x and y 
x = df.iloc[:,:9].values  # all except from 9_
y = df['Outcome'].values # the outcome

# splitting dataset into train and test 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [20]:
sk_model = LogisticRegression()
sk_model.fit(x_train,y_train)

LogisticRegression()

In [21]:
#calculate the prediction on our test set
y_pred = sk_model.predict(x_test)

In [22]:
# get the acc 
acc = get_accuracy(y_pred,y_test)

Accuracy on testset = 93.33
