Build a logistic regression model to classify the patients whether they got any type of food allergies using the Food Allergy Zenodo dataset.

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv('./inputs/food-allergy-analysis-Zenodo.csv')
df.head()

Unnamed: 0,SUBJECT_ID,BIRTH_YEAR,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS,SHELLFISH_ALG_START,...,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX
0,1,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P1 - Medicaid,False,0.093087,3.164956,,...,,,,,,,,,,
1,2,1994,S1 - Female,R0 - White,E0 - Non-Hispanic,P0 - Non-Medicaid,False,12.232717,18.880219,,...,,,,,,,,12.262834,18.880219,2.0
2,3,2006,S0 - Male,R0 - White,E1 - Hispanic,P0 - Non-Medicaid,True,0.010951,6.726899,,...,,4.884326,,3.917864,6.157426,5.127995,,1.404517,6.157426,4.0
3,4,2004,S0 - Male,R4 - Unknown,E1 - Hispanic,P0 - Non-Medicaid,False,2.398357,9.111567,,...,,,,,,,,,,
4,5,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P0 - Non-Medicaid,False,0.013689,6.193018,,...,,,,,,,,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333200 entries, 0 to 333199
Data columns (total 50 columns):
SUBJECT_ID                 333200 non-null int64
BIRTH_YEAR                 333200 non-null int64
GENDER_FACTOR              333200 non-null object
RACE_FACTOR                333200 non-null object
ETHNICITY_FACTOR           333200 non-null object
PAYER_FACTOR               333200 non-null object
ATOPIC_MARCH_COHORT        333200 non-null bool
AGE_START_YEARS            333200 non-null float64
AGE_END_YEARS              333200 non-null float64
SHELLFISH_ALG_START        5246 non-null float64
SHELLFISH_ALG_END          1051 non-null float64
FISH_ALG_START             1796 non-null float64
FISH_ALG_END               527 non-null float64
MILK_ALG_START             7289 non-null float64
MILK_ALG_END               4580 non-null float64
SOY_ALG_START              2419 non-null float64
SOY_ALG_END                1431 non-null float64
EGG_ALG_START              6065 non-null float64
E

### Data Preprocessing

In [10]:
df[['ASTHMA_START','ASTHMA_END', 'FIRST_ASTHMARX', 'LAST_ASTHMARX', 'NUM_ASTHMARX']]

Unnamed: 0,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX
0,,,,,
1,,,12.262834,18.880219,2.0
2,5.127995,,1.404517,6.157426,4.0
3,,,,,
4,,,,,
...,...,...,...,...,...
333195,5.426420,7.449692,,,
333196,,,,,
333197,2.321697,,2.321697,6.650240,4.0
333198,,,,,


Since we only need to know if a patient has some type of allergy, we are going to :

- Create a target variable y, that will be 1 when a patient have any type of allergy, 0 elsewhere
- The categorical columns, we are transforming them into numerical values using the OrdinalEncoder from the scikit-learn library.

In [1]:
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder

In [21]:
def handle_categorical_columns(data:pd.DataFrame,columns:list)->pd.DataFrame:
    tmp = data.copy()
    
    df_cat = data[columns]
    encoder = OrdinalEncoder()
    df_cat_tran = encoder.fit_transform(df_cat)
    
    tmp[columns] = df_cat_tran
    return tmp
    
def preprocess_data(data : pd.DataFrame)->pd.DataFrame:
    # 
    #    
    tmp = data.copy(deep=True)
    food_allergies = ["".join((column.split("_")[0],"_ALG")) for column in df.columns[9:41]]
    tmp["y"] = [0 for _ in range(tmp.shape[0])]
    for food_allergie in food_allergies:
        tmp["y"] = tmp.y | (~tmp[''.join((food_allergie,"_START"))].isna()).astype(int)
       
    
    tmp[df.columns[9:41]] = tmp[df.columns[9:41]].fillna(-1)
    
    tmp[['ATOPIC_DERM_START','ATOPIC_DERM_END','ALLERGIC_RHINITIS_START','ALLERGIC_RHINITIS_END',
        'ASTHMA_START','ASTHMA_END','FIRST_ASTHMARX','LAST_ASTHMARX']
        ] = tmp[['ATOPIC_DERM_START','ATOPIC_DERM_END','ALLERGIC_RHINITIS_START','ALLERGIC_RHINITIS_END',
        'ASTHMA_START','ASTHMA_END','FIRST_ASTHMARX','LAST_ASTHMARX']].fillna(-1)
    
    tmp[['NUM_ASTHMARX']] = tmp[['NUM_ASTHMARX']].fillna(-1)
    
    tmp = handle_categorical_columns(tmp,df.columns[2:7])
    
    tmp.drop(['SUBJECT_ID','BIRTH_YEAR'],axis=1,inplace=True)
    return tmp

preprocessed_data = preprocess_data(df)
preprocessed_data.head()

Unnamed: 0,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS,SHELLFISH_ALG_START,SHELLFISH_ALG_END,FISH_ALG_START,...,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX,y
0,1.0,1.0,0.0,1.0,0.0,0.093087,3.164956,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
1,1.0,0.0,0.0,0.0,0.0,12.232717,18.880219,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,12.262834,18.880219,2.0,0
2,0.0,0.0,1.0,0.0,1.0,0.010951,6.726899,-1.0,-1.0,-1.0,...,4.884326,-1.0,3.917864,6.157426,5.127995,-1.0,1.404517,6.157426,4.0,1
3,0.0,4.0,1.0,0.0,0.0,2.398357,9.111567,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
4,1.0,1.0,0.0,0.0,0.0,0.013689,6.193018,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

In [6]:
def get_accuracy(y_pred , y_test):
    correctly_classified = 0
    for count in range(np.size( y_pred )) :
        print(f"y_test[count]  = {y_test[count] }")
        print(f" y_pred[count] = { y_pred[count]}")
        if y_test[count] == y_pred[count]:
            correctly_classified = correctly_classified + 1
    acc = ( correctly_classified / len(y_pred) )*100
    print(f"Accuracy on testset = {acc}")
    return acc

In [22]:
X = preprocessed_data.drop(["y"],axis=1).values
Y = preprocessed_data["y"].values

In [23]:
#Split dataset into train and test set:
X_train , X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=43)

In [24]:
model = LinearRegression()
model.fit(X_train, Y_train)

LinearRegression()

In [25]:
# make predictions
Y_pred = model.predict(X_test)
model.score(X_train,Y_train)

0.5595863905707072