In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, FunctionTransformer, PowerTransformer, KBinsDiscretizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report

from feature_engine.imputation import EndTailImputer

In [2]:
# Read the patient-level dataset from a path relative to this notebook.
covid_flu = pd.read_csv(filepath_or_buffer='../../data/covid_flu.csv')

In [3]:
# Preview the first five rows to see column names and typical values.
covid_flu.head(n=5)

Unnamed: 0,Diagnosis,InitialPCRDiagnosis,Age,Sex,neutrophil,serumLevelsOfWhiteBloodCell,lymphocytes,CReactiveProteinLevels,DurationOfIllness,CTscanResults,RiskFactors,GroundGlassOpacity,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Temperature,Fatigue
0,H1N1,,67.0,F,,,,,,,,,,Yes,Yes,,No,,38.111111,No
1,H1N1,,29.0,M,,,,,,,,,,,,,,,,
2,H1N1,,22.0,F,,,,,,,,,,,,,,,,
3,H1N1,,20.0,F,,,,,,,immuno,,,Yes,Yes,,No,,36.555556,Yes
4,H1N1,,21.0,M,,,,,,,,,,,,,,,,


In [4]:
# Fraction of missing values per column, smallest first.
n_rows = covid_flu.shape[0]
missing_counts = covid_flu.isna().sum()
(missing_counts / n_rows).sort_values()

Diagnosis                      0.000000
Age                            0.018893
Sex                            0.051282
Fever                          0.377193
Coughing                       0.420378
SoreThroat                     0.547908
Temperature                    0.576248
Fatigue                        0.641700
Diarrhea                       0.696356
NauseaVomitting                0.715924
RiskFactors                    0.858974
CTscanResults                  0.892713
lymphocytes                    0.894737
serumLevelsOfWhiteBloodCell    0.898111
CReactiveProteinLevels         0.907557
InitialPCRDiagnosis            0.929825
neutrophil                     0.930499
GroundGlassOpacity             0.937247
DurationOfIllness              0.941296
ShortnessOfBreath              0.949393
dtype: float64

In [5]:

# Print the distribution of observed (non-missing) values for every column,
# with a dashed rule between columns.
separator = "-" * 50
for column_name in covid_flu.columns:
    print(separator)
    column_counts = covid_flu[column_name].value_counts()
    print(column_counts)

--------------------------------------------------
Diagnosis
H1N1       1072
COVID19     410
Name: count, dtype: int64
--------------------------------------------------
InitialPCRDiagnosis
Yes    100
No       4
Name: count, dtype: int64
--------------------------------------------------
Age
5.00     53
6.00     52
4.00     50
7.00     49
8.00     47
         ..
0.75      1
77.00     1
0.42      1
0.92      1
1.40      1
Name: count, Length: 109, dtype: int64
--------------------------------------------------
Sex
M    748
F    658
Name: count, dtype: int64
--------------------------------------------------
neutrophil
3.20    3
2.40    2
2.00    2
3.13    2
3.00    2
       ..
0.78    1
4.48    1
5.30    1
3.45    1
3.73    1
Name: count, Length: 91, dtype: int64
--------------------------------------------------
serumLevelsOfWhiteBloodCell
10.60    3
6.60     3
4.20     3
2.60     2
3.40     2
        ..
6.15     1
10.15    1
9.12     1
4.49     1
5.30     1
Name: count, Length: 127, d

In [6]:
# Inspect the raw risk-factor column (free text, mostly missing).
covid_flu.loc[:, "RiskFactors"]

0          NaN
1          NaN
2          NaN
3       immuno
4          NaN
         ...  
1477       NaN
1478       NaN
1479       NaN
1480       NaN
1481       NaN
Name: RiskFactors, Length: 1482, dtype: object

In [7]:
# Exploratory encoding of the comma-separated risk factors into dummy columns.
# Missing entries are filled with "" first, so they split to [''] and the empty
# string itself becomes one of the binarizer's labels.
risk_factor = (
    covid_flu["RiskFactors"]
    .fillna("")
    .str.replace(" ", "")   # drops every space, including ones inside a factor name
    .str.split(",")
)
mulbin = MultiLabelBinarizer()
mul_risk_factor = mulbin.fit_transform(risk_factor)
mul_risk_factor

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], shape=(1482, 42))

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer  # class to help make dummy variables

class DummifyRiskFactor(BaseEstimator,TransformerMixin):
    '''Encode a comma-separated risk-factor column as one dummy column per factor.

    Expects a pandas Series (anything offering ``.apply``) whose entries are
    strings such as ``"asthma, heart disease"``; non-string entries (e.g. NaN)
    are treated as "no risk factors". The set of known factors is learned in
    ``fit`` with a ``MultiLabelBinarizer`` and reused in ``transform``.
    '''
    def __init__(self):
        # Set by fit(); None marks the transformer as not yet fitted.
        self.label_binarizer = None

    def parse_risk_factors(self, comma_sep_factors):
        ''' asthma,heart disease -> ['asthma', 'heart disease']

        Each factor is stripped of surrounding whitespace and lower-cased.
        Non-string input (NaN, None, numbers, ...) yields an empty list, and
        empty tokens (from "" or a trailing/double comma) are dropped so they
        never become a spurious '' label.
        '''
        # Explicit type check instead of a bare `except:` so that unrelated
        # errors (and KeyboardInterrupt) are no longer silently swallowed.
        if not isinstance(comma_sep_factors, str):
            return []
        cleaned = (s.strip().lower() for s in comma_sep_factors.split(','))
        return [factor for factor in cleaned if factor]

    def fit(self, X, y=None):
        '''Learn the vocabulary of risk factors present in X; returns self.'''
        self.label_binarizer = MultiLabelBinarizer()
        self.label_binarizer.fit(X.apply(self.parse_risk_factors))  # create dummy variable for each risk factor
        return self

    def transform(self, X, y=None):
        '''Return the 0/1 indicator matrix for X using the factors learned in fit.

        Raises:
            NotFittedError: if called before ``fit``.
        '''
        if self.label_binarizer is None:
            # Local import keeps this block self-contained. NotFittedError
            # subclasses AttributeError, which is what the unfitted call
            # raised before.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This DummifyRiskFactor instance is not fitted yet; call 'fit' first."
            )
        return self.label_binarizer.transform(X.apply(self.parse_risk_factors))

In [9]:
# Branch 1: pull out the RiskFactors column, then expand it into dummy columns.
risk_pipeline = Pipeline(steps=[
    (
        "select_risk_factor",
        FunctionTransformer(func=lambda frame: frame["RiskFactors"]),
    ),
    (
        "preprocess_risk",
        DummifyRiskFactor(),
    ),
])

In [10]:
# Columns handled as True/False flags by the binary branch of the pipeline.
binary_col = [
    'Female',
    'GroundGlassOpacity',
    'CTscanResults',
    'Diarrhea',
    'Fever',
    'Coughing',
    'SoreThroat',
    'NauseaVomitting',
    'Fatigue',
    'InitialPCRDiagnosis',
]

# Columns handled as continuous measurements by the numerical branch.
numerical_col = [
    "Age",
    "neutrophil",
    "serumLevelsOfWhiteBloodCell",
    "lymphocytes",
    "DurationOfIllness",
    "Temperature",
]

In [11]:
# Derive a boolean "Female" flag from "Sex" and drop the original column.
# The guard makes the cell safe to re-run: without it, a second execution
# raises KeyError because "Sex" has already been removed.
if "Sex" in covid_flu.columns:
    # NOTE: rows whose Sex is missing compare unequal to "F" and so become False.
    covid_flu = covid_flu.assign(Female=covid_flu["Sex"] == "F").drop(columns="Sex")

# Recode Yes/No answers as booleans across the whole frame; other values
# (including missing ones) are left as they are.
covid_flu = covid_flu.replace({"Yes": True, "No": False})

In [12]:
# Branch 2: select the flag columns and treat any missing flag as False.
binary_pipeline = Pipeline(steps=[
    (
        "select_binary",
        FunctionTransformer(func=lambda frame: frame[binary_col]),
    ),
    (
        "fillna",
        SimpleImputer(fill_value=False, strategy="constant"),
    ),
])

# Branch 3: select the numeric columns, power-transform and standardize them,
# impute what is still missing, then bucket each feature into 5 equal-width bins.
# NOTE: the step is named "box-cox" but the transform applied is yeo-johnson.
numerical_pipeline = Pipeline(steps=[
    (
        "select_numerical",
        FunctionTransformer(func=lambda frame: frame[numerical_col]),
    ),
    (
        "box-cox",
        PowerTransformer(standardize=True, method="yeo-johnson"),
    ),
    (
        "endtail",
        EndTailImputer(),
    ),
    (
        "binning",
        KBinsDiscretizer(strategy="uniform", encode="ordinal", n_bins=5),
    ),
])

In [None]:
# Run the three preprocessing branches on the same input frame and
# concatenate their outputs column-wise into one feature matrix.
feature_uni = FeatureUnion([
    ("risk_factor", risk_pipeline),
    ("binary_pipe", binary_pipeline),
    ("numerical_pipe", numerical_pipeline)
])

# Full model: preprocessing followed by a random forest.
# The forest is seeded (same value as the train/test split) so that repeated
# runs of the grid search give the same scores and the same best parameters;
# an unseeded forest makes the reported results non-reproducible.
pipe = Pipeline([
    ("preprocess", feature_uni),
    ("classifier", RandomForestClassifier(random_state=0))
])

In [24]:

# Hyper-parameters to search; the "classifier__" prefix routes each one to the
# step named "classifier" inside `pipe`.
param = dict(
    classifier__n_estimators=[10, 50, 100],
    classifier__criterion=["gini", "entropy"],
    classifier__max_depth=[3, None],
)

# Exhaustive search over the grid with 5-fold cross-validation.
grid = GridSearchCV(estimator=pipe, param_grid=param, cv=5)



In [25]:
import warnings
warnings.filterwarnings('ignore')

In [26]:
# Features are every column except the label; the label is the diagnosis.
y = covid_flu["Diagnosis"]
X = covid_flu.drop(columns="Diagnosis")

# Hold out 20% for testing, stratified on the label so both splits keep the
# same class balance; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

# Run the grid search on the training portion only.
grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'classifier__criterion': ['gini', 'entropy'], 'classifier__max_depth': [3, None], 'classifier__n_estimators': [10, 50, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformer_list,"[('risk_factor', ...), ('binary_pipe', ...), ...]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,func,<function <la...t 0x1442742c0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function <la...t 0x144276d40>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function <la...t 0x144276f20>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,imputation_method,'gaussian'
,tail,'right'
,fold,3
,variables,

0,1,2
,n_bins,5
,encode,'ordinal'
,strategy,'uniform'
,quantile_method,'warn'
,dtype,
,subsample,200000
,random_state,

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
# Parameter combination chosen by the grid search fitted above.
grid.best_params_

{'classifier__criterion': 'gini',
 'classifier__max_depth': None,
 'classifier__n_estimators': 50}

In [28]:
# Score the refit best pipeline on the held-out test set.
test_predictions = grid.best_estimator_.predict(X_test)
report_text = classification_report(y_test, test_predictions)
print(report_text)

              precision    recall  f1-score   support

     COVID19       0.92      0.85      0.89        82
        H1N1       0.95      0.97      0.96       215

    accuracy                           0.94       297
   macro avg       0.93      0.91      0.92       297
weighted avg       0.94      0.94      0.94       297

