In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import MultiLabelBinarizer, FunctionTransformer, PowerTransformer, KBinsDiscretizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report

from feature_engine.imputation import EndTailImputer

In [2]:
covid_flu = pd.read_csv('../../data/covid_flu.csv')

In [3]:
covid_flu.head()

Unnamed: 0,Diagnosis,InitialPCRDiagnosis,Age,Sex,neutrophil,serumLevelsOfWhiteBloodCell,lymphocytes,CReactiveProteinLevels,DurationOfIllness,CTscanResults,RiskFactors,GroundGlassOpacity,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Temperature,Fatigue
0,H1N1,,67.0,F,,,,,,,,,,Yes,Yes,,No,,38.111111,No
1,H1N1,,29.0,M,,,,,,,,,,,,,,,,
2,H1N1,,22.0,F,,,,,,,,,,,,,,,,
3,H1N1,,20.0,F,,,,,,,immuno,,,Yes,Yes,,No,,36.555556,Yes
4,H1N1,,21.0,M,,,,,,,,,,,,,,,,


In [4]:
(covid_flu.isna().sum() / covid_flu.shape[0]).sort_values()

Diagnosis                      0.000000
Age                            0.018893
Sex                            0.051282
Fever                          0.377193
Coughing                       0.420378
SoreThroat                     0.547908
Temperature                    0.576248
Fatigue                        0.641700
Diarrhea                       0.696356
NauseaVomitting                0.715924
RiskFactors                    0.858974
CTscanResults                  0.892713
lymphocytes                    0.894737
serumLevelsOfWhiteBloodCell    0.898111
CReactiveProteinLevels         0.907557
InitialPCRDiagnosis            0.929825
neutrophil                     0.930499
GroundGlassOpacity             0.937247
DurationOfIllness              0.941296
ShortnessOfBreath              0.949393
dtype: float64

In [5]:

for col in covid_flu.columns:
    print("-" * 50)
    print(covid_flu[col].value_counts())

--------------------------------------------------
Diagnosis
H1N1       1072
COVID19     410
Name: count, dtype: int64
--------------------------------------------------
InitialPCRDiagnosis
Yes    100
No       4
Name: count, dtype: int64
--------------------------------------------------
Age
5.00     53
6.00     52
4.00     50
7.00     49
8.00     47
         ..
0.75      1
77.00     1
0.42      1
0.92      1
1.40      1
Name: count, Length: 109, dtype: int64
--------------------------------------------------
Sex
M    748
F    658
Name: count, dtype: int64
--------------------------------------------------
neutrophil
3.20    3
2.40    2
2.00    2
3.13    2
3.00    2
       ..
0.78    1
4.48    1
5.30    1
3.45    1
3.73    1
Name: count, Length: 91, dtype: int64
--------------------------------------------------
serumLevelsOfWhiteBloodCell
10.60    3
6.60     3
4.20     3
2.60     2
3.40     2
        ..
6.15     1
10.15    1
9.12     1
4.49     1
5.30     1
Name: count, Length: 127, d

In [6]:
covid_flu["RiskFactors"]

0          NaN
1          NaN
2          NaN
3       immuno
4          NaN
         ...  
1477       NaN
1478       NaN
1479       NaN
1480       NaN
1481       NaN
Name: RiskFactors, Length: 1482, dtype: object

In [7]:
risk_factor = covid_flu["RiskFactors"].fillna("").str.replace(" ", "").str.split(",")
mulbin = MultiLabelBinarizer()
mul_risk_factor = mulbin.fit_transform(risk_factor)
mul_risk_factor

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], shape=(1482, 42))

In [8]:
def process_risk_factor(df: pd.DataFrame) -> pd.DataFrame:
    risk_mlt = MultiLabelBinarizer()
    risk_factor = df["RiskFactors"]
    risk_factor = risk_factor.fillna("").str.replace(" ", "").str.split(",")
    risk_factor = risk_mlt.fit_transform(risk_factor)
    return pd.DataFrame(risk_factor, columns=risk_mlt.classes_)

In [9]:
risk_pipeline = Pipeline([
    ("preprocess_risk", FunctionTransformer(process_risk_factor, validate=False))
])

In [10]:
binary_col = [  # aggregate all binary columns in a list
    'Sex', 'GroundGlassOpacity', 'CTscanResults', 'Diarrhea', 'Fever',
    'Coughing', 'SoreThroat', 'NauseaVomitting', 'Fatigue', 'InitialPCRDiagnosis'
]

numerical_col = ["Age", "neutrophil", "serumLevelsOfWhiteBloodCell", "lymphocytes", "DurationOfIllness", "Temperature"]

In [11]:
def sex_to_num(df):
    df = df.copy()
    df['Sex'] = df['Sex'].replace({'F': 0, 'M': 1})
    return df

def yesno2truefalse(df):
    df_copy = df.copy()
    df_copy = df_copy.replace({"Yes": True, "No": False})
    return df_copy

In [12]:
binary_pipeline = Pipeline([
    ("sex2num", FunctionTransformer(sex_to_num)),
    ("yes2true", FunctionTransformer(yesno2truefalse)),
    ("select_binary", FunctionTransformer(lambda df: df[binary_col])),
    ("fillna", SimpleImputer(strategy="constant", fill_value=False))
])

numerical_pipeline = Pipeline([
    ("select_numerical", FunctionTransformer(lambda df: df[numerical_col])),
    ("box-cox", PowerTransformer(method="yeo-johnson", standardize=True)),
    ("endtail", EndTailImputer()),
    ("binning", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform"))
])

In [13]:
feature_uni = FeatureUnion([
    ("risk_factor", risk_pipeline),
    ("binary_pipe", binary_pipeline),
    ("numerical_pipe", numerical_pipeline)
])

pipe = Pipeline([
    ("preprocess", feature_uni),
    ("classifier", RandomForestClassifier())
])

In [14]:
param = {
     "classifier__n_estimators": [100, 1000],
     "classifier__criterion": ["gini", "entropy", "log_loss"],
     "classifier__max_depth": [2, 5]
}


new_df = feature_uni.fit_transform(covid_flu.drop("Diagnosis", axis=1))

grid = GridSearchCV(pipe, param_grid=param, cv=5)

grid.fit(new_df, covid_flu["Diagnosis"])
# grid.fit(covid_flu.drop("Diagnosis", axis=1), covid_flu["Diagnosis"])
# cross_val_score(grid, covid_flu.drop("Diagnosis", axis=1), covid_flu["Diagnosis"], cv=5)


  df['Sex'] = df['Sex'].replace({'F': 0, 'M': 1})


ValueError: 
All the 60 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 653, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 587, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/joblib/memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 1539, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 1962, in fit_transform
    results = self._parallel_func(X, y, _fit_transform_one, routed_params)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 1984, in _parallel_func
    return Parallel(n_jobs=self.n_jobs)(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 82, in __call__
    return super().__call__(iterable_with_config_and_warning_filters)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/joblib/parallel.py", line 1986, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/joblib/parallel.py", line 1914, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 147, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 1539, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 729, in fit_transform
    return last_step.fit_transform(
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 895, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py", line 260, in transform
    out = self._transform(X, func=self.func, kw_args=self.kw_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py", line 387, in _transform
    return func(X, **(kw_args if kw_args else {}))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/kd/gn_dg9w13px_cwn32p5bs4fc0000gn/T/ipykernel_5893/77022346.py", line 3, in process_risk_factor
    risk_factor = df["RiskFactors"]
                  ~~^^^^^^^^^^^^^^^
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
