In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression


In [3]:
df=pd.read_csv(r"C:\Users\91861\Desktop\Artifical Intelligence\Machine Leraning\7. LOGISTIC REGRESSION\covid19_patient_symptoms_diagnosis.csv")

In [4]:
df.head()


Unnamed: 0,patient_id,age,gender,fever,dry_cough,sore_throat,fatigue,headache,shortness_of_breath,loss_of_smell,loss_of_taste,oxygen_level,body_temperature,comorbidity,travel_history,contact_with_patient,chest_pain,covid_result
0,1,52,Male,1,0,1,1,0,0,0,0,98,37.1,Diabetes,0,0,0,0
1,2,15,Male,0,0,0,1,1,0,0,0,85,37.7,Diabetes,0,1,0,0
2,3,72,Male,1,0,1,0,0,1,0,1,99,36.8,,0,0,0,0
3,4,61,Female,0,0,1,1,1,0,1,1,86,36.0,,1,0,0,0
4,5,21,Female,1,1,0,0,0,1,0,1,90,36.4,,1,1,1,1


In [5]:
df.shape

(5000, 18)

In [6]:
df = df.drop(columns=['patient_id'])


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   5000 non-null   int64  
 1   gender                5000 non-null   object 
 2   fever                 5000 non-null   int64  
 3   dry_cough             5000 non-null   int64  
 4   sore_throat           5000 non-null   int64  
 5   fatigue               5000 non-null   int64  
 6   headache              5000 non-null   int64  
 7   shortness_of_breath   5000 non-null   int64  
 8   loss_of_smell         5000 non-null   int64  
 9   loss_of_taste         5000 non-null   int64  
 10  oxygen_level          5000 non-null   int64  
 11  body_temperature      5000 non-null   float64
 12  comorbidity           2275 non-null   object 
 13  travel_history        5000 non-null   int64  
 14  contact_with_patient  5000 non-null   int64  
 15  chest_pain           

In [8]:
df['comorbidity'].value_counts()

comorbidity
Diabetes         1001
Heart Disease     792
Asthma            482
Name: count, dtype: int64

In [9]:
df['comorbidity'].unique()

array(['Diabetes', nan, 'Asthma', 'Heart Disease'], dtype=object)

In [10]:
num_cols = ['age', 'oxygen_level', 'body_temperature']
gender_col = ['gender']
comorbidity_col = ['comorbidity']


In [11]:
gender_categories = [['Female', 'Male']]

comorbidity_categories = [
    ['Unknown', 'Asthma', 'Diabetes', 'Heart Disease']
]


In [12]:
num_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])


In [13]:
gender_pipeline = Pipeline(steps=[
    ('encoder', OneHotEncoder(
        categories=gender_categories,
        drop='first',
        handle_unknown='ignore'
    ))
])


In [14]:
comorbidity_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encoder', OneHotEncoder(
        categories=comorbidity_categories,
        drop='first',
        handle_unknown='ignore'
    ))
])


In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('gender', gender_pipeline, gender_col),
        ('comorbidity', comorbidity_pipeline, comorbidity_col)
    ]
)


In [16]:
pipe = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', LogisticRegression(
        max_iter=1000,
        class_weight='balanced'
    ))
])


In [17]:
X = df.drop(columns=['covid_result'])
y = df['covid_result']



In [18]:
X.head()

Unnamed: 0,age,gender,fever,dry_cough,sore_throat,fatigue,headache,shortness_of_breath,loss_of_smell,loss_of_taste,oxygen_level,body_temperature,comorbidity,travel_history,contact_with_patient,chest_pain
0,52,Male,1,0,1,1,0,0,0,0,98,37.1,Diabetes,0,0,0
1,15,Male,0,0,0,1,1,0,0,0,85,37.7,Diabetes,0,1,0
2,72,Male,1,0,1,0,0,1,0,1,99,36.8,,0,0,0
3,61,Female,0,0,1,1,1,0,1,1,86,36.0,,1,0,0
4,21,Female,1,1,0,0,0,1,0,1,90,36.4,,1,1,1


In [19]:
y.head()

0    0
1    0
2    0
3    0
4    1
Name: covid_result, dtype: int64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=42,stratify=y)




In [21]:
pipe.fit(X_train, y_train)

In [22]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       0.65      0.64      0.65       480
           1       0.68      0.69      0.68       520

    accuracy                           0.67      1000
   macro avg       0.67      0.67      0.67      1000
weighted avg       0.67      0.67      0.67      1000

ROC-AUC: 0.6837920673076923


In [23]:
y_prob

array([0.3639284 , 0.40186475, 0.6819607 , 0.50298985, 0.30095662,
       0.47502006, 0.50705226, 0.7126441 , 0.27171488, 0.43039182,
       0.57684332, 0.65162403, 0.68503057, 0.74044794, 0.71412658,
       0.39933237, 0.73809905, 0.57915681, 0.26630923, 0.64901125,
       0.47369048, 0.50394435, 0.54552016, 0.39718181, 0.61157542,
       0.74379792, 0.32870555, 0.71033559, 0.61692297, 0.27258331,
       0.29735482, 0.6192808 , 0.73909416, 0.70804346, 0.26430585,
       0.74003796, 0.71259656, 0.61603128, 0.64501832, 0.46808746,
       0.67664672, 0.47182736, 0.54216416, 0.42487238, 0.70981835,
       0.37069353, 0.54796568, 0.6459188 , 0.57763299, 0.30124092,
       0.39500165, 0.60586688, 0.67300404, 0.32999597, 0.46702969,
       0.33194783, 0.30149116, 0.33084321, 0.74096862, 0.68123862,
       0.71408396, 0.46604819, 0.74357164, 0.39704469, 0.64648099,
       0.46927268, 0.57441275, 0.36334109, 0.27546279, 0.39886378,
       0.74378707, 0.29499379, 0.64390931, 0.39873222, 0.43436

In [24]:
threshold = 0.55
y_pred_2 = (y_prob >= threshold).astype(int)


In [25]:
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

           0       0.60      0.74      0.66       480
           1       0.70      0.54      0.61       520

    accuracy                           0.64      1000
   macro avg       0.65      0.64      0.64      1000
weighted avg       0.65      0.64      0.64      1000

