In [28]:
import pandas as pd 

In [29]:
df = pd.read_csv('covid_toy.csv')

df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


Todo

1. Handle missing values in the fever column using SimpleImputer (fill with mean).
2. Apply One-Hot Encoding for city and gender.
3. Apply Ordinal Encoding for cough (Mild < Strong).
4. Convert has_covid to binary (Yes → 1, No → 0).
5. Transform the dataset and display the result.
6. applying ML.

In [30]:
df['has_covid'] = df['has_covid'].map({'Yes':1, 'No':0})
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,0
1,27,Male,100.0,Mild,Delhi,1
2,42,Male,101.0,Mild,Delhi,0
3,31,Female,98.0,Mild,Kolkata,0
4,65,Female,101.0,Mild,Mumbai,0


In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [41]:
transformer = ColumnTransformer(transformers=[
    ('impute_fever', SimpleImputer(),['fever']),
    ('ordinal', OrdinalEncoder(categories=[['Mild','Strong']]), ['cough']),
    ('One_hot_g', OneHotEncoder(handle_unknown='ignore'), ['gender']),
    ('One_hot_c', OneHotEncoder(handle_unknown='ignore'), ['city'])

], remainder='passthrough')

In [42]:
X_transformed = transformer.fit_transform(df.drop(columns=['has_covid']))


new_df = pd.DataFrame(X_transformed, columns=transformer.get_feature_names_out())

new_df['has_covid'] = df['has_covid']

In [43]:
new_df.head()

Unnamed: 0,impute_fever__fever,ordinal__cough,One_hot_g__gender_Female,One_hot_g__gender_Male,One_hot_c__city_Bangalore,One_hot_c__city_Delhi,One_hot_c__city_Kolkata,One_hot_c__city_Mumbai,remainder__age,has_covid
0,103.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,60.0,0
1,100.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,27.0,1
2,101.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,42.0,0
3,98.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,31.0,0
4,101.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,65.0,0


In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(new_df.drop(columns=['has_covid']), new_df['has_covid'], test_size=0.2, random_state=40)

X_train

Unnamed: 0,impute_fever__fever,ordinal__cough,One_hot_g__gender_Female,One_hot_g__gender_Male,One_hot_c__city_Bangalore,One_hot_c__city_Delhi,One_hot_c__city_Kolkata,One_hot_c__city_Mumbai,remainder__age
72,101.000000,0.0,1.0,0.0,0.0,0.0,1.0,0.0,83.0
66,104.000000,0.0,0.0,1.0,0.0,0.0,1.0,0.0,51.0
69,103.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,73.0
67,99.000000,0.0,0.0,1.0,1.0,0.0,0.0,0.0,65.0
26,100.000000,0.0,1.0,0.0,0.0,0.0,1.0,0.0,19.0
...,...,...,...,...,...,...,...,...,...
56,100.844444,1.0,0.0,1.0,0.0,0.0,1.0,0.0,71.0
37,100.000000,0.0,0.0,1.0,0.0,0.0,1.0,0.0,55.0
7,100.844444,1.0,1.0,0.0,0.0,0.0,0.0,1.0,20.0
91,100.844444,0.0,0.0,1.0,0.0,1.0,0.0,0.0,38.0


In [51]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
pred = model.predict(X_test)

In [48]:
pred

array([0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1])

In [53]:
y_test

79    1
75    1
63    0
15    1
38    1
11    1
40    0
45    0
39    0
62    1
86    1
4     0
47    0
27    0
81    0
17    0
82    1
33    0
9     0
92    0
Name: has_covid, dtype: int64

In [None]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test,pred))

0.45


In [56]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.54      0.58      0.56        12
           1       0.29      0.25      0.27         8

    accuracy                           0.45        20
   macro avg       0.41      0.42      0.41        20
weighted avg       0.44      0.45      0.44        20



In [57]:
print(df["has_covid"].value_counts(normalize=True))


has_covid
0    0.55
1    0.45
Name: proportion, dtype: float64
