In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC

In [None]:
# Read the toy COVID dataset from the working directory into a DataFrame.
DATA_PATH = "covid_toy.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check column names and value formats.
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()


Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


In [None]:
# Per-column preprocessing. Each rule is (step name, transformer, columns);
# the step names 'tnf1'..'tnf3' are kept unchanged.
transformer = ColumnTransformer(
    transformers=[
        # fever has missing values: fill them with the column mean
        # (the SimpleImputer default, spelled out here for clarity).
        ('tnf1', SimpleImputer(strategy='mean'), ['fever']),
        # cough is ordered: Mild -> 0, Strong -> 1.
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # gender and city are nominal: dense 0/1 columns, with the first
        # category of each feature dropped.
        ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
    # Columns not listed above are kept as they are.
    remainder='passthrough',
)

# ColumnTransformer(...)
# A big manager that tells:
# “This column goes here 👈”
# “That column goes there 👉”
# “Others just pass through 🚪”
# Why we need it:
# In a dataset, different columns need different treatments.

# Numbers → fill missing values or scale them.
# Text categories → turn into numbers.
# Already clean columns → leave them as they are.

In [None]:
# transformers=[ ... ]
# This is a list of rules. Each rule is a tuple with 3 parts:
# ('name', transformer, ['columns'])

# tnf1 is 1st transformer

# SimpleImputer() → This is the machine that fills missing values.
# Default = fills with the mean for numbers.
# Example: if the fever column is [101, NaN, 98, 100], NaN is replaced with the mean of the observed values (about 99.7).

In [None]:
# 'tnf2' → transformer 2.

# OrdinalEncoder(...) → This machine converts categories into numbers.
# Example: ['Mild', 'Strong'] → [0, 1].
# It respects the order you give.
# categories=[['Mild','Strong']] means:
# Mild = 0
# Strong = 1
# ['cough'] → Apply this only to the cough column.
# 👉 So: Turn cough column (Mild/Strong) into numbers (0/1).

In [None]:
# 'tnf3' → transformer 3 (just a label for this step).

# OneHotEncoder(...) → This machine makes dummy variables (0/1 columns).
# Example:
# gender: ['Female', 'Male'] (categories are sorted alphabetically) → becomes:
# Female = [1,0]
# Male = [0,1]
# sparse_output=False → make a normal array (dense) instead of a sparse matrix.
# Easier to see & use with pandas.
# drop='first' → drop the first category's column to avoid the dummy variable trap (redundant, perfectly collinear columns).
# With drop='first', gender keeps a single column: Male = 1, Female = 0.
# ['gender','city'] → Apply this to gender and city columns.
# 👉 So: Convert gender & city into dummy 0/1 columns.

In [None]:
# Step 3: remainder='passthrough'
# This tells ColumnTransformer:
# If a column is not listed above, just keep it as it is.
# (Don’t delete, don’t transform).
# If you wrote remainder='drop' (the default) → it would remove all the other columns (here: age).

In [None]:
# Show the configured ColumnTransformer (its three named steps plus the passthrough remainder).
transformer

In [None]:
# Re-check missing values: defining the transformer does not modify df,
# so the fever column still contains its NaNs here.
df.isna().sum()

Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


In [None]:
# Hold out 20% of the rows for evaluation. Features are every column except
# the target 'has_covid'; random_state pins the shuffle so the split is reproducible.
# (train_test_split is already imported in the notebook's first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=40,
)

In [None]:
# Inspect the training features (the target column has_covid is not part of X_train).
X_train

Unnamed: 0,age,gender,fever,cough,city
72,83,Female,101.0,Mild,Kolkata
66,51,Male,104.0,Mild,Kolkata
69,73,Female,103.0,Mild,Delhi
67,65,Male,99.0,Mild,Bangalore
26,19,Female,100.0,Mild,Kolkata
...,...,...,...,...,...
56,71,Male,,Strong,Kolkata
37,55,Male,100.0,Mild,Kolkata
7,20,Female,,Strong,Mumbai
91,38,Male,,Mild,Delhi


In [None]:
# Sanity check: fit the preprocessing on the training split and look at the
# resulting (rows, output columns) shape.
np.shape(transformer.fit_transform(X_train))

(80, 7)

In [None]:
# Sanity check: apply the already-fitted preprocessing to the test split;
# the column count should match the training output.
np.shape(transformer.transform(X_test))

(20, 7)

In [None]:
# Step 1: Preprocess the raw features.
# Fit on the training split only, then reuse the fitted transformer on the test split.
X_train_transformed = transformer.fit_transform(X=X_train)
X_test_transformed = transformer.transform(X=X_test)

In [None]:
# Step 2: Standardize the transformed features.
# The scaler is fitted on the training data only; both splits are then
# transformed with those same training statistics.
scaler = StandardScaler().fit(X_train_transformed)
X_train_scaled = scaler.transform(X_train_transformed)
X_test_scaled = scaler.transform(X_test_transformed)

In [None]:
# Step 3: Train a logistic regression classifier (default settings) on the
# scaled training features. The fitted estimator is the cell's output.
clf = LogisticRegression()
clf.fit(X_train_scaled, y_train)

In [None]:
# Predict labels for the scaled test features and show them as the cell output.
y_pred = clf.predict(X=X_test_scaled)
y_pred

array(['No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No',
       'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No'],
      dtype=object)

In [None]:
# Evaluate on the held-out split. Accuracy needs no positive class; the other
# metrics treat "Yes" as the positive label.
print("Accuracy Score:", accuracy_score(y_test, y_pred))
positive_class_metrics = (
    ("Precision Score:", precision_score),
    ("Recall Score:", recall_score),
    ("F1 Score:", f1_score),
)
for label, metric in positive_class_metrics:
    print(label, metric(y_test, y_pred, pos_label="Yes"))

Accuracy Score: 0.4
Precision Score: 0.3
Recall Score: 0.375
F1 Score: 0.3333333333333333
