In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

- SimpleImputer কী?

**SimpleImputer** হল একটি স্কিল (স্কিল মানে মেশিন লার্নিং এর টুল বা ক্লাস) যা মিসিং বা অনুপস্থিত ডেটা পূর্ণ করতে ব্যবহৃত হয়। এটা সাধারণত নানান ধরনের স্ট্র্যাটেজি ব্যবহার করে, যেমন:

1. **Mean** (গড় মান দিয়ে পূর্ণ করা)  
2. **Median** (মাঝারি মান দিয়ে পূর্ণ করা)  
3. **Most Frequent** (সবচেয়ে সাধারণ মান দিয়ে পূর্ণ করা)  

এটা ডেটার মধ্যে যদি কোন মান হারিয়ে যায়, তখন সেই জায়গায় এটি নিজে থেকে একটি মান বসিয়ে দেয়, যাতে মডেল আরও ভালোভাবে কাজ করতে পারে।


In [4]:
df = pd.read_csv('covid_toy.csv')

In [5]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [6]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

- ```df.isnull().sum()``` diye amra dekhe nilam kothao missing value ache kina. amra dekhte pelam fever e 10 ta missing value ache.

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,0:5],df.iloc[:,-1],
                                                test_size = 0.2)

In [9]:
X_train

Unnamed: 0,age,gender,fever,cough,city
3,31,Female,98.0,Mild,Kolkata
5,84,Female,,Mild,Bangalore
2,42,Male,101.0,Mild,Delhi
81,65,Male,99.0,Mild,Delhi
64,42,Male,104.0,Mild,Mumbai
...,...,...,...,...,...
52,47,Female,100.0,Strong,Bangalore
66,51,Male,104.0,Mild,Kolkata
33,26,Female,98.0,Mild,Kolkata
4,65,Female,101.0,Mild,Mumbai


# Aam Zindagi

In [11]:
# adding simple imputer to fever col
si = SimpleImputer()
X_train_fever = si.fit_transform(X_train[['fever']])

# also the test data
X_test_fever = si.fit_transform(X_test[['fever']])
                                 
X_train_fever.shape

(80, 1)

In [12]:
# Ordinalencoding -> cough
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# also the test data
X_test_cough = oe.fit_transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [13]:
# OneHotEncoding -> gender,city
ohe = OneHotEncoder(drop='first', sparse_output=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

# also the test data
X_test_gender_city = ohe.fit_transform(X_test[['gender','city']])

X_train_gender_city.shape

(80, 4)

In [14]:
# Extracting Age
X_train_age = X_train.drop(columns=['gender','fever','cough','city']).values

# also the test data
X_test_age = X_test.drop(columns=['gender','fever','cough','city']).values

X_train_age.shape

(80, 1)

In [15]:
X_train_transformed = np.concatenate((X_train_age,X_train_fever,X_train_gender_city,X_train_cough),axis=1)
# also the test data
X_test_transformed = np.concatenate((X_test_age,X_test_fever,X_test_gender_city,X_test_cough),axis=1)

X_train_transformed.shape

(80, 7)

- ekhane amra ja korechi ta holo amra sob gula alada categorical column er alada kore simpleImputer , ordinal encoding and one hot encoding use korechi. mane jar jonno jeta lage arki.
- ekhon amra sob kichu eksathe merge korbo, mane sob column eksathe jure dibo.
- jure dite giye dekhlam j amader ```age``` er upor konokichu perform kori ni. tai eta alada kora nai.
- ekhon amader k ```age``` k alada korte hobe.
- then amra ```age``` k ekhan theke extract kore niyechi and then abar baki column gular sathe concatenate kore diyechi.

# Mentos Zindagi

In [18]:
from sklearn.compose import ColumnTransformer

In [19]:
transformer = ColumnTransformer(transformers=[
    ('tnf1', SimpleImputer(), ['fever']),  # NaN fill করার জন্য
    ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),  # Category ঠিক করা
    ('tnf3', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city'])  # One-hot encoding
], remainder='passthrough')  # অন্যান্য কলাম অপরিবর্তিত থাকবে
 


# amra remainder e 'drop' or 'passthrough' dite pari. drop means column drop hobe. 
# and passthrough means oi column er kono operation perform hobe na, as it is thakbe.

In [20]:
transformer.fit_transform(X_train).shape

(80, 7)

In [41]:
transformer.fit_transform(X_test).shape

(20, 7)