In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy dataset for the cleaning walkthrough. It deliberately contains
# one missing age (NaN), one missing city (None), and a last row that
# repeats row 1 so the later duplicate-handling cells have work to do.
data = dict(
    age=[25, 30, np.nan, 40, 30],
    salary=[50000, 60000, 55000, 120000, 60000],
    city=["Colombo", "Kandy", "Colombo", None, "Kandy"],
    target=[1, 0, 1, 0, 0],
)

df = pd.DataFrame(data)

# Header and frame are emitted in a single call, separated by a newline.
print("Original DataFrame: ", df, sep="\n")

Original DataFrame: 
    age  salary     city  target
0  25.0   50000  Colombo       1
1  30.0   60000    Kandy       0
2   NaN   55000  Colombo       1
3  40.0  120000     None       0
4  30.0   60000    Kandy       0


In [3]:
# Count the missing entries (NaN / None) in each column.
missing_per_column = df.isna().sum()
print("Missing values count:", missing_per_column, sep="\n")

Missing values count:
age       1
salary    0
city      1
target    0
dtype: int64


In [4]:
# Impute the missing age with the column mean. The mean is computed
# over the non-missing ages only (pandas skips NaN by default).
age_mean = df["age"].mean()
df["age"] = df["age"].fillna(value=age_mean)
print(df)

     age  salary     city  target
0  25.00   50000  Colombo       1
1  30.00   60000    Kandy       0
2  31.25   55000  Colombo       1
3  40.00  120000     None       0
4  30.00   60000    Kandy       0


In [5]:
# Replace the missing city with the explicit placeholder label "Unknown".
city_placeholder = "Unknown"
df["city"] = df["city"].fillna(value=city_placeholder)
print(df)

     age  salary     city  target
0  25.00   50000  Colombo       1
1  30.00   60000    Kandy       0
2  31.25   55000  Colombo       1
3  40.00  120000  Unknown       0
4  30.00   60000    Kandy       0


In [6]:
# Both fills above can also be done in a single call by passing
# fillna() a mapping of column name -> replacement value.
# When the two previous cells have already run, nothing is left to
# fill and the frame comes back unchanged.
fill_values = {
    "age": df["age"].mean(),
    "city": "Unknown",
}
df = df.fillna(value=fill_values)

print(df)

     age  salary     city  target
0  25.00   50000  Colombo       1
1  30.00   60000    Kandy       0
2  31.25   55000  Colombo       1
3  40.00  120000  Unknown       0
4  30.00   60000    Kandy       0


In [7]:
# Flag rows that exactly repeat an earlier row. The first occurrence is
# not flagged; removal happens in the next cell.
duplicate_mask = df.duplicated()
print("Duplicated rows: ", duplicate_mask, sep="\n")

Duplicated rows: 
0    False
1    False
2    False
3    False
4     True
dtype: bool


In [8]:
# Keep only the first occurrence of each row; later exact repeats are
# filtered out. The original index labels of the kept rows are preserved.
is_repeat = df.duplicated(keep="first")
df = df[~is_repeat]
print(df)

     age  salary     city  target
0  25.00   50000  Colombo       1
1  30.00   60000    Kandy       0
2  31.25   55000  Colombo       1
3  40.00  120000  Unknown       0


In [12]:
# One-hot encode the categorical column: "city" is replaced by one
# indicator column per distinct value (city_<value>); the other
# columns pass through untouched.
categorical_columns = ["city"]
df_encoded = pd.get_dummies(data=df, columns=categorical_columns)

print("After One-Hot Encoding: ", df_encoded, sep="\n")

After One-Hot Encoding: 
     age  salary  target  city_Colombo  city_Kandy  city_Unknown
0  25.00   50000       1          True       False         False
1  30.00   60000       0         False        True         False
2  31.25   55000       1          True       False         False
3  40.00  120000       0         False       False          True


In [13]:
# Standard (z-score) scaling of salary: subtract the column mean and
# divide by the column standard deviation. This is standardisation,
# not min-max scaling.
# NOTE: pandas' Series.std() defaults to the sample standard deviation
# (ddof=1), so the values differ slightly from scalers that use the
# population standard deviation (ddof=0).
salary = df_encoded["salary"]
salary_centered = salary - salary.mean()
df_encoded["salary_scaled"] = salary_centered / salary.std()

print(df_encoded)

     age  salary  target  city_Colombo  city_Kandy  city_Unknown  \
0  25.00   50000       1          True       False         False   
1  30.00   60000       0         False        True         False   
2  31.25   55000       1          True       False         False   
3  40.00  120000       0         False       False          True   

   salary_scaled  
0      -0.648748  
1      -0.343455  
2      -0.496101  
3       1.488304  


In [None]:
# final ML split

X = df_encoded.dr