In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, load_wine
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [34]:
# Load the Titanic dataset and report how many values are missing per column.
df = pd.read_csv("titanic.csv")
print(f"\nMissing values count:\n{df.isnull().sum()}")

# Numeric columns handed to the KNN imputer.
# NOTE(review): PassengerId (an identifier) and Survived (presumably the
# prediction target) are part of the feature set the imputer sees — confirm
# that letting them influence the imputed values is intended.
numerical_columns = ['PassengerId','Survived','Pclass','Age','SibSp','Parch','Fare']

# Work on a copy so the raw frame `df` stays available for comparison.
df_knn = df.copy()
imputer = KNNImputer(n_neighbors = 5)
df_knn[numerical_columns] = imputer.fit_transform(df_knn[numerical_columns])

# Fill the gaps left in the remaining (non-numeric) columns: backward fill
# first, then forward fill to catch any trailing rows bfill could not reach.
# DataFrame.bfill()/ffill() are used instead of fillna(method=...), which is
# deprecated and emits a FutureWarning on every run.
df_knn = df_knn.bfill()
df_knn = df_knn.ffill()
print(f"\nMissing values count:\n{df_knn.isnull().sum()}")

# Numeric columns were filled with KNN; string columns with backward/forward fill.



Missing values count:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing values count:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


  df_knn = df_knn.fillna(method='bfill')
  df_knn = df_knn.fillna(method='ffill')


In [35]:
# Min-max scale the Fare column of the imputed frame.
# The raw frame is printed first so the before/after values can be compared.
print(df.head(10))

# Fit the scaler on Fare, then write the transformed values back in place.
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(df_knn[['Fare']])
df_knn[['Fare']] = scaler_minmax.transform(df_knn[['Fare']])

print(df_knn.head(10))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

In [36]:

# Equal-width binning of Age into four labelled buckets.
df_knn['age_bins_equal'] = pd.cut(
    df_knn['Age'],
    bins=4,
    labels=['Young', 'Adult', 'Middle-aged', 'Senior'],
)
print("\nEqual-width binning for age:")
print(df_knn[['Age', 'age_bins_equal']].head(10))

# Encode Sex and the age bins as integer codes.
# Both inputs are read from df_knn: 'age_bins_equal' only exists on df_knn, so
# reading it from the raw frame `df` raises KeyError on a fresh kernel.
# Sex gets its own encoder so its fitted classes are not overwritten by the
# second fit.
sex_le = LabelEncoder()
df_knn['Sex_encoded'] = sex_le.fit_transform(df_knn['Sex'])

# `le` ends up fitted on the age bins, as before.
# NOTE: LabelEncoder sorts its classes, so the integer codes follow the
# alphabetical order of the labels, not the age order of the bins.
le = LabelEncoder()
df_knn['Age_encoded'] = le.fit_transform(df_knn['age_bins_equal'])

print(df_knn.head(10))


Equal-width binning for age:
    Age age_bins_equal
0  22.0          Adult
1  38.0          Adult
2  26.0          Adult
3  35.0          Adult
4  35.0          Adult
5  26.0          Adult
6  54.0    Middle-aged
7   2.0          Young
8  27.0          Adult
9  14.0          Young
   PassengerId  Survived  Pclass  \
0          1.0       0.0     3.0   
1          2.0       1.0     1.0   
2          3.0       1.0     3.0   
3          4.0       1.0     1.0   
4          5.0       0.0     3.0   
5          6.0       0.0     3.0   
6          7.0       0.0     1.0   
7          8.0       0.0     3.0   
8          9.0       1.0     3.0   
9         10.0       1.0     2.0   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0    1.0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0    1.0   
2                             Heikkinen, Miss. Laina  female  26.0    0.0   
3       Futr