In [1]:
############# Import necessary libraries #############

import pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer #imputing the missing values for Age
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
############# Load the Titanic dataset #############

# Display settings: show every column, use a 500-character display width,
# and render floats with four decimals.
display_options = {
    'display.max_columns': None,
    'display.width': 500,
    'display.float_format': lambda x: '%.4f' % x,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# header=3 makes pandas take the column labels from row index 3 (zero-based);
# the first two columns are then discarded by position.
titanic_raw = pd.read_csv("dataset/titanic.csv", header=3)
leading_columns = titanic_raw.columns[[0, 1]]
df = titanic_raw.drop(columns=leading_columns)

df.head()

In [None]:
############# Preprocessing Step (handle missing values: median imputation for Age) #############

df.info()  # author's observation: Age is the only column with missing values
print(df.isnull().sum())   # per-column missing counts (author's observation: 4 missing in Age)

# Plot only the observed ages: the column still contains NaN at this point, and
# dropping them keeps the histogram independent of how the plotting backend
# treats missing values.
plt.hist(df['Age'].dropna(), bins=20, color='skyblue', edgecolor='black')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# Fill the missing ages with the column median, as this step's header calls for
# (the previous code used the mean, contradicting the header).
# NOTE(review): the imputer is fitted on the full frame, i.e. before the
# train/test split further down, so test rows contribute to the fill value.
# Fit on the training split only if a strict hold-out is required.
imputer = SimpleImputer(strategy="median")
df["Age"] = imputer.fit_transform(df[["Age"]])

In [None]:
############# Apply 1-Hot Encoding #############

# Drop the identifier-like columns (passenger id, name, ticket) before encoding.
df = df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

# LabelEncoder maps the sorted distinct labels to 0..n_classes-1, so the codes
# start at 0 (not 1). One encoder per column keeps each fitted mapping
# (`classes_`) available afterwards; re-fitting a single shared instance would
# overwrite the Survived mapping with the Sex one.
survived_encoder = LabelEncoder()
df['Survived'] = survived_encoder.fit_transform(df['Survived'])  # mapping: see survived_encoder.classes_

sex_encoder = LabelEncoder()
# Assuming the raw labels are 'female'/'male', this yields female -> 0, male -> 1
# (confirm with sex_encoder.classes_).
df['Sex'] = sex_encoder.fit_transform(df['Sex'])
label_encoder = sex_encoder  # original name preserved; as before, it ends up fitted on Sex

# One-hot encode Embarked: one indicator column per distinct value.
df = pd.get_dummies(df, columns=["Embarked"])

# Preview the first rows instead of printing the whole frame.
df.head()

In [None]:
############# Separate features and target #############

# Prediction target: the Survived column on its own.
y = df['Survived']
# Feature matrix: every remaining column of the frame.
X = df.drop(columns=['Survived'])

In [None]:
############# Split the dataset into an 80-20 training-test set #############

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
############ Create an instance of the StandardScaler class #############
# Only construct the scaler here. It is fitted in the following step, where the
# training features are fitted and transformed together; a separate fit in this
# cell would just be recomputed and discarded there.
scaler = StandardScaler()

In [None]:
############# Fit the StandardScaler on the features from the training set and transform it #############
# Fit the scaler on the training features, then transform those same features
# with the freshly fitted scaler.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
############# Apply the transform to the test set #############
# Transform only (no re-fit here): the test features are scaled with whatever
# statistics the scaler already holds from its earlier fit.
X_test_scaled = scaler.transform(X_test)

In [None]:
############# Print the scaled training and test datasets #############
# Each call prints the title and the array on separate lines (sep="\n").
print("Scaled Training Dataset:", X_train_scaled, sep="\n")
print("\nScaled Test Dataset:", X_test_scaled, sep="\n")