# Load Libraries and Dataset

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Load the Titanic Dataset from Seaborn
titanic = sns.load_dataset("titanic")

# Display the Dataset
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# Define Columns for Cleaning and Preprocessing

In [2]:
# Define Numerical and Categorical Columns
numerical_columns = ['age', 'fare', 'survived', 'pclass', 'sibsp', 'parch']
categorical_columns = ['sex', 'embarked', 'class', 'who', 'deck', 'embark_town', 'alive', 'alone']

# Function to Clean and Preprocess the Dataset

In [3]:
def clean_and_preprocess(df):
    # Convert Categorical Columns to Strings before Filling Missing Values
    df[categorical_columns] = df[categorical_columns].astype(str)

    # Handle Missing Values
    imputer = SimpleImputer(strategy="mean")
    df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
    df[categorical_columns] = df[categorical_columns].fillna("Unknown")

    # Normalize Numerical Columns
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    # Encode Categorical Columns
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
    encoded_cols = encoder.fit_transform(df[categorical_columns])
    # Get Feature Names from encoder.categories_
    feature_names = encoder.get_feature_names_out(categorical_columns)
    encoded_cols = pd.DataFrame(encoded_cols, columns=feature_names)

    # Concatenate Encoded Columns with Original DataFrame
    df = pd.concat([df.drop(categorical_columns, axis=1), encoded_cols], axis=1)

    return df


# Clean and Preprocess the Titanic Dataset
titanic_cleaned = clean_and_preprocess(titanic)

# Show the Cleaned Dataset
titanic_cleaned



Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,sex_female,sex_male,embarked_C,...,deck_G,deck_nan,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,embark_town_nan,alive_no,alive_yes,alone_False,alone_True
0,-0.789272,0.827377,-0.592481,0.432793,-0.473674,-0.502445,True,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,1.266990,-1.566107,0.638789,0.432793,-0.473674,0.786845,False,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1.266990,0.827377,-0.284663,-0.474545,-0.473674,-0.488854,False,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.266990,-1.566107,0.407926,0.432793,-0.473674,0.420730,False,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,-0.789272,0.827377,0.407926,-0.474545,-0.473674,-0.486337,True,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.789272,-0.369365,-0.207709,-0.474545,-0.473674,-0.386671,True,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
887,1.266990,-1.566107,-0.823344,-0.474545,-0.473674,-0.044381,False,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
888,-0.789272,0.827377,0.000000,0.432793,2.008933,-0.176263,False,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
889,1.266990,-1.566107,-0.284663,-0.474545,-0.473674,-0.044381,True,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
