In [6]:
# ============================
# Titanic Data Preprocessing
# Beginner Friendly Version
# ============================

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# ---------------------------
# Load Dataset
# ---------------------------
titanic_data = pd.read_csv('train.csv')

print("--- Initial Data Preview ---")
print(titanic_data.head())

print("\n--- Dimensions of the Original DataFrame ---")
print("Original DataFrame dimensions:", titanic_data.shape)

# ---------------------------
# 🌟 Exercise 1: Duplicates
# ---------------------------
print("\n--- Checking for Duplicates ---")
print("Number of duplicate rows:", titanic_data.duplicated().sum())

df_cleaned = titanic_data.drop_duplicates()
print("Dimensions after removing duplicates:", df_cleaned.shape)

# ---------------------------
# 🌟 Exercise 2: Missing Values
# ---------------------------
print("\n--- Identifying Missing Values ---")
missing_values_count = df_cleaned.isnull().sum()
columns_with_missing_data = missing_values_count[missing_values_count > 0]
print(columns_with_missing_data)

# Drop 'Cabin' (too many missing values)
df_cleaned.drop('Cabin', axis=1, inplace=True)
print("\nDropped 'Cabin' column.")

# Fill 'Age' with median (using SimpleImputer)
imputer_age = SimpleImputer(strategy='median')
df_cleaned['Age'] = imputer_age.fit_transform(df_cleaned[['Age']]).ravel()
print(f"Filled missing Age with median: {imputer_age.statistics_[0]}")

# Fill 'Embarked' with most frequent value
imputer_embarked = SimpleImputer(strategy='most_frequent')
df_cleaned['Embarked'] = imputer_embarked.fit_transform(df_cleaned[['Embarked']]).ravel()
print("Filled missing Embarked with most frequent value.")

print("\nMissing values after cleaning:")
print(df_cleaned.isnull().sum())

# ---------------------------
# 🌟 Exercise 3: Feature Engineering
# ---------------------------
print("\n--- Feature Engineering ---")

# Family Size
df_cleaned['FamilySize'] = df_cleaned['SibSp'] + df_cleaned['Parch'] + 1

# Title from Name
df_cleaned['Title'] = df_cleaned['Name'].str.extract(' ([A-Za-z]+)\.')

# Encode categorical variables
df_cleaned = pd.get_dummies(df_cleaned, columns=['Sex','Embarked','Title'], drop_first=True)

print("Added FamilySize and Title features.")

# ---------------------------
# 🌟 Exercise 4: Outliers
# ---------------------------
print("\n--- Handling Outliers ---")

fare_cap = df_cleaned['Fare'].quantile(0.98)
df_cleaned['Fare'] = np.where(df_cleaned['Fare'] > fare_cap, fare_cap, df_cleaned['Fare'])
print(f"Capped Fare at 98th percentile: {fare_cap}")

# ---------------------------
# 🌟 Exercise 5: Scaling
# ---------------------------
print("\n--- Scaling Features ---")

scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

df_cleaned['Age_scaled'] = scaler_standard.fit_transform(df_cleaned[['Age']]).ravel()
df_cleaned['Fare_scaled'] = scaler_minmax.fit_transform(df_cleaned[['Fare']]).ravel()

print("Created Age_scaled (StandardScaler) and Fare_scaled (MinMaxScaler).")

# ---------------------------
# 🌟 Exercise 6: Final Encoding & Cleanup
# ---------------------------
print("\n--- Final Encoding & Cleanup ---")

df_cleaned = df_cleaned.drop(columns=['PassengerId','Name','Ticket'])
print("Dropped PassengerId, Name, Ticket.")

# ---------------------------
# 🌟 Exercise 7: Age Groups
# ---------------------------
print("\n--- Age Groups ---")

bins = [0, 12, 18, 60, 100]
labels = ['Child','Teen','Adult','Senior']
df_cleaned['AgeGroup'] = pd.cut(df_cleaned['Age'], bins=bins, labels=labels)

df_cleaned = pd.get_dummies(df_cleaned, columns=['AgeGroup'], drop_first=True)
print("Created AgeGroup categories.")

# ---------------------------
# Final Check
# ---------------------------
print("\n--- Final Dataset Info ---")
print("Final dataset shape:", df_cleaned.shape)
print(df_cleaned.head())


--- Initial Data Preview ---
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450

  df_cleaned['Title'] = df_cleaned['Name'].str.extract(' ([A-Za-z]+)\.')
