# # AI Bootcamp Assignment 1 Solution:

In [2]:
# import libs and packs:

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

# task 01: Load the Dataset
<ol>

<li>Import necessary libraries</li>
<li>Load the dataset</li>
<li>Display the first five rows of the dataset </li>
</ol>

**Expected Output: The first five rows of the dataset.**


In [5]:
#using pandas
df = pd.read_csv('datasets/titanic/train.csv') # load datasets

df.shape

#using seaborn's built in titanic

df = sns.load_dataset('titanic')
print("Dataset loaded successfully")
print(f"Dataset shape: {df.shape}")

Dataset loaded successfully
Dataset shape: (891, 15)


In [6]:
df.head() #print first 5 rows.

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# task 02 : Handle Missing Values

<ol>
<li>Identify missing values in each column </li>
<li>Drop columns with too many missing values (threshold: more than 50% missing) </li>
<li>Fill missing numerical values with the median of the respective column </li>
<li>Fill missing categorical values with the most frequent value (mode) </li>
</ol>

**Expected Output: A cleaned dataset without missing values.**


In [8]:
#1
missing_values = df.isnull().sum()
missing_percentage = (df.isnull().sum() / len(df)) * 100
missing_info = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percentage
})
print(missing_info[missing_info['Missing Count'] > 0])

             Missing Count  Missing Percentage
age                    177           19.865320
embarked                 2            0.224467
deck                   688           77.216611
embark_town              2            0.224467


In [9]:
#2
threshold = 0.5
columns_to_drop = missing_info[missing_info['Missing Percentage'] > 50].index.tolist()
if columns_to_drop:
    print(f"\nColumns to drop (>50% missing): {columns_to_drop}")
    df = df.drop(columns=columns_to_drop)
    print(f"Dropped {len(columns_to_drop)} columns")
else:
    print("\nNo columns have more than 50% missing values")


Columns to drop (>50% missing): ['deck']
Dropped 1 columns


In [10]:
#3
numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumerical columns: {numerical_columns}")

for col in numerical_columns:
    if df[col].isnull().sum() > 0:
        median_value = df[col].median()
        df[col].fillna(median_value, inplace=True)
        print(f"Filled missing values in '{col}' with median: {median_value:.2f}")



Numerical columns: ['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']
Filled missing values in 'age' with median: 28.00


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_value, inplace=True)


In [11]:
#4
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"\nCategorical columns: {categorical_columns}")

for col in categorical_columns:
    if df[col].isnull().sum() > 0:
        mode_value = df[col].mode().iloc[0]
        df[col].fillna(mode_value, inplace=True)
        print(f" Filled missing values in '{col}' with mode: '{mode_value}'")


Categorical columns: ['sex', 'embarked', 'class', 'who', 'embark_town', 'alive']
 Filled missing values in 'embarked' with mode: 'S'
 Filled missing values in 'embark_town' with mode: 'Southampton'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)


In [12]:
print(f"\nMissing values after cleaning:")
remaining_missing = df.isnull().sum().sum()
print(f"Total missing values: {remaining_missing}")


Missing values after cleaning:
Total missing values: 0


# task 03: Handle Duplicate Data
<ol>
    <li>Check for duplicate rows </li>
    <li>Remove duplicate rows </li>
</ol>

**Expected Output: The number of duplicate rows found and removed.**

In [13]:
#1
initial_rows = len(df)
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows found: {duplicate_count}")

Number of duplicate rows found: 116


In [14]:
# 2

if duplicate_count > 0:
    df = df.drop_duplicates()
    final_rows = len(df)
    removed_rows = initial_rows - final_rows
    print(f"Removed {removed_rows} duplicate rows")
    print(f"Dataset shape after removing duplicates: {df.shape}")
else:
    print("No duplicate rows found")

Removed 116 duplicate rows
Dataset shape after removing duplicates: (775, 14)


# task 04: Convert Categorical Features to Numeric
<ol>
<li>Convert categorical columns (sex, embark_town, class, etc.) using one-hot encoding </li>
<li>Convert Boolean columns (alone, who) to numeric (0 and 1)</li>

</ol>

**Expected Output: The dataset with all categorical columns transformed into numeric values.**

In [15]:
#1

df_encoded = df.copy()

# 1. Convert categorical columns using one-hot encoding
categorical_for_encoding = []
for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object' or df_encoded[col].dtype.name == 'category':
        categorical_for_encoding.append(col)

print(f"Categorical columns for one-hot encoding: {categorical_for_encoding}")

if categorical_for_encoding:
    # Apply one-hot encoding
    df_encoded = pd.get_dummies(df_encoded, columns=categorical_for_encoding, prefix=categorical_for_encoding)
    print(f"Applied one-hot encoding to {len(categorical_for_encoding)} columns")



Categorical columns for one-hot encoding: ['sex', 'embarked', 'class', 'who', 'embark_town', 'alive']
Applied one-hot encoding to 6 columns


In [16]:
#2
boolean_columns = df_encoded.select_dtypes(include=[bool]).columns.tolist()
if boolean_columns:
    print(f"Boolean columns to convert: {boolean_columns}")
    for col in boolean_columns:
        df_encoded[col] = df_encoded[col].astype(int)
        print(f"Converted boolean column '{col}' to numeric")
else:
    print("No boolean columns found to convert")

print(f"\nDataset shape after categorical encoding: {df_encoded.shape}")
print(f"All columns are now numeric: {df_encoded.select_dtypes(include=[np.number]).shape[1] == df_encoded.shape[1]}")



Boolean columns to convert: ['adult_male', 'alone', 'sex_female', 'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S', 'class_First', 'class_Second', 'class_Third', 'who_child', 'who_man', 'who_woman', 'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton', 'alive_no', 'alive_yes']
Converted boolean column 'adult_male' to numeric
Converted boolean column 'alone' to numeric
Converted boolean column 'sex_female' to numeric
Converted boolean column 'sex_male' to numeric
Converted boolean column 'embarked_C' to numeric
Converted boolean column 'embarked_Q' to numeric
Converted boolean column 'embarked_S' to numeric
Converted boolean column 'class_First' to numeric
Converted boolean column 'class_Second' to numeric
Converted boolean column 'class_Third' to numeric
Converted boolean column 'who_child' to numeric
Converted boolean column 'who_man' to numeric
Converted boolean column 'who_woman' to numeric
Converted boolean column 'embark_town_Cherbourg' to numeric
Conve

# task 05: Feature Scaling
<ol> 
<li>Normalize numerical features (age, fare, etc.) using Min-Max Scaling </li>
<li>Standardize numerical features using StandardScaler and compare results </li>
</ol> 

**Expected Output: A scaled dataset where all numerical features are normalized/standardized.**

In [17]:
#1

continuous_features = []
for col in df_encoded.columns:
    # Skip binary columns (those with only 0 and 1 values)
    unique_values = df_encoded[col].nunique()
    if unique_values > 2 and df_encoded[col].dtype in [np.float64, np.int64]:
        continuous_features.append(col)

print(f"Continuous features to scale: {continuous_features}")

if continuous_features:
    # 1. Min-Max Scaling
    scaler_minmax = MinMaxScaler()
    df_minmax = df_encoded.copy()
    df_minmax[continuous_features] = scaler_minmax.fit_transform(df_encoded[continuous_features])
    
    print("\nMin-Max Scaling applied")
    print("Sample of Min-Max scaled data:")
    print(df_minmax[continuous_features].head())
    print(f"Min-Max scaled ranges:")
    for col in continuous_features:
        print(f"  {col}: [{df_minmax[col].min():.3f}, {df_minmax[col].max():.3f}]")
    

Continuous features to scale: ['pclass', 'age', 'sibsp', 'parch', 'fare']

Min-Max Scaling applied
Sample of Min-Max scaled data:
   pclass       age  sibsp  parch      fare
0     1.0  0.271174  0.125    0.0  0.014151
1     0.0  0.472229  0.125    0.0  0.139136
2     1.0  0.321438  0.000    0.0  0.015469
3     0.0  0.434531  0.125    0.0  0.103644
4     1.0  0.434531  0.000    0.0  0.015713
Min-Max scaled ranges:
  pclass: [0.000, 1.000]
  age: [0.000, 1.000]
  sibsp: [0.000, 1.000]
  parch: [0.000, 1.000]
  fare: [0.000, 1.000]


In [20]:
#2

scaler_standard = StandardScaler()
if continuous_features:
    df_standard = df_encoded.copy()
    df_standard[continuous_features] = scaler_standard.fit_transform(df_encoded[continuous_features])

    print("\nStandard Scaling applied")
    print("Sample of Standard scaled data:")
    print(df_standard[continuous_features].head())
    print(f"Standard scaled statistics:")
    for col in continuous_features:
        mean_val = df_standard[col].mean()
        std_val = df_standard[col].std()
        print(f"  {col}: mean={mean_val:.3f}, std={std_val:.3f}")



Standard Scaling applied
Sample of Standard scaled data:
     pclass       age     sibsp     parch      fare
0  0.883385 -0.551060  0.475876 -0.500754 -0.527515
1 -1.461216  0.611945  0.475876 -0.500754  0.695086
2  0.883385 -0.260308 -0.534545 -0.500754 -0.514627
3 -1.461216  0.393881  0.475876 -0.500754  0.347909
4  0.883385  0.393881 -0.534545 -0.500754 -0.512240
Standard scaled statistics:
  pclass: mean=-0.000, std=1.001
  age: mean=0.000, std=1.001
  sibsp: mean=-0.000, std=1.001
  parch: mean=0.000, std=1.001
  fare: mean=-0.000, std=1.001


# task 06: Outlier Detection using IQR Method
<ol>
<li>Compute the Interquartile Range (IQR) for numerical features (age, fare, etc.).</li>
<li>Identify outliers using the 1.5 * IQR rule.</li>
<li>Remove or replace outliers with appropriate values (e.g., mean/median).</li>
</ol>

**Expected Output: A dataset where outliers are handled using the IQR method.**

In [22]:
# work with encoders

df_outliers = df_encoded.copy()

if continuous_features:
    print("Outlier detection for continuous features:")

    outlier_info = {}

    for col in continuous_features:
        print(f"\nProcessing column: {col}")

        #1

        Q1 = df_outliers[col].quantile(0.25)
        Q3 = df_outliers[col].quantile(0.75)
        IQR = Q3 - Q1

        print(f"  Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")

        #2

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        print(f"  Outlier bounds: [{lower_bound:.2f}, {upper_bound:.2f}]")

        # Find outliers
        outliers_mask = (df_outliers[col] < lower_bound) | (df_outliers[col] > upper_bound)
        outliers_count = outliers_mask.sum()

        print(f"  Outliers found: {outliers_count}")

        outlier_info[col] = {
            'count': outliers_count,
            'percentage': (outliers_count / len(df_outliers)) * 100,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound
        }


        #3

        if outliers_count > 0:
            median_value = df_outliers[col].median()
            df_outliers.loc[outliers_mask, col] = median_value
            print(f"Replaced {outliers_count} outliers with median: {median_value:.2f}")
        else:
            print("No outliers found")

Outlier detection for continuous features:

Processing column: pclass
  Q1: 1.00, Q3: 3.00, IQR: 2.00
  Outlier bounds: [-2.00, 6.00]
  Outliers found: 0
No outliers found

Processing column: age
  Q1: 21.00, Q3: 36.00, IQR: 15.00
  Outlier bounds: [-1.50, 58.50]
  Outliers found: 27
Replaced 27 outliers with median: 28.00

Processing column: sibsp
  Q1: 0.00, Q3: 1.00, IQR: 1.00
  Outlier bounds: [-1.50, 2.50]
  Outliers found: 39
Replaced 39 outliers with median: 0.00

Processing column: parch
  Q1: 0.00, Q3: 1.00, IQR: 1.00
  Outlier bounds: [-1.50, 2.50]
  Outliers found: 15
Replaced 15 outliers with median: 0.00

Processing column: fare
  Q1: 8.05, Q3: 34.20, IQR: 26.15
  Outlier bounds: [-31.17, 73.42]
  Outliers found: 102
Replaced 102 outliers with median: 15.90


In [24]:
 # Summary of outlier detection

if continuous_features:
    print("OUTLIER DETECTION SUMMARY")


    total_outliers = sum(info['count'] for info in outlier_info.values())
    print(f"Total outliers found and replaced: {total_outliers}")

    for col, info in outlier_info.items():
        if info['count'] > 0:
            print(f"{col}: {info['count']} outliers ({info['percentage']:.1f}%)")

OUTLIER DETECTION SUMMARY
Total outliers found and replaced: 183
age: 27 outliers (3.5%)
sibsp: 39 outliers (5.0%)
parch: 15 outliers (1.9%)
fare: 102 outliers (13.2%)


In [25]:
# FINAL SUMMARY

print("FINAL PREPROCESSING SUMMARY")

print(f"Original dataset shape: {df.shape}")
print(f"Final processed dataset shape: {df_outliers.shape}")
print(f"All missing values handled: {df_outliers.isnull().sum().sum() == 0}")
print(f"All features are numeric: {df_outliers.select_dtypes(include=[np.number]).shape[1] == df_outliers.shape[1]}")
print(f"Duplicate rows removed: {duplicate_count}")
print(f"Outliers handled using IQR method")

print("\nProcessed dataset info:")
print(df_outliers.info())

print("\nFirst 5 rows of final processed dataset:")

print(df_outliers.head())

print("PREPROCESSING COMPLETED SUCCESSFULLY!") 

FINAL PREPROCESSING SUMMARY
Original dataset shape: (775, 14)
Final processed dataset shape: (775, 24)
All missing values handled: True
All features are numeric: True
Duplicate rows removed: 116
Outliers handled using IQR method

Processed dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 775 entries, 0 to 890
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 775 non-null    int64  
 1   pclass                   775 non-null    int64  
 2   age                      775 non-null    float64
 3   sibsp                    775 non-null    int64  
 4   parch                    775 non-null    int64  
 5   fare                     775 non-null    float64
 6   adult_male               775 non-null    int64  
 7   alone                    775 non-null    int64  
 8   sex_female               775 non-null    int64  
 9   sex_male                 775 non-null    int64 