In [1]:
# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Q1. Data Loading and Exploration
# ----------------------------------------------

# 1. Load the Titanic dataset using Seaborn
titanic = sns.load_dataset('titanic')

In [3]:
# 2. Display the first 10 rows of the dataset
print("First 10 rows of the dataset:")
print(titanic.head(10))

First 10 rows of the dataset:
   survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0         0       3    male  22.0      1      0   7.2500        S   Third   
1         1       1  female  38.0      1      0  71.2833        C   First   
2         1       3  female  26.0      0      0   7.9250        S   Third   
3         1       1  female  35.0      1      0  53.1000        S   First   
4         0       3    male  35.0      0      0   8.0500        S   Third   
5         0       3    male   NaN      0      0   8.4583        Q   Third   
6         0       1    male  54.0      0      0  51.8625        S   First   
7         0       3    male   2.0      3      1  21.0750        S   Third   
8         1       3  female  27.0      0      2  11.1333        S   Third   
9         1       2  female  14.0      1      0  30.0708        C  Second   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman      

In [4]:
# 3. Find the total number of rows and columns
print("\nDataset shape (rows, columns):", titanic.shape)


Dataset shape (rows, columns): (891, 15)


In [5]:
# 4. Display column names, data types, and non-null counts
print("\nDataset information:")
print(titanic.info())


Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


In [6]:
# 5. Identify the number of missing values in each column
print("\nMissing values in each column:")
print(titanic.isnull().sum())



Missing values in each column:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [7]:
# 6. Insights
print("\nInsights:")
print("""
• The dataset contains information about 891 passengers.
• There are 15 columns with mixed data types: numeric, categorical, and boolean.
• Missing values are present notably in 'age', 'deck', and 'embark_town'.
• The dataset includes survival information and passenger demographics.
""")


Insights:

• The dataset contains information about 891 passengers.
• There are 15 columns with mixed data types: numeric, categorical, and boolean.
• Missing values are present notably in 'age', 'deck', and 'embark_town'.
• The dataset includes survival information and passenger demographics.



In [8]:
# Q2. Handling Missing Values
# ----------------------------------------------

# 1. List columns with missing values
missing_cols = titanic.columns[titanic.isnull().any()].tolist()
print("\nColumns with missing values:", missing_cols)



Columns with missing values: ['age', 'embarked', 'deck', 'embark_town']


In [9]:
# 2. Handle missing values in specific columns
# Fill 'age' with median
titanic['age'].fillna(titanic['age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['age'].fillna(titanic['age'].median(), inplace=True)


In [10]:
# Fill 'embarked' and 'embark_town' with mode
titanic['embarked'].fillna(titanic['embarked'].mode()[0], inplace=True)
titanic['embark_town'].fillna(titanic['embark_town'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['embarked'].fillna(titanic['embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['embark_town'].fillna(titanic['embark_town'].mode()[0], inplace=True)


In [11]:
# Drop 'deck' column (too many missing values)
titanic.drop(columns=['deck'], inplace=True)


In [12]:

# 3. Verify that missing values are handled
print("\nMissing values after handling:")
print(titanic.isnull().sum())


Missing values after handling:
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


In [13]:
# Q3. Handling Duplicate Data
# ----------------------------------------------

# 1. Check for duplicate rows
duplicates = titanic.duplicated().sum()
print("\nNumber of duplicate rows:", duplicates)


Number of duplicate rows: 116


In [14]:
# 2. Remove duplicates if any
if duplicates > 0:
    titanic.drop_duplicates(inplace=True)
print("New dataset shape after removing duplicates:", titanic.shape)


New dataset shape after removing duplicates: (775, 14)


In [15]:
# Q4. Encoding Categorical Variables
# ----------------------------------------------

# 1. Identify categorical columns
categorical_cols = titanic.select_dtypes(include=['object', 'category', 'bool']).columns
print("\nCategorical columns:", list(categorical_cols))



Categorical columns: ['sex', 'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive', 'alone']


In [16]:

# 2. Convert 'sex' column to numeric form
titanic['sex'] = titanic['sex'].map({'male': 0, 'female': 1})

In [17]:
# 3. Apply one-hot encoding on selected categorical columns
titanic = pd.get_dummies(titanic, columns=['embarked', 'class', 'who', 'alone', 'adult_male'], drop_first=True)


In [18]:
print("\nColumns after encoding:")
print(titanic.columns)



Columns after encoding:
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embark_town', 'alive', 'embarked_Q', 'embarked_S', 'class_Second',
       'class_Third', 'who_man', 'who_woman', 'alone_True', 'adult_male_True'],
      dtype='object')


In [19]:

# 4. Explanation
print("""
Encoding categorical variables is necessary because:
• Machine learning models require numerical input.
• It helps the model understand categorical distinctions.
• Prevents bias caused by arbitrary numerical labeling.
""")


Encoding categorical variables is necessary because:
• Machine learning models require numerical input.
• It helps the model understand categorical distinctions.
• Prevents bias caused by arbitrary numerical labeling.



In [20]:
# Q5. Feature Scaling
# ----------------------------------------------

# 1. Explanation
print("""
Feature scaling ensures all numeric features are on a similar scale.
It improves convergence in gradient-based algorithms and avoids dominance by features with large ranges.
""")


Feature scaling ensures all numeric features are on a similar scale.
It improves convergence in gradient-based algorithms and avoids dominance by features with large ranges.



In [21]:
# 2. Apply StandardScaler to 'age' and 'fare'
scaler = StandardScaler()
titanic[['age_scaled', 'fare_scaled']] = scaler.fit_transform(titanic[['age', 'fare']])


In [22]:
# 3. Compare scaled vs. original values
print("\nOriginal vs Scaled comparison (first 5 rows):")
print(titanic[['age', 'age_scaled', 'fare', 'fare_scaled']].head())


Original vs Scaled comparison (first 5 rows):
    age  age_scaled     fare  fare_scaled
0  22.0   -0.551060   7.2500    -0.527515
1  38.0    0.611945  71.2833     0.695086
2  26.0   -0.260308   7.9250    -0.514627
3  35.0    0.393881  53.1000     0.347909
4  35.0    0.393881   8.0500    -0.512240


In [23]:
# Q6. Feature Engineering
# ----------------------------------------------

# 1. Create family_size = sibsp + parch + 1
titanic['family_size'] = titanic['sibsp'] + titanic['parch'] + 1


In [24]:
# 2. Create is_child column (True if age < 18)
titanic['is_child'] = titanic['age'] < 18


In [25]:
# 3. Explanation
print("""
New features help prediction:
• family_size: Larger families might affect survival chances.
• is_child: Children may have been given evacuation priority.
""")



New features help prediction:
• family_size: Larger families might affect survival chances.
• is_child: Children may have been given evacuation priority.



In [26]:
# Q7. Dropping Irrelevant Columns
# ----------------------------------------------

# 1. Identify and drop irrelevant columns
irrelevant_cols = ['alive', 'embark_town']
titanic.drop(columns=irrelevant_cols, inplace=True)


In [27]:

# 2. Justification
print("""
Dropped columns:
• 'alive': Duplicate information (same as 'survived').
• 'embark_town': Redundant since 'embarked' already encodes this info.
""")



Dropped columns:
• 'alive': Duplicate information (same as 'survived').
• 'embark_town': Redundant since 'embarked' already encodes this info.



In [28]:
# Q8. Final Data Verification
# ----------------------------------------------

print("\nCleaned dataset (first 5 rows):")
print(titanic.head())

print("\nMissing values check after cleaning:")
print(titanic.isnull().sum())

print("\nDuplicate rows after cleaning:", titanic.duplicated().sum())

print("\nData types after cleaning:")
print(titanic.dtypes)

print("""
Summary of Cleaning Steps:
• Loaded dataset and explored structure.
• Handled missing values using median/mode.
• Removed duplicates.
• Encoded categorical columns.
• Scaled numeric columns.
• Engineered new features.
• Dropped irrelevant columns.
""")


Cleaned dataset (first 5 rows):
   survived  pclass  sex   age  sibsp  parch     fare  embarked_Q  embarked_S  \
0         0       3    0  22.0      1      0   7.2500       False        True   
1         1       1    1  38.0      1      0  71.2833       False       False   
2         1       3    1  26.0      0      0   7.9250       False        True   
3         1       1    1  35.0      1      0  53.1000       False        True   
4         0       3    0  35.0      0      0   8.0500       False        True   

   class_Second  class_Third  who_man  who_woman  alone_True  adult_male_True  \
0         False         True     True      False       False             True   
1         False        False    False       True       False            False   
2         False         True    False       True        True            False   
3         False        False    False       True       False            False   
4         False         True     True      False        True             Tr

In [29]:
# ----------------------------------------------
# Q9. Reflection / Discussion
# ----------------------------------------------

print("""
Reflection:
1. Data cleaning is critical because it removes noise, handles missing values, and ensures consistent data formats — improving model accuracy.
2. Handling missing values had the greatest impact, as missing 'age' and 'deck' could have misled analyses.
3. For large real-world datasets, automation, distributed processing (e.g., Spark), and scalable imputation techniques would be used.
""")


Reflection:
1. Data cleaning is critical because it removes noise, handles missing values, and ensures consistent data formats — improving model accuracy.
2. Handling missing values had the greatest impact, as missing 'age' and 'deck' could have misled analyses.
3. For large real-world datasets, automation, distributed processing (e.g., Spark), and scalable imputation techniques would be used.

