In [3]:
import pandas as pd
import numpy as np

# Loading of dataset

Loading the csv file into pandas and converting it into a dataframe

In [4]:
df = pd.read_csv("train.csv")

# 1. Basic info 

In [5]:
print("Initial Shape", df.shape)
print("\nBasic Info:")
print(df.info())
print(df.isnull().sum())

Initial Shape (891, 12)

Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin       

# 2. Handle Missing Values

In [6]:
# Fill Age missing values with median

df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill Embarked missing values with mode

.mode()[0]
We access the first most frequent value using [0].
Even if there are multiple values with the same frequency, [0] ensures we pick the first one.


In [7]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode())

# Drop Cabin (too many missing values)


In [8]:
df=df.drop(columns=['Cabin'])

# ==============================
# 3. Drop Irrelevant Features
# ==============================

In [9]:

df.drop(columns=['PassengerId', 'Ticket', 'Name'])

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,28.0,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


# ==============================
# 4. Encoding Categorical Features
# ==============================

In [10]:
# Encode Sex: male -> 0, female -> 1

df['Sex']= df['Sex'].map({'male': 0, 'female':1})

# Encode Embarked(Boarding point) : 

df['Embarked']= df['Embarked'].map({'C': 0, 'Q':1, 'S':2})

# ==============================
# 5. Feature Engineering
# ==============================


In [11]:
# Create FamilySize: SibSp + Parch + 1 (self)
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Create IsAlone: 1 if FamilySize == 1, else 0
df['IsAlone'] = np.where(df['FamilySize'] == 1, 1, 0)

# ==============================
# 6. Remove Duplicates
# ==============================

In [12]:
df.drop_duplicates(inplace=True)

# ==============================
# 7. Final Checks
# ==============================


In [13]:
print("\nCleaned Data Info:")
print(df.info())
print("\nCleaned Dataset Preview:")
print(df.head())


Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    float64
 11  FamilySize   891 non-null    int64  
 12  IsAlone      891 non-null    int64  
dtypes: float64(3), int64(8), object(2)
memory usage: 90.6+ KB
None

Cleaned Dataset Preview:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3      

# ==============================
# 8. Save Cleaned Dataset
# ==============================

In [14]:
df.to_csv('cleaned_titanic.csv', index= False)
print("\n✅ Cleaned data saved as 'cleaned_titanic.csv'")


✅ Cleaned data saved as 'cleaned_titanic.csv'
