# Handling Missing Data in Python (Pandas)
In this notebook, we explore how to detect, quantify, and handle missing data with pandas, using mean and median imputation on a toy Titanic dataset.

In [2]:
import pandas as pd 
import numpy as np 


In [3]:
# Read the toy Titanic CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='titanic_toy.csv')

In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Age,Fare,Family,Survived
0,22.0,7.25,1,0
1,38.0,71.2833,1,1
2,26.0,7.925,0,1
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [5]:
# Summarise each column's dtype and non-null count; a non-null count below
# the number of entries means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Fare      846 non-null    float64
 2   Family    891 non-null    int64  
 3   Survived  891 non-null    int64  
dtypes: float64(2), int64(2)
memory usage: 28.0 KB


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(891, 4)

In [7]:
# Count the missing entries in every column.
df.isna().sum()

Age         177
Fare         45
Family        0
Survived      0
dtype: int64

In [8]:
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [10]:
# Separate the target label (y) from the remaining feature columns (X).
y = df['Survived']
X = df.drop('Survived', axis=1)

In [11]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [12]:
# Imputation statistics, computed from the training split only.
# pandas skips NaN values by default when computing mean/median.
mean_age, median_age = X_train['Age'].mean(), X_train['Age'].median()
mean_fare, median_fare = X_train['Fare'].mean(), X_train['Fare'].median()

In [14]:
# Add mean- and median-imputed variants of Age and Fare next to the originals
# so the strategies can be compared side by side.
#
# .assign() returns a new frame instead of writing columns into the object
# handed back by train_test_split, which can trigger pandas'
# SettingWithCopyWarning. The cell is also safe to re-run: the original
# Age/Fare columns are never overwritten, so the result is the same each time.
X_train = X_train.assign(
    Age_median=X_train['Age'].fillna(median_age),
    Age_mean=X_train['Age'].fillna(mean_age),
    Fare_median=X_train['Fare'].fillna(median_fare),
    Fare_mean=X_train['Fare'].fillna(mean_fare),
)

In [15]:
X_train.sample(5)

Unnamed: 0,Age,Fare,Family,Age_median,Age_mean,Fare_median,Fare_mean
75,25.0,7.65,0,25.0,25.0,7.65,7.65
668,43.0,8.05,0,43.0,43.0,8.05,8.05
94,59.0,7.25,0,59.0,59.0,7.25,7.25
367,,7.2292,0,28.75,29.785904,7.2292,7.2292
233,5.0,31.3875,6,5.0,5.0,31.3875,31.3875
