## Data Cleansing Process

#### Deal with NaN values or missing values
- a missing value (NaN) is an entry in the data set that has no recorded value
- if values are missing, the data set cannot be used directly for further processing
- methods
  - remove the row(s) having NaN values
  - remove the column(s) having NaN values
  - replace the NaN values with the mean, mode or median

In [1]:
# import required packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# prerequisites

import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides pandas deprecation/future warnings - consider narrowing the filter.
warnings.filterwarnings(action='ignore')

In [3]:
# load the dataset from titanic.csv (the path is relative to the working directory)
df=pd.read_csv('titanic.csv')
# preview the first five rows
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### EDA

In [4]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [5]:
# dtype and non-null count per column; a non-null count below the number of
# entries means the column contains missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [6]:
# list the column names
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [7]:
# count the missing (NaN) values in each column
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [8]:
# axis = 0 for rows
# dropna() removes the row(s) containing at least one NA record and
# returns a new data frame (the original DF does NOT get affected)
# NOTE: on this dataset the result is empty, i.e. every row has at least
# one missing value, so dropping rows this way discards all the data
df_new = df.dropna(axis=0)
df_new

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


In [10]:
# instead of returning a new df, modify or update the existing df
# df.dropna(axis=0, inplace=True)

In [11]:
# axis = 1 for columns
# dropna() removes the column(s) containing at least one NA record and
# returns a new data frame (the original DF does NOT get affected)
# NOTE: this rebinds df_new, so the row-wise result from the earlier cell is
# no longer available under that name
# only the columns without any missing value are kept
df_new = df.dropna(axis=1)
df_new

Unnamed: 0,pclass,survived,name,sex,sibsp,parch,ticket
0,1,1,"Allen, Miss. Elisabeth Walton",female,0,0,24160
1,1,1,"Allison, Master. Hudson Trevor",male,1,2,113781
2,1,0,"Allison, Miss. Helen Loraine",female,1,2,113781
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,1,2,113781
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,1,2,113781
...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,1,0,2665
1305,3,0,"Zabour, Miss. Thamine",female,1,0,2665
1306,3,0,"Zakarian, Mr. Mapriededer",male,0,0,2656
1307,3,0,"Zakarian, Mr. Ortin",male,0,0,2670


In [12]:
# instead of returning a new df, modify or update the existing df
# df.dropna(axis=1, inplace=True)

### replace the NA values with mean, mode or median

In [13]:
# mean of the age column (missing values are skipped by default)
df['age'].mean()
np.float64(29.8811345124283)
# Fill the missing ages with the column mean and assign the result back to the column.
# The in-place variant below is kept for reference only: calling an inplace method on a
# selected column (df['age']) is chained assignment, which recent pandas versions warn
# about and which will not update df under copy-on-write.
# df['age'].fillna(df['age'].mean(), inplace=True)
df['age'] = df['age'].fillna(df['age'].mean())

#### replace the NA values with Mode of respective columns

In [14]:
# mode() returns a Series holding the most frequent value(s) (there can be ties);
# [0] takes the first one
cabin_mode = df['cabin'].mode()[0]
# display the stored value instead of recomputing the mode, so the value shown
# here is exactly the one used to fill the missing cabins
cabin_mode

'C23 C25 C27'

In [15]:
# fill the missing cabin values with the most frequent cabin
# NOTE(review): cabin was missing in 1014 of the 1309 rows, so after this step
# most rows carry the same imputed value - treat the column with care
df['cabin'] = df['cabin'].fillna(cabin_mode)

In [16]:
# re-check the missing-value counts after the age and cabin imputation
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
cabin           0
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64