In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training CSV into the working DataFrame.
df = pd.read_csv("/content/train.csv")

In [40]:
# Show the loaded DataFrame as the cell output.
df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# **Looking for the Null values**

In [41]:
# Count the missing values per column and show the counts as one wide row.
# (A bare `df.isnull()` before this line would be computed and discarded,
# because a cell only displays its last expression.)
pd.DataFrame(df.isnull().sum()).T

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0,0,0,0,177,0,0,0,0,687,2


In [42]:
# Per-column null counts (stored in column 0), narrowed to the columns
# that have at least one missing value.
missing_val = pd.DataFrame(df.isnull().sum())
missing_val.loc[missing_val[0].gt(0)]

Unnamed: 0,0
Age,177
Cabin,687
Embarked,2


# **Handling Null values separately**

- Embarked : drop the 2 rows

In [11]:
# Drop, in place, every row whose Embarked value is missing, then show df.
df.dropna(axis="index", subset=["Embarked"], inplace=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [44]:
# Recompute the per-column null counts and list the columns still affected.
missing_val = pd.DataFrame(df.isnull().sum())
missing_val.loc[missing_val[0].gt(0)]

Unnamed: 0,0
Age,177
Cabin,687


- Cabin : Drop the whole column

In [45]:
# Remove the Cabin column from df in place.
df.drop(columns="Cabin", inplace=True)

In [46]:
# Recompute the per-column null counts and list the columns still affected.
missing_val = pd.DataFrame(df.isnull().sum())
missing_val.loc[missing_val[0].gt(0)]

Unnamed: 0,0
Age,177


- Age : replace null values with the median age of the passenger's class (`Pclass`)

In [47]:
# Median Age within each Pclass group.
df.groupby("Pclass")[["Age"]].median()

Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,37.0
2,29.0
3,24.0


In [48]:
# Fill each missing Age with the median Age of the row's Pclass group;
# transform("median") returns one value per row, aligned on the index.
df["Age"] = df["Age"].fillna(
    df.groupby("Pclass")["Age"].transform("median")
)

In [49]:
# Recompute the per-column null counts and list the columns still affected.
missing_val = pd.DataFrame(df.isnull().sum())
missing_val.loc[missing_val[0].gt(0)]

Unnamed: 0,0


# **Filling Null values Function**

In [3]:
# Method to handle missing values in each column with different fill strategies

def handling_missing_values(df, row_drop_threshold=0.05, col_drop_threshold=0.5):
  """Return a cleaned copy of ``df`` with missing values handled per column.

  The share of missing values per column is measured once, on the input
  frame, and then three rules are applied:

  1. Columns whose missing share is strictly between 0 and
     ``row_drop_threshold`` are cheap to clean: every row that is missing a
     value in any of those columns is dropped.
  2. Columns whose missing share is strictly above ``col_drop_threshold``
     are dropped entirely.
  3. Every remaining numeric column has its nulls replaced by that column's
     median, computed after the row drop of rule 1.

  Non-numeric columns whose missing share lies between the two thresholds
  are left untouched. The original row index is kept (no reset).

  The input frame is never modified; use the return value
  (``df = handling_missing_values(df)``).

  Parameters
  ----------
  df : pandas.DataFrame
    Frame to clean.
  row_drop_threshold : float, default 0.05
    Upper bound (exclusive) on a column's missing share for its incomplete
    rows to be dropped.
  col_drop_threshold : float, default 0.5
    Lower bound (exclusive) on a column's missing share for the whole
    column to be dropped.

  Returns
  -------
  pandas.DataFrame
    A new, cleaned DataFrame.
  """
  # Work on a copy so the caller's frame (and anything already displayed
  # from it) stays intact and the cell can be re-run safely.
  cleaned = df.copy()

  # Fraction of missing values per column, measured before any cleaning.
  missing_share = cleaned.isnull().mean()

  few_missing = missing_share[
      (missing_share > 0) & (missing_share < row_drop_threshold)
  ].index
  mostly_missing = missing_share[missing_share > col_drop_threshold].index

  # Rule 1: drop the rows that are incomplete in sparsely-missing columns.
  if len(few_missing) > 0:
    cleaned = cleaned.dropna(subset=list(few_missing))

  # Rule 2: drop mostly-empty columns before filling, so no median is
  # computed for a column that is about to be thrown away.
  if len(mostly_missing) > 0:
    cleaned = cleaned.drop(columns=mostly_missing)

  # Rule 3: median-fill the numeric columns. Passing a Series to fillna
  # fills each column with the value stored under its own label; an all-null
  # column has a NaN median, which is dropped so the column is left as is.
  numeric_cols = cleaned.select_dtypes(include=['number']).columns
  medians = cleaned[numeric_cols].median().dropna()
  if not medians.empty:
    cleaned = cleaned.fillna(medians)

  return cleaned

In [4]:
# Clean the frame with the helper, then print the null count per column.
df = handling_missing_values(df)
print(
    df.isnull().sum()
)

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
