# Pandas Day 5

# Data Filtration

### `Data filtration` refers to the process of selecting, isolating, or extracting specific subsets of data from a larger dataset based on certain conditions or criteria. This process is commonly used in data analysis to focus on relevant data, remove noise, or prepare data for further analysis. 

In [1]:
# Importing Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Fetch the Titanic passenger data from seaborn's example datasets
df = sns.load_dataset("titanic")
# Preview the top five records
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [3]:
# Inspect which columns are available in the dataset
column_index = df.columns
print(column_index)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')


In [4]:
# Count the missing entries in every column
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [5]:
# Replace the missing ages with the average of the known ages
mean_age = df["age"].mean()
df["age"] = df["age"].fillna(mean_age)
# Re-count the missing entries per column to confirm 'age' is now complete
missing_per_column = df.isnull().sum()
print(missing_per_column)

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [6]:
# Split the 'age' column into 7 labelled age ranges
labels = ["Infants", "Toddlers", "Kids", "Teens", "Youngs", "Middle Aged", "Old"]
bins = [0, 1, 5, 12, 18, 30, 50, 80]  # 8 edges delimit the 7 ranges
# Preview the binned ages; the result is displayed but not stored
pd.cut(x=df["age"], bins=bins, labels=labels)

0           Youngs
1      Middle Aged
2           Youngs
3      Middle Aged
4      Middle Aged
          ...     
886         Youngs
887         Youngs
888         Youngs
889         Youngs
890    Middle Aged
Name: age, Length: 891, dtype: category
Categories (7, object): ['Infants' < 'Toddlers' < 'Kids' < 'Teens' < 'Youngs' < 'Middle Aged' < 'Old']

In [7]:
# Create a new column in the dataset derived from an existing column
bins = [0, 1, 5, 12, 18, 30, 50, 80]
labels = ["Infants", "Toddlers", "Kids", "Teens", "Youngs", "Middle Aged", "Old"]
# Bin the 'age' column and store the resulting labels as 'Age_groups'
age_groups = pd.cut(df["age"], bins=bins, labels=labels)
df["Age_groups"] = age_groups
# Preview the top five records, now including the new column
print(df.head())


   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone   Age_groups  
0    man        True  NaN  Southampton    no  False       Youngs  
1  woman       False    C    Cherbourg   yes  False  Middle Aged  
2  woman       False  NaN  Southampton   yes   True       Youngs  
3  woman       False    C  Southampton   yes  False  Middle Aged  
4    man        True  NaN  Southampton    no   True  Middle Aged  


In [8]:
# Keep only the columns of interest: survived, Age_groups, fare and class
selected_columns = ["survived", "Age_groups", "fare", "class"]
df_1 = df[selected_columns]
# Preview the top five records of the reduced dataset
print(df_1.head())

   survived   Age_groups     fare  class
0         0       Youngs   7.2500  Third
1         1  Middle Aged  71.2833  First
2         1       Youngs   7.9250  Third
3         1  Middle Aged  53.1000  First
4         0  Middle Aged   8.0500  Third


In [9]:
# Print a summary of df_1: each column's non-null count and dtype, plus memory usage
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    891 non-null    int64   
 1   Age_groups  891 non-null    category
 2   fare        891 non-null    float64 
 3   class       891 non-null    category
dtypes: category(2), float64(1), int64(1)
memory usage: 16.3 KB


In [10]:
# Row-level filtering: keep only the passengers travelling in "First" class
is_first_class = df_1["class"] == "First"
df_2 = df_1[is_first_class]
# Preview the top five records of the filtered dataset
df_2.head()

Unnamed: 0,survived,Age_groups,fare,class
1,1,Middle Aged,71.2833,First
3,1,Middle Aged,53.1,First
6,0,Old,51.8625,First
11,1,Old,26.55,First
23,1,Youngs,35.5,First


In [11]:
# Keep only the rows whose "survived" value is 1, i.e. the passengers who survived
survived_mask = df_1["survived"] == 1
df_2 = df_1[survived_mask]
df_2.head()

Unnamed: 0,survived,Age_groups,fare,class
1,1,Middle Aged,71.2833,First
2,1,Youngs,7.925,Third
3,1,Middle Aged,53.1,First
8,1,Youngs,11.1333,Third
9,1,Teens,30.0708,Second


In [12]:
# Keep only the survivors whose "Age_groups" value is "Infants".
# Both conditions are applied to df_1 directly instead of re-filtering df_2,
# so the result does not depend on which filter df_2 held before this cell
# ran, nor on how many times the cell is re-run.
df_2 = df_1[(df_1["survived"] == 1) & (df_1["Age_groups"] == "Infants")]
# Count the rows per "Age_groups" category (every category is listed, so the others show 0)
df_2["Age_groups"].value_counts()

Age_groups
Infants        12
Toddlers        0
Kids            0
Teens           0
Youngs          0
Middle Aged     0
Old             0
Name: count, dtype: int64

In [None]:
# Combine several conditions into a single row filter
survived = df_1["survived"] == 1
infant = df_1["Age_groups"] == "Infants"
second_class = df_1["class"] == "Second"
fare_below_300 = df_1["fare"] < 300
df_3 = df_1[survived & infant & second_class & fare_below_300]
# Preview the top five records of the filtered dataset
df_3.head()

Unnamed: 0,survived,Age_groups,fare,class
78,1,Infants,29.0,Second
183,1,Infants,39.0,Second
755,1,Infants,14.5,Second
827,1,Infants,37.0042,Second
831,1,Infants,18.75,Second


# 🎉 Mission Complete! 🚀✨