# Day 10 EDA

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the "titanic" example dataset through seaborn, then print a structural
# summary of the resulting DataFrame (columns, dtypes, non-null counts).
dataset_name = "titanic"
df = sns.load_dataset(dataset_name)
df.info()

# 1.Explore Data 

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with defaults.
summary_stats = df.describe()
summary_stats

In [None]:
# Percentage of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column / df.shape[0] * 100

In [None]:
# Heatmap of the boolean missing-value mask (one cell per DataFrame entry),
# drawn without a colour bar.
missing_mask = df.isna()
sns.heatmap(missing_mask, cbar=False)

In [None]:
# Number of distinct values in 'sex', followed by the values themselves.
# A notebook cell only echoes its last expression, so both results are
# printed explicitly; otherwise the nunique() result is silently discarded.
print(df['sex'].nunique())
print(df['sex'].unique())

In [None]:
# Count of distinct values in every column.
distinct_counts = df.nunique()
distinct_counts

In [None]:
# Frequency of each value in the 'embark_town' column.
town_counts = df['embark_town'].value_counts()
town_counts

In [None]:
# Mean fare for every (survived, sex) combination.
by_outcome_and_sex = df.groupby(['survived', 'sex'])
by_outcome_and_sex['fare'].mean()

In [None]:
# Mean fare for every value of 'pclass'.
by_pclass = df.groupby('pclass')
by_pclass['fare'].mean()

# Value Counts

In [None]:
# Count passengers of each sex within every 'who' category.
# 'sex' is the column being counted, so it must not also be a grouping key:
# grouping by it as well leaves a single sex value in every group and repeats
# 'sex' as a redundant extra index level in the result.
df.groupby('who')['sex'].value_counts()

# Correlation

In [None]:
# Pairwise correlation matrix of four numeric passenger columns.
numeric_features = ['fare', 'age', 'sibsp', 'parch']
corr = df[numeric_features].corr()

In [None]:
# Heatmap of the correlation matrix with each coefficient written in its cell,
# using the 'Greens' colour map.
sns.heatmap(data=corr, annot=True, cmap='Greens')

# 2.Imputing Missing Values


In [None]:
# Remove the 'deck' column in place. The original note gives roughly 70 %
# missing values as the reason (compare the per-column missing percentages
# computed earlier in the notebook).
df.drop(columns=['deck'], inplace=True)

In [None]:
# Fill missing ages with the mean age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['age'] selection: that chained in-place form
# raises a FutureWarning in pandas 2.x and leaves df unchanged once
# Copy-on-Write is active.
df['age'] = df['age'].fillna(df['age'].mean())

In [None]:
# Fill missing embarkation values with the most frequent value (mode) of each
# column.
# The results are assigned back rather than using fillna(inplace=True) on an
# attribute-selected column: that chained in-place form raises a FutureWarning
# in pandas 2.x and leaves df unchanged once Copy-on-Write is active.
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

In [None]:
# Re-draw the missing-value heatmap to inspect what is still missing.
remaining_missing = df.isna()
sns.heatmap(remaining_missing)

# Binning (Feature Engineering)

In [None]:
# Histogram of 'age' in 10 bins with a KDE overlay, coloured by 'who'.
# histplot returns the Axes it drew on, so title and axis labels are set on it.
ax = sns.histplot(x='age', hue='who', data=df, bins=10, kde=True, alpha=1)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Bucket the continuous 'age' column into named categories and store them in a
# new 'ageName' column. pd.cut uses right-closed intervals by default, so the
# buckets are (0, 2], (2, 5], ..., (45, 88].
age_edges = [0, 2, 5, 10, 18, 30, 45, 88]
age_names = ['Baby', 'Kid', 'Child', 'Teenager', 'young', 'mature', 'old']
df['ageName'] = pd.cut(df['age'], bins=age_edges, labels=age_names)

In [None]:
# Sanity check of the new column: number of passengers per age category.
df['ageName'].value_counts()

## Rename Column 

In [None]:
# Rename the 'ageName' column to 'age_group' in place.
column_renames = {'ageName': 'age_group'}
df.rename(columns=column_renames, inplace=True)

In [None]:
# Counts of each 'survived' value within every age group.
# 'age_group' was produced by pd.cut and is therefore categorical. Passing
# observed explicitly avoids the FutureWarning pandas >= 2.1 emits for
# categorical groupers and pins the current behaviour against the upcoming
# change of the default.
df.groupby('age_group', observed=False)['survived'].value_counts()

# Data Filtering

In [None]:
# Show the available column labels before selecting a subset of them.
available_columns = df.columns
available_columns

In [None]:
# Build a narrower DataFrame that holds only four of the columns.
kept_columns = ['survived', 'age_group', 'fare', 'class']
df_01 = df[kept_columns]

In [None]:
# Preview the first five rows of the reduced DataFrame.
df_01.head(n=5)

In [None]:
# Number of passengers for each value of the 'class' column.
class_counts = df['class'].value_counts()
class_counts

In [None]:
# Row filter: keep only the rows of df_01 whose 'class' is 'First'.
is_first_class = df_01['class'] == 'First'
df_first = df_01[is_first_class]

In [None]:
# Row filter: keep only the passengers who paid a fare above 200.
# The threshold used to be 100, which contradicted both this description and
# the df_200 variable name.
df_200 = df_01[df_01['fare'] > 200]

In [None]:
# Class breakdown of the high-fare subset stored in df_200.
high_fare_classes = df_200['class']
high_fare_classes.value_counts()

In [None]:
# 'First' class rows with a fare above 70, ordered by ascending fare.
paid_over_70 = df_01['fare'] > 70
in_first_class = df_01['class'] == 'First'
df_01[paid_over_70 & in_first_class].sort_values(by='fare')

# Self-Practice

In [None]:
# 1. Top 5 passengers with the highest fare in 2nd class.
# The filter used to select 'First' class with fare > 50, which answers a
# different question. Select 'Second' class and let the descending sort plus
# head(5) pick the top five.
second_class = df_01[df_01['class'] == 'Second']
second_class.sort_values(by='fare', ascending=False).head(5)

In [None]:
# 2. Number of 3rd-class passengers with a fare above 20.
third_class_over_20 = (df_01['class'] == 'Third') & (df_01['fare'] > 20)
len(df_01[third_class_over_20])

In [None]:
# 3. Average fare of the passengers in every class.
# observed is passed explicitly: pandas >= 2.1 emits a FutureWarning when a
# categorical column is used as a grouper without it, and the default is
# scheduled to change. It has no effect on a non-categorical grouper.
# NOTE(review): assumes 'class' is loaded as a categorical column - confirm
# with df.info().
df_01.groupby("class", observed=False)["fare"].mean()

In [None]:
# 4. Which passengers in the 'First' class have a fare less than 50 and are female?
# Filter the full df: df_01 was built without the 'sex' column, so
# df_01["sex"] raises KeyError. The matching rows are returned (not just their
# count) because the question asks which passengers qualify.
cheap_first_class_women = (
    (df["class"] == 'First') & (df["sex"] == "female") & (df["fare"] < 50)
)
df[cheap_first_class_women]

In [None]:
# 5. What are the top 3 oldest passengers in the 'Third' class?
# Rank on the numeric 'age' column of the full df. df_01 only carries the
# binned 'age_group', and sorting on a bucket cannot order the passengers that
# share the same bucket, so head(3) would not reliably be the three oldest.
third_class = df[df["class"] == "Third"]
third_class.sort_values(by="age", ascending=False).head(3)

In [None]:
# 6. How many female passengers survived in 'First' class with fare > 250?
# The row count of the filtered frame is returned so the cell answers
# "how many" instead of displaying the matching rows.
surviving_first_class_women = (
    (df['survived'] == 1)
    & (df['sex'] == 'female')
    & (df['class'] == 'First')
    & (df['fare'] > 250)
)
df[surviving_first_class_women].shape[0]

In [None]:
# 7. Sort the dataset by age, oldest first, and show 'embarked', 'age' and
# 'fare' for the first five rows.
oldest_first = df.sort_values(by='age', ascending=False)
oldest_first[['embarked', 'age', 'fare']].head(5)

In [None]:
# 8. Create a new column that categorises the fare into 4 categories.
# pd.cut returns NaN for values outside the bin edges and, by default, excludes
# the left-most edge. include_lowest=True keeps a fare of exactly 0 in 'low',
# and the open-ended top edge keeps any fare above 250 in 'very high', so no
# fare is left uncategorised.
bins = [0, 50, 100, 150, float('inf')]
labels = ['low', 'normal', 'high', 'very high']
df['fareName'] = pd.cut(df['fare'], bins=bins, labels=labels, include_lowest=True)
df['fareName'].value_counts()  # checking values of new column created