## import libraries
import pandas as pd

In [8]:
# Read the cleaned Titanic passenger CSV from disk into a DataFrame
dataset = pd.read_csv(filepath_or_buffer="Datasets/Cleaned_dataset_titanic.csv")

In [10]:
# Preview the first five rows (head() defaults to n=5) to check columns and value formats
dataset.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,age_group
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Adult


In [52]:
# Survival Rate by Gender
# Within each sex, compute the share of every `survived` value, then keep
# only the share where survived == 1.
outcome_share_by_sex = dataset.groupby('sex')['survived'].value_counts(normalize=True)
outcome_share_by_sex.loc[:, 1]

sex
female    0.742038
male      0.188908
Name: proportion, dtype: float64

In [26]:
# Mean of `survived` per sex, returned as a flat two-column table
# (as_index=False keeps `sex` as a regular column instead of the index).
survival_by_sex = dataset.groupby('sex', as_index=False)['survived'].mean()

print(survival_by_sex)


      sex  survived
0  female  0.742038
1    male  0.188908


In [38]:
# Mean of `survived` per sex, displayed as a Series indexed by sex
survived_grouped_by_sex = dataset.groupby('sex')['survived']
survived_grouped_by_sex.mean()

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [54]:
# Share of each `survived` value within a sex, pivoted so that every
# `survived` value becomes its own column.
outcome_shares = dataset.groupby('sex')['survived'].value_counts(normalize=True)
survival_rate = outcome_shares.unstack()

# Column 1 holds the share of passengers with survived == 1
survival_rate[1]

sex
female    0.742038
male      0.188908
Name: 1, dtype: float64

In [62]:
# Step 3: Survival Rate by Passenger Class

# Mean of `survived` per passenger class, listed from the highest rate down
class_survival = dataset.groupby('pclass', as_index=False)['survived'].mean()
survival_by_class = class_survival.sort_values('survived', ascending=False)

print(survival_by_class)

   pclass  survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [64]:
# Step 4: Average Fare by Class

# Mean fare within each passenger class, as a flat pclass/fare table
avg_fare = dataset.groupby('pclass', as_index=False)['fare'].mean()

print(avg_fare)


   pclass       fare
0       1  84.154687
1       2  20.662183
2       3  13.675550


In [66]:
# Step 5: Multiple Aggregations with agg()

# Question: What's the age distribution per class?

# One row per passenger class with the count, mean, minimum and maximum
# of the age column.
age_by_class = dataset.groupby('pclass')['age']
age_stats = age_by_class.agg(['count', 'mean', 'min', 'max']).reset_index()

print(age_stats)


   pclass  count       mean   min   max
0       1    216  37.048118  0.92  80.0
1       2    184  29.866958  0.67  70.0
2       3    491  26.403259  0.42  74.0


In [68]:
# Step 6: Create a New Feature (Age Group)

def age_group(age):
    """Bucket a passenger's age into a coarse age-group label.

    Parameters
    ----------
    age : int or float or None
        The passenger's age. May be missing (None, NaN or pd.NA).

    Returns
    -------
    str or None
        'Child' for ages below 18, 'Adult' for ages from 18 up to and
        including 60, 'Senior' for ages above 60, and None when the age
        is missing.
    """
    # A NaN age compares False against both bounds, so without this guard a
    # passenger with an unknown age would fall through to the last branch and
    # be labelled 'Senior'. Returning None keeps missing ages missing.
    if pd.isna(age):
        return None
    if age < 18:
        return 'Child'
    if age <= 60:
        return 'Adult'
    return 'Senior'

# Label every passenger by running age_group on each value of the age column;
# this writes (or overwrites) the `age_group` column on `dataset`.
dataset['age_group'] = dataset['age'].map(age_group)


In [88]:
# Step 7: Survival Rate by Age Group

# Mean of `survived` within each age_group label, as a flat table
Survival_Rate = dataset.groupby('age_group', as_index=False)['survived'].mean()

print(Survival_Rate)

  age_group  survived
0     Adult  0.365079
1     Child  0.539823
2    Senior  0.227273


In [80]:
# Step 8: Sort to Find Extremes

# Top 5 highest fares paid:

# Keep only the name and fare columns, order by fare from highest to lowest,
# and show the first five rows.
name_and_fare = dataset[['name', 'fare']]
name_and_fare.sort_values('fare', ascending=False).head(5)


Unnamed: 0,name,fare
258,"Ward, Miss. Anna",512.3292
737,"Lesurer, Mr. Gustave J",512.3292
679,"Cardeza, Mr. Thomas Drake Martinez",512.3292
88,"Fortune, Miss. Mabel Helen",263.0
27,"Fortune, Mr. Charles Alexander",263.0


In [82]:
# 🧪 Mini Challenges (Optional)

# Survival rate by embarked

# Average fare by sex

# Passenger count by age_group

In [84]:
# Survival rate by embarked

# Mean of `survived` for each value of the embarked column, as a flat table
Survival_Rate_embarked = dataset.groupby('embarked', as_index=False)['survived'].mean()

print(Survival_Rate_embarked)

  embarked  survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


In [86]:
# Average fare by sex

# Mean fare per sex; reset_index turns the sex index back into a column
fare_by_sex = dataset.groupby('sex')['fare']
average_fare = fare_by_sex.mean().reset_index()

print(average_fare)

      sex       fare
0  female  44.479818
1    male  25.523893


In [100]:
# Passenger count by age_group

# Count the rows carrying each age_group label, then flatten the result
# into a two-column table.
age_group_counts = dataset['age_group'].value_counts()
passenger_count = age_group_counts.reset_index()

print(passenger_count)

  age_group  count
0     Adult    756
1     Child    113
2    Senior     22
