Advanced Operations with Pandas

Examples

2.1. Sorting and Subsetting

In [None]:
import pandas as pd
# Build a small sample DataFrame from a dictionary.

# Each key is a column name; each value is the list of entries for that column.
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[24, 19, 22, 25],
    Score=[88, 92, 85, 95],
)

# pd.DataFrame turns every key into a column and every list position into a row.
df = pd.DataFrame(data)

# head() returns at most the first 5 rows (all 4 rows here).
print(df.head(5))


      Name  Age  Score
0    Alice   24     88
1      Bob   19     92
2  Charlie   22     85
3    David   25     95


In [None]:
# Example 1: sort rows by a column with sort_values()

# Order the rows by the values in 'Age', smallest first.
sorted_by_age = df.sort_values('Age', ascending=True)
print(sorted_by_age)

      Name  Age  Score
1      Bob   19     92
2  Charlie   22     85
0    Alice   24     88
3    David   25     95


In [None]:
# Example 2: sort rows by their index labels with sort_index()

# Orders the rows by index label (0, 1, 2, 3), smallest first.
sorted_by_index = df.sort_index(ascending=True)
print(sorted_by_index.head())

      Name  Age  Score
0    Alice   24     88
1      Bob   19     92
2  Charlie   22     85
3    David   25     95


2. Subsetting - Indices

In [None]:
# iloc[]: position-based selection -> iloc[row_start:row_end, col_start:col_end]
# End positions are exclusive, so this takes row positions 1-2 and column positions 0-1.
row_positions = slice(1, 3)
col_positions = slice(0, 2)
subset_iloc = df.iloc[row_positions, col_positions]
print(subset_iloc)

      Name  Age
1      Bob   19
2  Charlie   22


In [None]:
# loc[]: label-based selection -> loc[row condition, column names]
# Keep the rows whose Age is above 20, and only the Name and Score columns.
older_than_20 = df['Age'] > 20
subset_loc = df.loc[older_than_20, ['Name', "Score"]]
print(subset_loc)

      Name  Score
0    Alice     88
2  Charlie     85
3    David     95


In [None]:
# []: select columns directly by name
# A list of names inside the brackets selects several columns at once.
wanted_columns = ['Name', 'Age']
subset_brackets = df[wanted_columns]
print(subset_brackets)

      Name  Age
0    Alice   24
1      Bob   19
2  Charlie   22
3    David   25


3. Subsetting by Values - Columns

In [None]:
# Select a single column
# A single column name (not a list) returns a Series, i.e. 1-D data.
column_label = 'Name'
name_column = df[column_label]
print(name_column)

0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object


In [None]:
# Select several columns
# Passing a list of column names returns a DataFrame rather than a Series.
column_labels = ['Name', 'Age']
name_and_age = df[column_labels]
print(name_and_age)

      Name  Age
0    Alice   24
1      Bob   19
2  Charlie   22
3    David   25


4. Subsetting by Rows (Filtering)

In [None]:
# Filter rows on one condition
# The boolean mask is True where Age > 20; indexing with it keeps only those rows.
age_over_20 = df['Age'] > 20
filtered_single = df[age_over_20]
print(filtered_single)

      Name  Age  Score
0    Alice   24     88
2  Charlie   22     85
3    David   25     95


In [None]:
# Filter rows on several conditions
# Each comparison yields a boolean mask; & keeps the rows where both masks are True.
age_mask = df['Age'] > 20
score_mask = df['Score'] > 85
filtered_multiple = df[age_mask & score_mask]
print(filtered_multiple)

    Name  Age  Score
0  Alice   24     88
3  David   25     95


5. Filtering on Categorical Values

In [None]:
import pandas as pd

# Review text shared by both examples, so the two frames are guaranteed to hold
# identical data (previously one literal was missing its trailing period).
reviews = {
    'Bob': ['I liked it.', 'It was awful.'],
    'Sue': ['Pretty good.', 'Bland.']
}

# Style 1 — Create a DataFrame directly from a dictionary (rows get the default 0, 1, ... index)
df1 = pd.DataFrame(reviews)
print(df1)

# Style 2 — Same data, but with custom row labels:
# index=[...] assigns the row names manually
df2 = pd.DataFrame(reviews, index=['Product A', 'Product B'])
print(df2)


            Bob           Sue
0   I liked it.  Pretty good.
1  It was awful        Bland.
                     Bob           Sue
Product A    I liked it.  Pretty good.
Product B  It was awful.        Bland.


3.1 Warm-Up Exercises (Titanic Dataset)

In [2]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


SECTION 1 - Load Dataset + View Info

In [3]:
# Section 1: Load the dataset and view its structure
# (relies on pandas already being imported as `pd` by an earlier cell)

titanic = pd.read_csv("/content/drive/My Drive/Colab Notebooks/AI Worksheet/Titanic-Dataset.csv")
# info() prints the summary itself and returns None, so wrapping it in print()
# would append a stray "None" line after the summary.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [None]:
#Example code

Exercise Problem

In [6]:
# Load required library
import pandas as pd

# Location of the Titanic CSV on the mounted Google Drive
titanic_csv = "/content/drive/My Drive/Colab Notebooks/AI Worksheet/Titanic-Dataset.csv"

# Read the CSV into the working DataFrame used by the exercise cells
df = pd.read_csv(titanic_csv)

# Print each column's dtype and non-null count (the gaps reveal missing values)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Problem 1 - Sorting

In [7]:
# Task 1: Select only the Fare column
# The one-element list keeps the result a DataFrame instead of a Series.
fare_columns = ['Fare']
fare = df[fare_columns]
fare.head(5)


Unnamed: 0,Fare
0,7.25
1,71.2833
2,7.925
3,53.1
4,8.05


In [8]:
# Task 2: Select the Pclass and Age columns
class_age_columns = ['Pclass', 'Age']
class_age = df[class_age_columns]
class_age.head(5)


Unnamed: 0,Pclass,Age
0,3,22.0
1,1,38.0
2,3,26.0
3,1,35.0
4,3,35.0


In [9]:
# Task 3: Select the Survived and Sex columns
survived_gender_columns = ['Survived', 'Sex']
survived_gender = df[survived_gender_columns]
survived_gender.head(5)


Unnamed: 0,Survived,Sex
0,0,male
1,1,female
2,1,female
3,1,female
4,0,male


Problem 2 - Subsetting


In [10]:
# Task 1: Keep only the rows whose Fare is greater than 100
paid_over_100 = df['Fare'] > 100
fare_gt_100 = df[paid_over_100]
fare_gt_100


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S
118,119,0,1,"Baxter, Mr. Quigg Edmond",male,24.0,0,1,PC 17558,247.5208,B58 B60,C
195,196,1,1,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,B80,C
215,216,1,1,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.275,D36,C
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C
268,269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58.0,0,1,PC 17582,153.4625,C125,S
269,270,1,1,"Bissette, Miss. Amelia",female,35.0,0,0,PC 17760,135.6333,C99,S
297,298,0,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S


In [11]:
# Task 2: Keep only the first-class passengers (Pclass equal to 1)
in_first_class = df['Pclass'] == 1
first_class = df[in_first_class]
first_class


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [12]:
# Task 3: Keep female passengers younger than 18
# Both masks must be True for a row to be kept.
is_under_18 = df['Age'] < 18
is_female = df['Sex'] == "female"
female_under_18 = df[is_under_18 & is_female]
female_under_18


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
22,23,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.075,,S
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,1,2,SC/Paris 2123,41.5792,,C
58,59,1,2,"West, Miss. Constance Mirium",female,5.0,1,2,C.A. 34651,27.75,,S
68,69,1,3,"Andersson, Miss. Erna Alexandra",female,17.0,4,2,3101281,7.925,,S
71,72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S


In [13]:
# Task 4: Keep passengers who embarked at Cherbourg (C) or Southampton (S)
# isin() is True for every row whose Embarked value appears in the list.
wanted_ports = ["C", "S"]
embarked_c_or_s = df[df['Embarked'].isin(wanted_ports)]
embarked_c_or_s


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


In [14]:
# Task 5: Keep passengers travelling in class 1 or class 2
wanted_classes = [1, 2]
first_second_class = df[df['Pclass'].isin(wanted_classes)]
first_second_class


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


EDA - Practice Exercise 1

In [23]:
# Fill in the missing Age values

# Take the median of the known ages, then use it for every missing entry.
# This overwrites df['Age'], so every cell run afterwards sees the filled-in ages.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)
df['Age']


Unnamed: 0,Age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
...,...
886,27.0
887,19.0
888,28.0
889,26.0


In [24]:
# Step 1: Add a fare_per_year column = Fare divided by Age (element-wise, row by row)
fares = df['Fare']
ages = df['Age']
df['fare_per_year'] = fares / ages
df['fare_per_year']

Unnamed: 0,fare_per_year
0,0.329545
1,1.875876
2,0.304808
3,1.517143
4,0.230000
...,...
886,0.481481
887,1.578947
888,0.837500
889,1.153846


In [25]:
# Step 2: Keep the rows whose fare per year of age is greater than 5
above_five = df['fare_per_year'] > 5
high_fare_age = df[above_five]
high_fare_age


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fare_per_year
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.00,3,1,349909,21.0750,,S,10.537500
16,17,0,3,"Rice, Master. Eugene",male,2.00,4,1,382652,29.1250,,Q,14.562500
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.00,3,2,19950,263.0000,C23 C25 C27,S,13.842105
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,28.00,1,0,PC 17569,146.5208,B78,C,5.232886
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.00,1,2,SC/Paris 2123,41.5792,,C,13.859733
...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.00,4,2,347082,31.2750,,S,5.212500
824,825,0,3,"Panula, Master. Urho Abraham",male,2.00,4,1,3101295,39.6875,,S,19.843750
827,828,1,2,"Mallet, Master. Andre",male,1.00,0,2,S.C./PARIS 2079,37.0042,,C,37.004200
831,832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.7500,,S,22.590361


In [26]:
# Step 3: Order the filtered rows from highest to lowest fare_per_year
high_fare_age_srt = high_fare_age.sort_values('fare_per_year', ascending=False)
high_fare_age_srt


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fare_per_year
305,306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.5500,C22 C26,S,164.728261
297,298,0,1,"Allison, Miss. Helen Loraine",female,2.00,1,2,113781,151.5500,C22 C26,S,75.775000
386,387,0,3,"Goodwin, Master. Sidney Leonard",male,1.00,5,2,CA 2144,46.9000,,S,46.900000
164,165,0,3,"Panula, Master. Eino Viljami",male,1.00,4,1,3101295,39.6875,,S,39.687500
183,184,1,2,"Becker, Master. Richard F",male,1.00,2,1,230136,39.0000,F4,S,39.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,349,1,3,"Coutts, Master. William Loch ""William""",male,3.00,1,1,C.A. 37671,15.9000,,S,5.300000
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,28.00,1,0,PC 17569,146.5208,B78,C,5.232886
205,206,0,3,"Strom, Miss. Telma Matilda",female,2.00,0,1,347054,10.4625,G6,S,5.231250
813,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.00,4,2,347082,31.2750,,S,5.212500


In [22]:
# Step 4: Show only Name and fare_per_year for the top rows of the sorted frame
report_columns = ['Name', 'fare_per_year']
result = high_fare_age_srt[report_columns]
result.head(5)


Unnamed: 0,Name,fare_per_year
305,"Allison, Master. Hudson Trevor",164.728261
297,"Allison, Miss. Helen Loraine",75.775
386,"Goodwin, Master. Sidney Leonard",46.9
164,"Panula, Master. Eino Viljami",39.6875
183,"Becker, Master. Richard F",39.0


Q2 — Adult Male Who Paid Highest Fare Relative to Class

In [27]:
# Step 1: Add a fare_per_class column = Fare divided by Pclass (element-wise)
ticket_fares = df['Fare']
ticket_classes = df['Pclass']
df['fare_per_class'] = ticket_fares / ticket_classes
df['fare_per_class']

Unnamed: 0,fare_per_class
0,2.416667
1,71.283300
2,2.641667
3,53.100000
4,2.683333
...,...
886,6.500000
887,30.000000
888,7.816667
889,30.000000


In [28]:
# Step 2: Select adult males (Age >= 18 and Sex == "male")
# NOTE: if Age was median-filled by an earlier cell, passengers whose age was
# originally missing are judged by that filled-in value here.
is_adult = df['Age'] >= 18
is_male = df['Sex'] == "male"
adult_males = df[is_adult & is_male]
adult_males

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fare_per_year,fare_per_class
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0.329545,2.416667
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0.230000,2.683333
5,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,,Q,0.302082,2.819433
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0.960417,51.862500
12,13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.0500,,S,0.402500,2.683333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S,0.375000,5.250000
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,0.282000,2.350000
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0.481481,6.500000
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1.153846,30.000000


In [29]:
# Step 3: Order adult males from highest to lowest fare_per_class
adult_males_srt = adult_males.sort_values('fare_per_class', ascending=False)
adult_males_srt

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fare_per_year,fare_per_class
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C,14.637977,512.3292
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C,14.231367,512.3292
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0000,C23 C25 C27,S,13.842105,263.0000
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0000,C23 C25 C27,S,4.109375,263.0000
118,119,0,1,"Baxter, Mr. Quigg Edmond",male,24.0,0,1,PC 17558,247.5208,B58 B60,C,10.313367,247.5208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0000,,S,0.000000,0.0000
732,733,0,2,"Knight, Mr. Robert J",male,28.0,0,0,239855,0.0000,,S,0.000000,0.0000
822,823,0,1,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0000,,S,0.000000,0.0000
806,807,0,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0000,A36,S,0.000000,0.0000


In [32]:
# Step 4: Show only Name, Age and fare_per_class for the top rows of the sorted frame
summary_columns = ['Name', 'Age', 'fare_per_class']
result = adult_males_srt[summary_columns]
result.head(5)


Unnamed: 0,Name,Age,fare_per_class
737,"Lesurer, Mr. Gustave J",35.0,512.3292
679,"Cardeza, Mr. Thomas Drake Martinez",36.0,512.3292
27,"Fortune, Mr. Charles Alexander",19.0,263.0
438,"Fortune, Mr. Mark",64.0,263.0
118,"Baxter, Mr. Quigg Edmond",24.0,247.5208


EDA – Group-By Practice


Q1 – % Fare Revenue from Each Class

In [33]:
# Step 1: Total fare collected across all passengers
all_fares = df['Fare']
total_fare = all_fares.sum()
total_fare


np.float64(28693.9493)

In [34]:
# Step 2-4: Fare totals per class
# One pass per class (1, 2, 3): keep that class's rows, then add up their Fare values.
fare_list = [df[df['Pclass'] == pclass]['Fare'].sum() for pclass in (1, 2, 3)]

# Individual totals, in the same class order as the list.
fare_class_1, fare_class_2, fare_class_3 = fare_list
fare_list


[np.float64(18177.4125), np.float64(3801.8417), np.float64(6714.6951)]

In [35]:
# Step 5: Express each class total as a percentage of the overall fare total
fare_percentage = list(map(lambda class_fare: (class_fare / total_fare) * 100, fare_list))
fare_percentage


[np.float64(63.349287718996564),
 np.float64(13.24962855496507),
 np.float64(23.401083726038365)]

Q2 – % of Passengers per Age Group

In [38]:
# Step 1: Create age groups
def categorize_age(age):
    """Return the age-group label for a single age value.

    Args:
        age: Age in years (a number); may be missing (None / NaN / pd.NA).

    Returns:
        "child" for ages below 18, "adult" for ages from 18 up to (but not
        including) 65, "senior" for 65 and above, and "unknown" when the
        age is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # age would fall through both checks and be mislabelled "senior".
    if pd.isna(age):
        return "unknown"
    if age < 18:
        return "child"
    elif age < 65:
        return "adult"
    else:
        return "senior"

# Label every passenger by running categorize_age on each Age value.
age_labels = df['Age'].apply(categorize_age)
df['age_group'] = age_labels
df['age_group']


Unnamed: 0,age_group
0,adult
1,adult
2,adult
3,adult
4,adult
...,...
886,adult
887,adult
888,adult
889,adult


In [39]:
# Step 2: Total number of passengers (one row per passenger)
total_passengers = len(df.index)
total_passengers


891

In [40]:
# Step 3: Number of passengers in each age group
age_groups = df['age_group']
age_group_counts = age_groups.value_counts()
age_group_counts


Unnamed: 0_level_0,count
age_group,Unnamed: 1_level_1
adult,767
child,113
senior,11


In [41]:
# Step 4: Convert the counts to percentages of all passengers
share_of_passengers = age_group_counts / total_passengers
age_group_percent = share_of_passengers * 100
age_group_percent


Unnamed: 0_level_0,count
age_group,Unnamed: 1_level_1
adult,86.083053
child,12.682379
senior,1.234568
