### Groups of Functions in Pandas for Data Analysis

#### A. Creating Series and DataFrames


In [None]:
# Let's create a pandas Series using a python list 

# Step 1: Import pandas package
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Set up views
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


# Step 2: Define a list
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Step 3: Create the series
series = pd.Series(data)

# lrt's view the series we created
series.head(10)


0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

In [13]:
# let's confirm to be sure we created a pandas series
type(series)

pandas.core.series.Series

In [None]:
# Let's create a series using same list, but now we will be adding our own series numbering, in python or pandas
series2 = pd.Series(data, index = ["a","b","c","d","e","f","g","h","i","j"])
series2.head(10)

a     1
b     2
c     3
d     4
e     5
f     6
g     7
h     8
i     9
j    10
dtype: int64

In [42]:
# Let's create a series using python dictionary

#lets create a python dictionary
data2 = {'a': 10, 'b' : 20, 'c': 30}

#lets create the series
series3 = pd.Series(data2)
series3.head()


a    10
b    20
c    30
dtype: int64

### Hands on practice:

In [25]:
#Question 1

#  Create a bucket list
bucket_list = ['bag', 'jewellery', 'ring', 'car', 'shoes', 'phone']

#Convert the list to series with alphabets indexing
bucket_series = pd.Series(bucket_list, index=["a", "b", "c", "d", "e", "f"])

#print
bucket_series.head(6)




a          bag
b    jewellery
c         ring
d          car
e        shoes
f        phone
dtype: object

In [27]:

## Question 2
# Biodata information  in dictionary
bio_data = {
    'Name' : "Olawale",
    'Age' : 16,
    'Level' : "500L",
    'Track' : 'AI Engineer',
    'State of Origin' : 'Osun'
    }

# Convert to pandas series
bio_data_series = pd.Series(bio_data)

bio_data_series.head()

Name                   Olawale
Age                         16
Level                     500L
Track              AI Engineer
State of Origin           Osun
dtype: object

## Creating a DataFrame

In [67]:
# Lets create a dataFrame

# Step 1: import pandas
import pandas as pd

# Define the data using dictionary that is having its value as list

data = {
    'Name': ['Chris', 'Ayo', 'Chisom'],
    'Age': [26, 24, 22],
    'Home_Town': ['Benin', 'Ibadan', 'Enugu']
}

# Let's create a DtaFrame using "df" as short for DataFrame
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Age,Home_Town
0,Chris,26,Benin
1,Ayo,24,Ibadan
2,Chisom,22,Enugu


In [30]:
# lets do the same thing by using list of dictionaries

data2 = [
    {'Name': 'Chris', 'Age': 26, 'Home_Town': 'Benin'},
    {'Name': 'Ayo', 'Age' : 24, 'Home_Town': 'Ibadan'},
    {'Name': 'Chisom', 'Age': 22, 'Home_Town': 'Enugu'}
]

# Let's define the dataframe
df2 = pd.DataFrame(data2)
df2.head()

Unnamed: 0,Name,Age,Home_Town
0,Chris,26,Benin
1,Ayo,24,Ibadan
2,Chisom,22,Enugu


In [None]:
# Lets do same thing again using list of list

data3 = [
    ['Chris', 26, 'Benin'],
    ['Ayo', 24, 'Ibadan'],
    ['Chisom', 22, 'Enugu']
]
df3 = pd.DataFrame(data3, columns=['Name', 'Age', 'Home_Town'])

df3.head()

Unnamed: 0,Name,Age,Home_Town
0,Chris,26,Benin
1,Ayo,24,Ibadan
2,Chisom,22,


In [None]:
# lets print the types to be sure we have defined dataframes
print(type(df))
print(type(df2))
print(type(df3))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


## B. Data Input and Output

In [12]:
# # # Lets get to work...
import pandas as pd
biodata = pd.read_csv('Untitled_form.csv')
biodata.head()


# # Ensure to code along...

Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,Mac OS,non
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking


## C.  Data Inspection and Exploration

In [None]:
'''
.head() # To view the first 5 rows
```

```c
.tail() # To view the last 5 rows
```

```c
.info() # To check the information about the data
```

```c
.describe() # statistical summary
```

```c
.shape # Check the dimension of the dataset
```

```c
.columns # for checking the column names
'''

## D. Data Cleaning

In [73]:
# Do we have any missing values? if yes,lets fill them up


df.isna()
df.dropna()

Unnamed: 0,Name,Age,Home_Town
0,Chris,26,Benin
1,Ayo,24,Ibadan
2,Chisom,22,Enugu


## E. Data Selection and Filtering

In [None]:
# bio_data_column = [irst_Name, Last_Name, City, Course_Track, PC_make, PC_Os, Feedback]

In [9]:
biodata.columns

Index(['Timestamp', 'First Name', 'Last Name', 'Course Track', 'City',
       'Gender', 'Seat Number', 'PC-Make', 'PC - OS', 'Feedback'],
      dtype='object')

##### Column Selection

In [None]:
# lets look througha single column
biodata['First Name']

# # alternatively, we can use dot
# biodata.Gender

0       Male
1     Female
2     Female
3       Male
4       Male
5       Male
6     Female
7     Female
8     Female
9       Male
10    Female
11    Female
12      Male
13      Male
14      Male
15      Male
16      Male
17      Male
18      Male
19      Male
20      Male
21      Male
22      Male
23      Male
24      Male
25      Male
26      Male
27    Female
28      Male
29      Male
30      Male
31      Male
32      Male
33      Male
Name: Gender, dtype: object

In [19]:
# LEts select multiple columns
biodata[['Gender', 'City']]

Unnamed: 0,Gender,City
0,Male,Ogun
1,Female,Abeokuta
2,Female,Lagos
3,Male,Abeokuta
4,Male,Abeokuta
5,Male,Abeokuta
6,Female,Abeokuta
7,Female,Lagos
8,Female,Nairobi
9,Male,Abeokuta


#### Cell Selection

In [26]:
# lets select a single cell

biodata['City'][1] # This will return the first value of the "City" column

# lets try other methods for selecting cells
biodata.at[0, "City"] # This will also return the first value of the "City" column


# There is still another method using .iat[]
biodata.iat[1, 3] # This will return the first value of the first column(row0,column0)

'AI'

#### Row Selection

In [28]:
# Lets select some rows
biodata.iloc[0:5] # we are selecting from index 0 to the 5th index

Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,Mac OS,non
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking


In [29]:
# combination of row and column selection
biodata.iloc[0:5, 0:3] # the first slice picks the rows and the second slice picks the columns

Unnamed: 0,Timestamp,First Name,Last Name
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi


### Hands on Practice

In [13]:
biodata.loc[0:1, 'City']
# biodata.loc[0:1]


0        Ogun
1    Abeokuta
Name: City, dtype: object

#### Conditional Filtering

In [None]:
# Filter rows where Gender is 'Female'. This is going to return dataframe
filtered_male = biodata[biodata['Gender'] == 'Male']
print("Rows where Gender is 'Male':")
filtered_male


Rows where Gender is 'Male':


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,Mac OS,non
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking
5,2025/09/11 12:58:55 PM GMT+1,Abdulmalik,Adedotun,AI,Abeokuta,Male,200,HP,Windows,Enjoying the course so far
9,2025/09/11 12:59:28 PM GMT+1,Hannah,Tanimola,AI,Abeokuta,Male,30,HP,Windows,On God
12,2025/09/11 1:00:03 PM GMT+1,Opeyemi,Odejimi,Cloud Computing,Abeokuta,Male,38,HP,Linux,Na wa
13,2025/09/11 1:00:13 PM GMT+1,Olasunkanmi,Rasak,AI,Kobape,Male,3,HP,Windows,My gratitude to the sponsor of this program an...
14,2025/09/11 1:00:27 PM GMT+1,Saheed,Olayinka,AI;Data Science;Web Dev,Abeokuta,Male,29,HP,Windows,None for now
15,2025/09/11 1:00:31 PM GMT+1,Kehinde,Akindele,Cloud Computing,Abeokuta,Male,54,Gateway,Windows,Great
16,2025/09/11 1:00:43 PM GMT+1,Oluwole,Oludayo,AI,Abeokuta,Male,09,HP,Windows,Good training to attend


In [41]:
# Filter rows where City is "abeokuta" and course_track is Ai
filtered_city = biodata[(biodata['City'] == 'Abeokuta') & (biodata['Course Track'] == 'AI')]
print("Rows where City is 'Lagos' and Course_Track is 'Data Science':")
filtered_city


Rows where City is 'Lagos' and Course_Track is 'Data Science':


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking
5,2025/09/11 12:58:55 PM GMT+1,Abdulmalik,Adedotun,AI,Abeokuta,Male,200,HP,Windows,Enjoying the course so far
6,2025/09/11 12:58:55 PM GMT+1,Naheemot,Adebiyi,AI,Abeokuta,Female,32,DELL,Windows,Grateful for the opportunity to be here.
9,2025/09/11 12:59:28 PM GMT+1,Hannah,Tanimola,AI,Abeokuta,Male,30,HP,Windows,On God
11,2025/09/11 12:59:43 PM GMT+1,Esther,Kudoro,AI,Abeokuta,Female,1,HP,Windows,Chill
16,2025/09/11 1:00:43 PM GMT+1,Oluwole,Oludayo,AI,Abeokuta,Male,9,HP,Windows,Good training to attend
18,2025/09/11 1:00:54 PM GMT+1,Ademola,Akinrinde,AI,Abeokuta,Male,100,MACBOOK,Mac OS,Awesome shit\n\n
21,2025/09/11 1:01:46 PM GMT+1,Ayuba,Raji,AI,Abeokuta,Male,26,HP,Windows,None for now


In [14]:
# Filter rows where City is either  'Lagos' or "Abeokuta"
cities = ['Abeokuta', 'Lagos']
city_filthered = biodata[biodata['City'].isin (cities)]
print("Rows where City is either 'Lagos' or 'Abeokuta':")
city_filthered

Rows where City is either 'Lagos' or 'Abeokuta':


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking
5,2025/09/11 12:58:55 PM GMT+1,Abdulmalik,Adedotun,AI,Abeokuta,Male,200,HP,Windows,Enjoying the course so far
6,2025/09/11 12:58:55 PM GMT+1,Naheemot,Adebiyi,AI,Abeokuta,Female,32,DELL,Windows,Grateful for the opportunity to be here.
7,2025/09/11 12:59:00 PM GMT+1,Kanyisola,Fagbayi,AI;Data Science,Lagos,Female,82,HP,Windows,One chin chin for you for this form
9,2025/09/11 12:59:28 PM GMT+1,Hannah,Tanimola,AI,Abeokuta,Male,30,HP,Windows,On God
10,2025/09/11 12:59:41 PM GMT+1,Deborah,Adelegan,AI;Data Science,Abeokuta,Female,1,HP,Windows,None for now
11,2025/09/11 12:59:43 PM GMT+1,Esther,Kudoro,AI,Abeokuta,Female,1,HP,Windows,Chill


#### Using the .query() method

In [19]:
# use query() to filter rows where Course Track is "AI" and Feedback is 'Excellent'
# query_filtered = biodata.query("City == 'Abeokuta' and Feedback == 'Excellent'")
# print('Rows filtered using query() method:')
# query_filtered
query_filtered = biodata.query("City == 'Abeokuta' and Feedback == 'Amazing Shit'")
print("Rows filtered using query() method:")
query_filtered

Rows filtered using query() method:


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit


In [23]:
# Filter rows where Gender is 'Female' 
female = biodata.query("Gender == 'Female'")
print("Students that are Female:")
female

Students that are Female:


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
6,2025/09/11 12:58:55 PM GMT+1,Naheemot,Adebiyi,AI,Abeokuta,Female,32,DELL,Windows,Grateful for the opportunity to be here.
7,2025/09/11 12:59:00 PM GMT+1,Kanyisola,Fagbayi,AI;Data Science,Lagos,Female,82,HP,Windows,One chin chin for you for this form
8,2025/09/11 12:59:16 PM GMT+1,Blessing,James,Cyber Security,Nairobi,Female,45678,HP,Windows,Thanks for creating the form.
10,2025/09/11 12:59:41 PM GMT+1,Deborah,Adelegan,AI;Data Science,Abeokuta,Female,1,HP,Windows,None for now
11,2025/09/11 12:59:43 PM GMT+1,Esther,Kudoro,AI,Abeokuta,Female,1,HP,Windows,Chill
27,2025/09/11 1:03:12 PM GMT+1,Adeoye,Mary,AI,abeokuta,Female,15,LENOVO,Windows,Still processing


In [31]:
# Filter rows where PC_make is either 'HP' or 'Dell'
hp_dell = biodata.query("Gender in ['Male', 'Female']")
print("Rows where PC_make is either HP or Dell:")
hp_dell

Rows where PC_make is either HP or Dell:


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,Mac OS,non
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking
5,2025/09/11 12:58:55 PM GMT+1,Abdulmalik,Adedotun,AI,Abeokuta,Male,200,HP,Windows,Enjoying the course so far
6,2025/09/11 12:58:55 PM GMT+1,Naheemot,Adebiyi,AI,Abeokuta,Female,32,DELL,Windows,Grateful for the opportunity to be here.
7,2025/09/11 12:59:00 PM GMT+1,Kanyisola,Fagbayi,AI;Data Science,Lagos,Female,00082,HP,Windows,One chin chin for you for this form
8,2025/09/11 12:59:16 PM GMT+1,Blessing,James,Cyber Security,Nairobi,Female,45678,HP,Windows,Thanks for creating the form.
9,2025/09/11 12:59:28 PM GMT+1,Hannah,Tanimola,AI,Abeokuta,Male,30,HP,Windows,On God


In [None]:
# Define a variable for the course track
# desired_track = 'Cloud Computing'

# # Use the variable in the query expression
# cloud_computing_students = biodata("Course Track == @desired_track")
# print("Students in the Cloud Computing track:")
# cloud_computing_students

TypeError: 'DataFrame' object is not callable

#### lets filter thr Feedback is not "Poor" and city is "lagos"

In [36]:
# Filter rows where Feedback is not 'Poor' and City is 'Lagos'
good_feedback_lagos = biodata.query("Feedback != 'Poor' and City == 'Lagos'")
print("Students in Lagos with Feedback other than 'Poor':")
good_feedback_lagos

Students in Lagos with Feedback other than 'Poor':


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
7,2025/09/11 12:59:00 PM GMT+1,Kanyisola,Fagbayi,AI;Data Science,Lagos,Female,82,HP,Windows,One chin chin for you for this form
17,2025/09/11 1:00:49 PM GMT+1,Samuel,Oyewusi,Web Dev,Lagos,Male,15,HP,Windows,Satisfactory


In [None]:
# #LEts create a more complex query filter for Course_Track,Feedback and Seat_No
# complex_query = biodata.query("Course_Track == 'Data Science' or (Feedback == 'Excellent' and Seat_No < 115)")
# print("Complex query result:")
# complex_query

### F. Data Transformation

Renaming column name

In [44]:
# Lets modify the column name by fixing the old names as keys and the new name as value
biodata.rename(columns={'First Name': 'FirstName', 'Last Name': 'LastName' })
biodata.rename(columns={'Course Track': 'Course_Track', 'Seat Number': 'Seat_Number', 'PC-Make' : 'PC_Make'})
biodata.rename(columns={'PC-OS' : 'PC_OS'})

Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,Mac OS,non
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking
5,2025/09/11 12:58:55 PM GMT+1,Abdulmalik,Adedotun,AI,Abeokuta,Male,200,HP,Windows,Enjoying the course so far
6,2025/09/11 12:58:55 PM GMT+1,Naheemot,Adebiyi,AI,Abeokuta,Female,32,DELL,Windows,Grateful for the opportunity to be here.
7,2025/09/11 12:59:00 PM GMT+1,Kanyisola,Fagbayi,AI;Data Science,Lagos,Female,00082,HP,Windows,One chin chin for you for this form
8,2025/09/11 12:59:16 PM GMT+1,Blessing,James,Cyber Security,Nairobi,Female,45678,HP,Windows,Thanks for creating the form.
9,2025/09/11 12:59:28 PM GMT+1,Hannah,Tanimola,AI,Abeokuta,Male,30,HP,Windows,On God


#### APpling String Methods


In [61]:
# Lets apply some of the method
biodata['City'] = biodata['City'].str.upper()
biodata['Gender'] = biodata['Gender'].str.upper()
biodata['Last Name'] = biodata['Last Name'].str.lower()
biodata.head()

# Lets define a lambda function
# lambda x: x.str.title()

# the .apply() method will help apply the function to the selected columns

# biodata[col].apply(lambda x : x.str.title())
# biodata.head()

# # We can decide to apply the lambda funtion to every element in the dataset
# biodata.applymap(lambda x: x.str.title())
# biodata.head()

Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,PETER,okonmah,AI,OGUN,MALE,28,MACBOOK,Mac OS,non
1,2025/09/11 12:56:11 PM GMT+1,TOYEEBAT,nababa,AI,ABEOKUTA,FEMALE,24,HP,Windows,excellent
2,2025/09/11 12:57:08 PM GMT+1,PERPETUAL,meninwa,AI,LAGOS,FEMALE,22,HP,Windows,thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,MAHFUZ,abdulhameed,AI,ABEOKUTA,MALE,44,HP,Windows,amazing shit
4,2025/09/11 12:58:41 PM GMT+1,DIVINE,gbadamosi,AI,ABEOKUTA,MALE,35,DELL,Windows,brain racking


### Sorting Values

In [None]:
biodata.sort_values(by= axis= 1, ascending= True, inplace= False, kind='quicksort', na_position= 'last', ignore_index=False)

TypeError: DataFrame.sort_values() missing 1 required positional argument: 'by'

In [63]:
# Sorting Columns
biodata.sort_values(by='City', ascending=True)


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
16,2025/09/11 1:00:43 PM GMT+1,OLUWOLE,oludayo,AI,ABEOKUTA,MALE,09,HP,Windows,good training to attend
31,2025/09/11 1:10:16 PM GMT+1,RIDWANULLAH,osho,AI;Cyber Security;Data Science,ABEOKUTA,MALE,45,DELL,Windows,it is what it is !!!
30,2025/09/11 1:06:48 PM GMT+1,GABRIEL,bamgbose,AI,ABEOKUTA,MALE,2,HP,Windows,good
29,2025/09/11 1:06:03 PM GMT+1,SAMUEL,dasaolu,AI,ABEOKUTA,MALE,100,HP;MACBOOK,Linux,"good so far, i guess"
28,2025/09/11 1:03:36 PM GMT+1,BABATUNDE,rahmon,AI,ABEOKUTA,MALE,20,none,Windows,i actually have a pc that has a very low stora...
27,2025/09/11 1:03:12 PM GMT+1,ADEOYE,mary,AI,ABEOKUTA,FEMALE,15,LENOVO,Windows,still processing
24,2025/09/11 1:02:49 PM GMT+1,SOLOMON,olaiya,AI,ABEOKUTA,MALE,16,MACBOOK,Mac OS,delighted learning here
23,2025/09/11 1:02:28 PM GMT+1,OLAJIDE,abioye,AI,ABEOKUTA,MALE,5,HP,Windows,"so far, i getting to familarize myself with nu..."
21,2025/09/11 1:01:46 PM GMT+1,AYUBA,raji,AI,ABEOKUTA,MALE,26,HP,Windows,none for now
18,2025/09/11 1:00:54 PM GMT+1,ADEMOLA,akinrinde,AI,ABEOKUTA,MALE,100,MACBOOK,Mac OS,awesome shit\r\n\r\n


In [68]:
# lets sort by multiple columns
biodata.sort_values(by=['City', 'PC-Make'], ascending=[True, True])

Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
4,2025/09/11 12:58:41 PM GMT+1,DIVINE,gbadamosi,AI,ABEOKUTA,MALE,35,DELL,Windows,brain racking
6,2025/09/11 12:58:55 PM GMT+1,NAHEEMOT,adebiyi,AI,ABEOKUTA,FEMALE,32,DELL,Windows,grateful for the opportunity to be here.
31,2025/09/11 1:10:16 PM GMT+1,RIDWANULLAH,osho,AI;Cyber Security;Data Science,ABEOKUTA,MALE,45,DELL,Windows,it is what it is !!!
33,2025/09/11 1:18:15 PM GMT+1,MICHAEL,osisami,AI,ABEOKUTA,MALE,12,DELL;HP,Windows,nil
15,2025/09/11 1:00:31 PM GMT+1,KEHINDE,akindele,Cloud Computing,ABEOKUTA,MALE,54,Gateway,Windows,great
1,2025/09/11 12:56:11 PM GMT+1,TOYEEBAT,nababa,AI,ABEOKUTA,FEMALE,24,HP,Windows,excellent
3,2025/09/11 12:57:56 PM GMT+1,MAHFUZ,abdulhameed,AI,ABEOKUTA,MALE,44,HP,Windows,amazing shit
5,2025/09/11 12:58:55 PM GMT+1,ABDULMALIK,adedotun,AI,ABEOKUTA,MALE,200,HP,Windows,enjoying the course so far
9,2025/09/11 12:59:28 PM GMT+1,HANNAH,tanimola,AI,ABEOKUTA,MALE,30,HP,Windows,on god
10,2025/09/11 12:59:41 PM GMT+1,DEBORAH,adelegan,AI;Data Science,ABEOKUTA,FEMALE,001,HP,Windows,none for now


In [71]:
biodata.sort_index(axis = 0, level = None, ascending = False, inplace = False, sort_remaining = True)

Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
33,2025/09/11 1:18:15 PM GMT+1,MICHAEL,osisami,AI,ABEOKUTA,MALE,12,DELL;HP,Windows,nil
32,2025/09/11 1:11:39 PM GMT+1,OLUWAPELUMI,adenuga,Web Dev,ABEOKUTA,MALE,36,HP,Windows,live yours
31,2025/09/11 1:10:16 PM GMT+1,RIDWANULLAH,osho,AI;Cyber Security;Data Science,ABEOKUTA,MALE,45,DELL,Windows,it is what it is !!!
30,2025/09/11 1:06:48 PM GMT+1,GABRIEL,bamgbose,AI,ABEOKUTA,MALE,2,HP,Windows,good
29,2025/09/11 1:06:03 PM GMT+1,SAMUEL,dasaolu,AI,ABEOKUTA,MALE,100,HP;MACBOOK,Linux,"good so far, i guess"
28,2025/09/11 1:03:36 PM GMT+1,BABATUNDE,rahmon,AI,ABEOKUTA,MALE,20,none,Windows,i actually have a pc that has a very low stora...
27,2025/09/11 1:03:12 PM GMT+1,ADEOYE,mary,AI,ABEOKUTA,FEMALE,15,LENOVO,Windows,still processing
26,2025/09/11 1:02:59 PM GMT+1,OLUWASEYI,egunjobi,Cloud Computing,CAPE TOWN,MALE,dirty,MACBOOK,Mac OS,nice to meet you all
25,2025/09/11 1:02:56 PM GMT+1,OLUWADAMILARE,bello,AI,SANGO,MALE,373,DELL;HP;LENOVO;MACBOOK;ASUS;,Mac OS,coding is interesting when you understand
24,2025/09/11 1:02:49 PM GMT+1,SOLOMON,olaiya,AI,ABEOKUTA,MALE,16,MACBOOK,Mac OS,delighted learning here


### Grouping and Aggregation

In [None]:
#  Lets manually create a bio_data sample data
bio = {
    'First_Name': ['Emeka', 'Aisha', 'Ayo', 'Chinedu', 'Fatima', 'Ibrahim', 'Ngozi', 'Tolu', 'Olamide', 'Yusuf',
                   'Ada', 'Kunle', 'Mercy', 'Segun', 'Zainab', 'Donald', 'Kemi', 'Usman', 'Funmi', 'Chika'],
    'Last_Name': ['Julius', 'Bello', 'Adewale', 'Godswill', 'Abubakar', 'David', 'Collins', 'Ogunleye', 'Adepoju', 'Garba',
                  'Umeh', 'Ojo', 'Musa', 'Balogun', 'Mohammed', 'Obi', 'Adebayo', 'Suleiman', 'Williams', 'Micheal'],
    'Gender': ['Male', 'Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male',
               'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male'],
    'Seat_No': range(101, 121),
    'City': ['Lagos', 'Abuja', 'Ibadan', 'Enugu', 'Kano', 'Benin', 'Port Harcourt', 'Abeokuta', 'Benin', 'Abeokuta',
             'Lagos', 'Abeokuta', 'Lagos', 'Ibadan', 'Abuja', 'Port Harcourt', 'Benin', 'Jos', 'Calabar', 'Onitsha'],
    'Course_Track': ['Data Science', 'Cloud Computing', 'Cybersecurity', 'AI', 'Data Science', 'Cloud Computing',
                     'Web Dev', 'AI', 'Cybersecurity', 'AI', 'Data Science', 'Web Dev',
                     'Cybersecurity', 'AI', 'Cloud Computing', 'Data Science', 'Web Dev', 'Data Science',
                     'Data Science', 'Cloud Computing'],
    'PC_make': ['HP', 'Dell', 'HP', 'Asus', 'Apple', 'HP', 'Dell', 'Lenovo', 'Asus', 'Apple',
                'HP', 'Dell', 'Lenovo', 'Asus', 'Dell', 'HP', 'Dell', 'Lenovo', 'Asus', 'Apple'],
    'PC_Os': ['Windows', 'Linux', 'Windows', 'Windows', 'Linux', 'MacOS', 'Windows', 'Linux', 'MacOS', 'Windows',
              'Linux', 'MacOS', 'Windows', 'Linux', 'MacOS', 'Windows', 'Linux', 'MacOS', 'Windows', 'Linux'],
    'Feedback': ['Good', 'Excellent', 'Excellent', 'Good', 'Poor', 'Excellent', 'Good', 'Average', 'Good', 'Excellent',
                 'Good', 'Poor', 'Average', 'Excellent', 'Good', 'Average', 'Excellent', 'Good', 'Good', 'Excellent']
}

# # Lets convert it to dataframe first
# bio_data2 = pd.DataFrame(bio)
# bio_data2

Unnamed: 0,First_Name,Last_Name,Gender,Seat_No,City,Course_Track,PC_make,PC_Os,Feedback
0,Emeka,Julius,Male,101,Lagos,Data Science,HP,Windows,Good
1,Aisha,Bello,Female,102,Abuja,Cloud Computing,Dell,Linux,Excellent
2,Ayo,Adewale,Male,103,Ibadan,Cybersecurity,HP,Windows,Excellent
3,Chinedu,Godswill,Male,104,Enugu,AI,Asus,Windows,Good
4,Fatima,Abubakar,Female,105,Kano,Data Science,Apple,Linux,Poor
5,Ibrahim,David,Male,106,Benin,Cloud Computing,HP,MacOS,Excellent
6,Ngozi,Collins,Female,107,Port Harcourt,Web Dev,Dell,Windows,Good
7,Tolu,Ogunleye,Male,108,Abeokuta,AI,Lenovo,Linux,Average
8,Olamide,Adepoju,Male,109,Benin,Cybersecurity,Asus,MacOS,Good
9,Yusuf,Garba,Male,110,Abeokuta,AI,Apple,Windows,Excellent


In [75]:
# Lets convert it to dataframe first
bio_data2 = pd.DataFrame(bio)
bio_data2

Unnamed: 0,First_Name,Last_Name,Gender,Seat_No,City,Course_Track,PC_make,PC_Os,Feedback
0,Emeka,Julius,Male,101,Lagos,Data Science,HP,Windows,Good
1,Aisha,Bello,Female,102,Abuja,Cloud Computing,Dell,Linux,Excellent
2,Ayo,Adewale,Male,103,Ibadan,Cybersecurity,HP,Windows,Excellent
3,Chinedu,Godswill,Male,104,Enugu,AI,Asus,Windows,Good
4,Fatima,Abubakar,Female,105,Kano,Data Science,Apple,Linux,Poor
5,Ibrahim,David,Male,106,Benin,Cloud Computing,HP,MacOS,Excellent
6,Ngozi,Collins,Female,107,Port Harcourt,Web Dev,Dell,Windows,Good
7,Tolu,Ogunleye,Male,108,Abeokuta,AI,Lenovo,Linux,Average
8,Olamide,Adepoju,Male,109,Benin,Cybersecurity,Asus,MacOS,Good
9,Yusuf,Garba,Male,110,Abeokuta,AI,Apple,Windows,Excellent


In [76]:
bio_data2.head()

Unnamed: 0,First_Name,Last_Name,Gender,Seat_No,City,Course_Track,PC_make,PC_Os,Feedback
0,Emeka,Julius,Male,101,Lagos,Data Science,HP,Windows,Good
1,Aisha,Bello,Female,102,Abuja,Cloud Computing,Dell,Linux,Excellent
2,Ayo,Adewale,Male,103,Ibadan,Cybersecurity,HP,Windows,Excellent
3,Chinedu,Godswill,Male,104,Enugu,AI,Asus,Windows,Good
4,Fatima,Abubakar,Female,105,Kano,Data Science,Apple,Linux,Poor


In [77]:
# Lets save it as a CSV file
bio_data2.to_csv("bio_data2.csv", index = False)

In [79]:
bio_data2["Course_Track"].unique()

array(['Data Science', 'Cloud Computing', 'Cybersecurity', 'AI',
       'Web Dev'], dtype=object)

In [80]:
bio_data2["Course_Track"].nunique()

5

In [81]:
bio_data2["First_Name"].nunique()

20

In [82]:
bio_data2["First_Name"].unique()

array(['Emeka', 'Aisha', 'Ayo', 'Chinedu', 'Fatima', 'Ibrahim', 'Ngozi',
       'Tolu', 'Olamide', 'Yusuf', 'Ada', 'Kunle', 'Mercy', 'Segun',
       'Zainab', 'Donald', 'Kemi', 'Usman', 'Funmi', 'Chika'],
      dtype=object)

In [None]:
# What is the total numbers of students taking each track?

track_count = bio_data2.groupby("Course_Track").agg({"First_Name" : "count"})
track_count

Unnamed: 0_level_0,Course_Track
Course_Track,Unnamed: 1_level_1
AI,4
Cloud Computing,4
Cybersecurity,3
Data Science,6
Web Dev,3


In [90]:
# What is the total number of students having the same numbers of PC_make
bio_data2.groupby("PC_make")["PC_make"].count()


PC_make
Apple     3
Asus      4
Dell      5
HP        5
Lenovo    3
Name: PC_make, dtype: int64

In [91]:
bio_data2.groupby('Course_Track').agg({'City' : 'sum'})

Unnamed: 0_level_0,City
Course_Track,Unnamed: 1_level_1
AI,EnuguAbeokutaAbeokutaIbadan
Cloud Computing,AbujaBeninAbujaOnitsha
Cybersecurity,IbadanBeninLagos
Data Science,LagosKanoLagosPort HarcourtJosCalabar
Web Dev,Port HarcourtAbeokutaBenin


In [92]:
# What types of OS do students in each track use?
bio_data2.groupby('Course_Track').agg({'PC_Os': 'sum'})

Unnamed: 0_level_0,PC_Os
Course_Track,Unnamed: 1_level_1
AI,WindowsLinuxWindowsLinux
Cloud Computing,LinuxMacOSMacOSLinux
Cybersecurity,WindowsMacOSWindows
Data Science,WindowsLinuxLinuxWindowsMacOSWindows
Web Dev,WindowsMacOSLinux


In [93]:
# What is the most common course among the female students?
female_group = bio_data2.groupby('Gender').get_group('Female')
female_group

Unnamed: 0,First_Name,Last_Name,Gender,Seat_No,City,Course_Track,PC_make,PC_Os,Feedback
1,Aisha,Bello,Female,102,Abuja,Cloud Computing,Dell,Linux,Excellent
4,Fatima,Abubakar,Female,105,Kano,Data Science,Apple,Linux,Poor
6,Ngozi,Collins,Female,107,Port Harcourt,Web Dev,Dell,Windows,Good
10,Ada,Umeh,Female,111,Lagos,Data Science,HP,Linux,Good
12,Mercy,Musa,Female,113,Lagos,Cybersecurity,Lenovo,Windows,Average
14,Zainab,Mohammed,Female,115,Abuja,Cloud Computing,Dell,MacOS,Good
16,Kemi,Adebayo,Female,117,Benin,Web Dev,Dell,Linux,Excellent
18,Funmi,Williams,Female,119,Calabar,Data Science,Asus,Windows,Good


In [94]:
female_group['Course_Track'].value_counts()

Course_Track
Data Science       3
Cloud Computing    2
Web Dev            2
Cybersecurity      1
Name: count, dtype: int64

In [95]:
gender_size= bio_data2.groupby("Gender").size()
gender_size

Gender
Female     8
Male      12
dtype: int64

In [96]:
gender_size= bio_data2.groupby("Course_Track").size()
gender_size

Course_Track
AI                 4
Cloud Computing    4
Cybersecurity      3
Data Science       6
Web Dev            3
dtype: int64

In [97]:
bio_data2['Course_Track'].value_counts()

Course_Track
Data Science       6
Cloud Computing    4
AI                 4
Cybersecurity      3
Web Dev            3
Name: count, dtype: int64

In [None]:
# This will help you to search and return the index of the specified group member

by_city = bio_data2.groupby("City")
by_city.groups["Lagos"]
#or
# bio_data2.groupby('City').groups['Abeokuta']

Index([7, 9, 11], dtype='int64')

### H. Data Reshaping

In [102]:
# Note that is just an illustration of what is possible. It does not make sense to take the mean of Seat_No

pivot_table = pd.pivot_table(bio_data2,
                             index='Gender',
                             columns='Course_Track',
                             values='Seat_No',
                             aggfunc='mean')
print("Pivot Table of Average Seat_No by Gender and Course_Track:")
pivot_table

Pivot Table of Average Seat_No by Gender and Course_Track:


Course_Track,AI,Cloud Computing,Cybersecurity,Data Science,Web Dev
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,,108.5,113.0,111.666667,112.0
Male,109.0,113.0,106.0,111.666667,112.0


In [103]:
# LEts transpose this
pivot_table.T

Gender,Female,Male
Course_Track,Unnamed: 1_level_1,Unnamed: 2_level_1
AI,,109.0
Cloud Computing,108.5,113.0
Cybersecurity,113.0,106.0
Data Science,111.666667,111.666667
Web Dev,112.0,112.0


### I. Merging and Joining

In [107]:
# Lets add more details to our bio_data2 dataset by create a new one.

course_data = {
    'Course_Track': ['Data Science', 'Web Dev', 'Cybersecurity', 'AI', 'Cloud Computing'],
    'Duration': ['8 months', '4 months', '5 months', '7 months', '6 months'],
    'Fee': [600000, 350000, 450000, 550000, 500000]
}
course_df = pd.DataFrame(course_data)
#  both bio_data2 and course_data have "Course_Track" in common

# Merge the two DataFrames on Course_Track (inner join by default)
merged_df = pd.merge(bio_data2, course_df, on='Course_Track')
print("Merged DataFrame (Inner Join on Course_Track):")
merged_df.head()

Merged DataFrame (Inner Join on Course_Track):


Unnamed: 0,First_Name,Last_Name,Gender,Seat_No,City,Course_Track,PC_make,PC_Os,Feedback,Duration,Fee
0,Emeka,Julius,Male,101,Lagos,Data Science,HP,Windows,Good,8 months,600000
1,Aisha,Bello,Female,102,Abuja,Cloud Computing,Dell,Linux,Excellent,6 months,500000
2,Ayo,Adewale,Male,103,Ibadan,Cybersecurity,HP,Windows,Excellent,5 months,450000
3,Chinedu,Godswill,Male,104,Enugu,AI,Asus,Windows,Good,7 months,550000
4,Fatima,Abubakar,Female,105,Kano,Data Science,Apple,Linux,Poor,8 months,600000


In [108]:
# Left join: keep all rows from df
left_joined = pd.merge(bio_data2, course_df, on='Course_Track', how='left')
print("Left Joined DataFrame:")
left_joined.head()

#Observe the output, it seems to be the same with the one above

Left Joined DataFrame:


Unnamed: 0,First_Name,Last_Name,Gender,Seat_No,City,Course_Track,PC_make,PC_Os,Feedback,Duration,Fee
0,Emeka,Julius,Male,101,Lagos,Data Science,HP,Windows,Good,8 months,600000
1,Aisha,Bello,Female,102,Abuja,Cloud Computing,Dell,Linux,Excellent,6 months,500000
2,Ayo,Adewale,Male,103,Ibadan,Cybersecurity,HP,Windows,Excellent,5 months,450000
3,Chinedu,Godswill,Male,104,Enugu,AI,Asus,Windows,Good,7 months,550000
4,Fatima,Abubakar,Female,105,Kano,Data Science,Apple,Linux,Poor,8 months,600000


In [113]:
# Create a city DataFrame with details for each unique city in your bio dataset
city_data = {
    'City': ['Lagos', 'Abuja', 'Ibadan', 'Enugu', 'Kano', 'Benin', 'Port Harcourt', 'Abeokuta', 'Jos', 'Calabar', 'Onitsha'],
    'Population': [14000000, 3000000, 5000000, 4000000, 3500000, 2000000, 2500000, 800000, 600000, 500000, 900000],
    'Region': ['South West', 'Federal Capital Territory', 'South West', 'South East', 'North West',
               'South South', 'South South', 'South West', 'North Central', 'South South', 'South East']
}

city_df = pd.DataFrame(city_data)

# Lets set index for the dataset before joining
df_indexed = city_df.set_index("City")
df_indexed


Unnamed: 0_level_0,Population,Region
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Lagos,14000000,South West
Abuja,3000000,Federal Capital Territory
Ibadan,5000000,South West
Enugu,4000000,South East
Kano,3500000,North West
Benin,2000000,South South
Port Harcourt,2500000,South South
Abeokuta,800000,South West
Jos,600000,North Central
Calabar,500000,South South


In [114]:
# Lets join this with merged_df

joined_df = df_indexed.join(merged_df, how='left')
print("Joined DataFrame using .join():")
joined_df.head()

# Ensure to note the output

Joined DataFrame using .join():


Unnamed: 0_level_0,Population,Region,First_Name,Last_Name,Gender,Seat_No,City,Course_Track,PC_make,PC_Os,Feedback,Duration,Fee
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Lagos,14000000,South West,,,,,,,,,,,
Abuja,3000000,Federal Capital Territory,,,,,,,,,,,
Ibadan,5000000,South West,,,,,,,,,,,
Enugu,4000000,South East,,,,,,,,,,,
Kano,3500000,North West,,,,,,,,,,,
