In [1]:
import pandas as pd
import numpy as np 
import os

In [2]:
# Locate the dataset folder (nothing is downloaded here): it is looked up two
# directory levels above the current working directory, under `datasets/`.
# The folder name literal must match the directory on disk exactly.
current_path = os.getcwd()
two_levels_up = os.path.join(current_path, '../../')
root_dir = os.path.abspath(two_levels_up)
dataset_path = os.path.join(root_dir, 'datasets', 'sudent-adm-records-kaggle-df')


def check_files(full_path):
    """Print the full path of every file under `full_path`, including subfolders.

    Nothing is returned; a missing or empty directory simply prints nothing.
    """
    for folder, _subfolders, file_names in os.walk(full_path):
        for file_path in (os.path.join(folder, file_name) for file_name in file_names):
            print(file_path)


check_files(dataset_path)

c:\Users\study_2025\Documents\Github\Doc-UP-AlejandroJaimes\Pandas-for-Education-Learning-through-Hands-On-Examples\datasets\sudent-adm-records-kaggle-df\student_admission_record_dirty.csv


### **Questions and Tasks**  

#### **1. Data Exploration**  
1. Load the dataset into a Pandas `DataFrame`.  
2. Display the first 5 rows of the DataFrame.  
3. What are the dataset's columns?  

In [11]:
# 1. Load the dataset into a Pandas `DataFrame`.
# The CSV is read with pandas' defaults from the dataset folder located above.
filepath = os.path.join(dataset_path, 'student_admission_record_dirty.csv')
std_adm_dirty = pd.read_csv(filepath_or_buffer=filepath)

In [12]:
# 2. Display the first 5 rows of the DataFrame.
# The row count is spelled out; 5 is also what `head()` uses by default.
std_adm_dirty.head(n=5)

Unnamed: 0,Name,Age,Gender,Admission Test Score,High School Percentage,City,Admission Status
0,Shehroz,24.0,Female,50.0,68.9,Quetta,Rejected
1,Waqar,21.0,Female,99.0,60.73,Karachi,
2,Bushra,17.0,Male,89.0,,Islamabad,Accepted
3,Aliya,17.0,Male,55.0,85.29,Karachi,Rejected
4,Bilal,20.0,Male,65.0,61.13,Lahore,


In [13]:
# 3. What are the dataset's columns?
# Materialise the column Index as a plain Python list of header names.
list(std_adm_dirty.columns)

['Name',
 'Age',
 'Gender',
 'Admission Test Score',
 'High School Percentage',
 'City',
 'Admission Status']

#### **2. Selecting and Manipulating Columns**  
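4. Select the columns `admission_status`, `high_school_percentage`, and `admission_test_score` (the lowercase, underscore-separated forms of the original `Admission Status`, `High School Percentage`, and `Admission Test Score` headers).  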
5. Rename the selected columns to `status`, `hs_score`, and `test_score`, respectively. 

In [14]:
# Preparation for tasks 4-5: normalise the raw headers to snake_case
# (lowercase, spaces -> underscores) so they match the names used in the tasks.
std_adm_dirty.columns = std_adm_dirty.columns.str.lower().str.replace(' ', '_')
def shorten_cols(col):
    """Return the short alias for one of the three long column names.

    Only an exact, whole-name match is renamed:
    ``admission_test_score`` -> ``test_score``,
    ``high_school_percentage`` -> ``hs_score``,
    ``admission_status`` -> ``status``.
    Any other label is returned unchanged (as a string), so a column whose
    name merely *contains* one of the long names is never rewritten by accident.

    Parameters
    ----------
    col : hashable
        A column label; it is converted to ``str`` before the lookup.

    Returns
    -------
    str
        The short alias, or ``str(col)`` when no alias is defined.
    """
    aliases = {
        'admission_test_score': 'test_score',
        'high_school_percentage': 'hs_score',
        'admission_status': 'status',
    }
    name = str(col)
    return aliases.get(name, name)
# Task 5: apply the short aliases. Rebinding the name (instead of
# `inplace=True`) keeps the cell a plain "new frame from old frame" step.
std_adm_dirty = std_adm_dirty.rename(columns=shorten_cols)
# Task 4: the three requested columns, selected under their short names.
selected_scores = std_adm_dirty[['status', 'hs_score', 'test_score']]
std_adm_dirty.columns

Index(['name', 'age', 'gender', 'test_score', 'hs_score', 'city', 'status'], dtype='object')

#### **3. Sorting and Summarizing Data**  
6. Sort the DataFrame by `hs_score` in descending order and display the top 5 rows.  
7. Compute the average of `hs_score` and `test_score`.  
8. Count how many students were admitted and how many were rejected. 

In [15]:
# 6. Sort the DataFrame by `hs_score` in descending order and display the top 5 rows.
# The sorted frame replaces `std_adm_dirty`, so every later cell sees this row order.
std_adm_dirty = std_adm_dirty.sort_values('hs_score', ascending=False)
std_adm_dirty.head()

Unnamed: 0,name,age,gender,test_score,hs_score,city,status
47,Maryam,19.0,Female,74.0,110.5,Lahore,Accepted
123,Bushra,18.0,Female,93.0,99.8,Islamabad,Rejected
22,Kamran,18.0,Male,53.0,98.98,Multan,Rejected
9,Kamran,18.0,Male,53.0,98.98,Multan,Rejected
145,,23.0,Female,93.0,98.71,Karachi,Accepted


In [18]:
# 7. Compute the average of `hs_score` and `test_score`.
# Row-wise mean of the two scores. Missing values are skipped, so a row with one
# missing score gets the other score as its average; a row with both missing stays NaN.
# NOTE(review): the task may instead be asking for the column-wise averages
# (i.e. `.mean()` over each column) - confirm the intended reading.
score_columns = ['hs_score', 'test_score']
std_adm_dirty['avg_score'] = std_adm_dirty[score_columns].mean(axis=1)
std_adm_dirty.head()

Unnamed: 0,name,age,gender,test_score,hs_score,city,status,avg_score
47,Maryam,19.0,Female,74.0,110.5,Lahore,Accepted,92.25
123,Bushra,18.0,Female,93.0,99.8,Islamabad,Rejected,96.4
22,Kamran,18.0,Male,53.0,98.98,Multan,Rejected,75.99
9,Kamran,18.0,Male,53.0,98.98,Multan,Rejected,75.99
145,,23.0,Female,93.0,98.71,Karachi,Accepted,95.855


In [21]:
# 8. Count how many students were admitted and how many were rejected.
# Rows whose status is missing are left out of the tally (value_counts drops NaN by default).
status_column = std_adm_dirty.loc[:, 'status']
status_column.value_counts()

status
Rejected    76
Accepted    71
Name: count, dtype: int64

#### **4. Handling Missing Values and Advanced Operations**  
9. Identify if there are any missing values in the dataset.  
10. If `hs_score` or `test_score` contain missing values, replace them with the median of each column.  
11. Create a new column called `final_score`, which is the average of `hs_score` and `test_score`.  
12. Normalize the values in `final_score` to a range of 0 to 1.

In [23]:
# 9. Identify if there are any missing values in the dataset.
# Collapse the cell-by-cell NaN mask to one boolean: True if at least one cell is missing.
std_adm_dirty.isna().to_numpy().any()

np.True_

In [27]:
# 10. If `hs_score` or `test_score` contain missing values, replace them with the median of each column.
# Step 1 - inspect: `missing_values` holds both score columns (all rows);
# `df_nulls` keeps only the rows where at least one of the two scores is NaN.
missing_values = std_adm_dirty[['hs_score', 'test_score']]
print(missing_values.isna().sum())
has_missing_score = missing_values.isna().any(axis=1)
df_nulls = missing_values.loc[has_missing_score]
df_nulls

hs_score      11
test_score    11
dtype: int64


Unnamed: 0,hs_score,test_score
6,97.31,
137,97.21,
116,86.5,
48,75.05,
68,69.83,
96,67.75,
125,61.83,
139,56.43,
67,0.0,
2,,89.0


In [31]:
# Step 2 - per-column medians used as fill values (NaN is ignored by default).
median_hs_score = std_adm_dirty['hs_score'].median()
median_ts_score = std_adm_dirty['test_score'].median()
print(f'Median hs_score: {median_hs_score}\nMedian ts_score: {median_ts_score}')

Median hs_score: 77.545
Median ts_score: 79.0


In [32]:
# Step 3 - fill each score column's gaps with that column's own median.
# Only these two columns are touched; previously derived columns are not recomputed.
std_adm_dirty = std_adm_dirty.fillna(
    {'hs_score': median_hs_score, 'test_score': median_ts_score}
)

In [34]:
# Re-count missing values per column to confirm both score columns are now complete.
std_adm_dirty.isnull().sum()

name          10
age           10
gender        10
test_score     0
hs_score       0
city          10
status        10
avg_score      2
dtype: int64

In [37]:

# Scratch check: with no `axis` argument np.mean averages every value of both
# columns together, so the result is one scalar rather than a per-student average.
np.mean([std_adm_dirty['test_score'],std_adm_dirty['hs_score']])

np.float64(76.7833280254777)

In [38]:
# 11. Create a new column called `final_score`, which is the average of `hs_score` and `test_score`.
# Plain element-wise arithmetic on the two columns: (test + hs) / 2 per row.
# A NaN in either score would propagate to the result.
score_sum = std_adm_dirty['test_score'] + std_adm_dirty['hs_score']
std_adm_dirty['final_score'] = score_sum / 2
std_adm_dirty.head()

Unnamed: 0,name,age,gender,test_score,hs_score,city,status,avg_score,final_score
47,Maryam,19.0,Female,74.0,110.5,Lahore,Accepted,92.25,92.25
123,Bushra,18.0,Female,93.0,99.8,Islamabad,Rejected,96.4,96.4
22,Kamran,18.0,Male,53.0,98.98,Multan,Rejected,75.99,75.99
9,Kamran,18.0,Male,53.0,98.98,Multan,Rejected,75.99,75.99
145,,23.0,Female,93.0,98.71,Karachi,Accepted,95.855,95.855


In [42]:
# 12. Normalize the values in `final_score` to a range of 0 to 1.
# Min-max scaling: the smallest final_score maps to 0 and the largest to 1.
# Gotcha: if every final_score were equal, the range would be 0 and the division would yield NaN.
final_scores = std_adm_dirty['final_score']
x_min, x_max = final_scores.min(), final_scores.max()
std_adm_dirty['nfinal_score'] = final_scores.sub(x_min).div(x_max - x_min)
std_adm_dirty.head()

Unnamed: 0,name,age,gender,test_score,hs_score,city,status,avg_score,final_score,nfinal_score
47,Maryam,19.0,Female,74.0,110.5,Lahore,Accepted,92.25,92.25,0.747102
123,Bushra,18.0,Female,93.0,99.8,Islamabad,Rejected,96.4,96.4,0.795702
22,Kamran,18.0,Male,53.0,98.98,Multan,Rejected,75.99,75.99,0.556681
9,Kamran,18.0,Male,53.0,98.98,Multan,Rejected,75.99,75.99,0.556681
145,,23.0,Female,93.0,98.71,Karachi,Accepted,95.855,95.855,0.78932


#### **5. Data Analysis**  
13. Which city has the most admitted students?  
14. How does the average admission test score (`test_score`) differ between admitted and rejected students?  
15. Transpose the DataFrame so that columns become rows and vice versa.  

In [58]:
# 13. Which city has the most admitted students?
# First keep only the accepted students, with just the two columns needed for counting.
cols = ['status', 'city']
is_accepted = std_adm_dirty['status'].eq("Accepted")
cities_admitted = std_adm_dirty.loc[is_accepted, cols]
cities_admitted.head()

Unnamed: 0,status,city
47,Accepted,Lahore
145,Accepted,Karachi
6,Accepted,Multan
132,Accepted,Peshawar
149,Accepted,Quetta


In [60]:
# Count accepted students per city (rows with a missing city are not grouped),
# then show the five cities with the highest totals.
accepted_per_city = cities_admitted.groupby('city')['status'].count()
group_data = accepted_per_city.reset_index(name="total_accepted")

group_data.sort_values(by='total_accepted', ascending=False).head(5)

Unnamed: 0,city,total_accepted
5,Quetta,13
2,Lahore,12
1,Karachi,10
3,Multan,10
6,Rawalpindi,8


In [63]:
# 14. How does the average admission test score (`test_score`) differ between admitted and rejected students?
# One mean per status value; rows with a missing status fall outside every group.
test_scores_by_status = std_adm_dirty.groupby("status")["test_score"]
test_scores_by_status.mean()

status
Accepted    78.140845
Rejected    76.907895
Name: test_score, dtype: float64

In [64]:
# 15. Transpose the DataFrame so that columns become rows and vice versa.
# Reference view of the frame in its normal orientation, before transposing.
std_adm_dirty.head(n=5)

Unnamed: 0,name,age,gender,test_score,hs_score,city,status,avg_score,final_score,nfinal_score
47,Maryam,19.0,Female,74.0,110.5,Lahore,Accepted,92.25,92.25,0.747102
123,Bushra,18.0,Female,93.0,99.8,Islamabad,Rejected,96.4,96.4,0.795702
22,Kamran,18.0,Male,53.0,98.98,Multan,Rejected,75.99,75.99,0.556681
9,Kamran,18.0,Male,53.0,98.98,Multan,Rejected,75.99,75.99,0.556681
145,,23.0,Female,93.0,98.71,Karachi,Accepted,95.855,95.855,0.78932


In [65]:
# Transposed view: each original column becomes a row and each student a column.
# `head()` therefore shows the first five original columns, not five students.
std_adm_dirty.transpose().head()

Unnamed: 0,47,123,22,9,145,36,120,85,108,54,...,5,8,11,19,25,29,74,130,141,144
name,Maryam,Bushra,Kamran,Kamran,,Ayesha,Ayesha,Maham,Hania,Shehroz,...,Murtaza,Rohail,Nashit,,Shayan,Laiba,Afshan,Asad,Laiba,Sana
age,19.0,18.0,18.0,18.0,23.0,24.0,24.0,22.0,24.0,22.0,...,23.0,17.0,18.0,19.0,18.0,22.0,23.0,20.0,21.0,21.0
gender,Female,Female,Male,Male,Female,Male,Male,Female,Female,Female,...,Female,Male,Male,Male,Male,Female,Female,Female,Female,Female
test_score,74.0,93.0,53.0,53.0,93.0,94.0,94.0,88.0,83.0,72.0,...,79.0,64.0,89.0,84.0,79.0,57.0,98.0,51.0,76.0,53.0
hs_score,110.5,99.8,98.98,98.98,98.71,98.43,98.43,98.38,97.93,97.84,...,77.545,77.545,77.545,77.545,77.545,77.545,77.545,77.545,77.545,77.545
