Task 1: Build and load the dataset

In [35]:
import pandas as pd
import random

attendance_raw = []

for i in range(1, 25):
    record = {
        "student_id": f"S{i:03d}",
        "cohort": random.choice(["alpha", "beta", "gamma"]),
        "attended_sessions": random.randint(0, 6),
        "expected_sessions": 6
    }
    
    attendance_raw.append(record)

attendance = pd.DataFrame(attendance_raw)

print("First five rows:")
print(attendance.head())

print("\nInfo:")
attendance.info()

First five rows:
  student_id cohort  attended_sessions  expected_sessions
0       S001   beta                  1                  6
1       S002  gamma                  1                  6
2       S003  gamma                  5                  6
3       S004   beta                  4                  6
4       S005  alpha                  5                  6

Info:
<class 'pandas.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   student_id         24 non-null     str  
 1   cohort             24 non-null     str  
 2   attended_sessions  24 non-null     int64
 3   expected_sessions  24 non-null     int64
dtypes: int64(2), str(2)
memory usage: 900.0 bytes


Task 2: Set an index and validate alignment

In [36]:
attendance_indexed = attendance.set_index("student_id")

excused_ids = ["S003", "S006", "S009", "S011", "S012", "S015", 
               "S018", "S024", "S030", "S035"]
excused_values = [1, 2, 1, 3, 2, 1, 1, 2, 1, 2]

excused_absences = pd.Series(data=excused_values, index=excused_ids)
attendance_indexed["adjusted_attendance"] = (
    attendance_indexed["attended_sessions"] + excused_absences
)

attendance_indexed


attendance_indexed["adjusted_attendance"] = (
    attendance_indexed["adjusted_attendance"].fillna(attendance_indexed["attended_sessions"])
)

print("\nUpdated adjusted_attendance:")
print(attendance_indexed[["attended_sessions", "adjusted_attendance"]])


Updated adjusted_attendance:
            attended_sessions  adjusted_attendance
student_id                                        
S001                        1                  1.0
S002                        1                  1.0
S003                        5                  6.0
S004                        4                  4.0
S005                        5                  5.0
S006                        0                  2.0
S007                        6                  6.0
S008                        2                  2.0
S009                        3                  4.0
S010                        6                  6.0
S011                        3                  6.0
S012                        5                  7.0
S013                        2                  2.0
S014                        2                  2.0
S015                        0                  1.0
S016                        6                  6.0
S017                        4                  4.0
S

Task 3: Clean and normalize categories

In [37]:
attendance.loc[0, "cohort"] = " Alpha"
attendance.loc[8, "cohort"] = "BETA "
attendance.loc[11, "cohort"] = " gamma "
attendance.loc[14, "cohort"] = " Beta   "
attendance.loc[17, "cohort"] = "Gamma  "
attendance.loc[21, "cohort"] = " ALPHA   "

In [38]:
attendance["cohort"].unique()

<StringArray>
[   ' Alpha',     'gamma',      'beta',     'alpha',     'BETA ',   ' gamma ',
  ' Beta   ',   'Gamma  ', ' ALPHA   ']
Length: 9, dtype: str

In [39]:
attendance["cohort"] = attendance["cohort"].str.strip().str.lower()

In [41]:
attendance["cohort"].unique()

<StringArray>
['alpha', 'gamma', 'beta']
Length: 3, dtype: str

Task 4: Filter and compute summaries

In [42]:
low_attendance = attendance[attendance["attended_sessions"] < attendance["expected_sessions"]]
average_by_cohort = attendance.groupby("cohort")["attended_sessions"].mean()
print(average_by_cohort)

cohort
alpha    2.727273
beta     3.666667
gamma    3.571429
Name: attended_sessions, dtype: float64


In [43]:
print("Cohorts match:", set(average_by_cohort.index) == set(attendance["cohort"].unique()))

Cohorts match: True


Task 5: Add a derived field and validate it

In [44]:
attendance["attendance_ok"] = (
    attendance["attended_sessions"] >= attendance["expected_sessions"]
)
assert (
    attendance.loc[low_attendance.index, "attendance_ok"] == False
).all()