### Task 1

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global generator so the simulated attendance counts
# come out the same on every run.
np.random.seed(42)
cohorts = ["alpha", "beta", "gamma"]

# One record per student numbered 1 through 24: a zero-padded ID, a cohort
# picked round-robin from the student number, a random attended count in the
# range 0-6 (the upper bound of randint is exclusive), and a fixed
# expectation of 6 sessions.
attendance_raw = [
    dict(
        student_id=f"S{student_number:03d}",
        cohort=cohorts[student_number % len(cohorts)],
        attended_sessions=np.random.randint(0, 7),
        expected_sessions=6,
    )
    for student_number in range(1, 25)
]

attendance = pd.DataFrame(attendance_raw)

# Quick look at the first rows, followed by the column dtypes / null counts.
print(attendance.head())
attendance.info()

  student_id cohort  attended_sessions  expected_sessions
0       S001   beta                  6                  6
1       S002  gamma                  3                  6
2       S003  alpha                  4                  6
3       S004   beta                  6                  6
4       S005  gamma                  2                  6
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   student_id         24 non-null     object
 1   cohort             24 non-null     object
 2   attended_sessions  24 non-null     int64 
 3   expected_sessions  24 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 900.0+ bytes


### Task 2

In [2]:
import pandas as pd
# Key the table by student ID so later operations can align on it by label.
# set_index returns a new frame here; `attendance` itself is not modified.
attendance_indexed = attendance.set_index(keys="student_id")
attendance_indexed.head(n=5)

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S001,beta,6,6
S002,gamma,3,6
S003,alpha,4,6
S004,beta,6,6
S005,gamma,2,6


In [3]:
# Excused absences keyed by student ID. Two of the IDs (S030, S031) have no
# row in `attendance_indexed`; they are included to show how label alignment
# treats keys that do not match.
excused_absences = pd.Series({
        "S001": 1,
        "S003": 1,
        "S005": 2,
        "S007": 1,
        "S009": 1,
        "S011": 2,
        "S013": 1,
        "S015": 1,
        "S020": 1,
        "S030": 2,  # no such student in the attendance table
        "S031": 1   # no such student in the attendance table
    })

# Series addition aligns on index labels, so students without an
# excused-absence entry come out as NaN. Assigning the result to a column
# reindexes it to the frame's index, which drops the unknown IDs.
attendance_indexed["adjusted_attendance"] = (
    attendance_indexed["attended_sessions"] + excused_absences
)

print(attendance_indexed["adjusted_attendance"])
print("\n")

# Report how many students had no matching excused-absence entry. Only the
# last expression of a cell is displayed, so this has to be printed.
unmatched_count = attendance_indexed["adjusted_attendance"].isna().sum()
print("Students without an excused-absence entry:", unmatched_count)

# Fill those gaps with the unadjusted attended count (no excused absences
# means nothing to add).
attendance_indexed["adjusted_attendance"] = (
    attendance_indexed["adjusted_attendance"]
    .fillna(attendance_indexed["attended_sessions"])
)

# NOTE(review): adjusted values can exceed expected_sessions (the output shows
# 7.0 and 8.0 against an expectation of 6) and the column stays float64 after
# the NaN round-trip -- confirm whether a cap or an integer cast is wanted.
attendance_indexed["adjusted_attendance"]

student_id
S001    7.0
S002    NaN
S003    5.0
S004    NaN
S005    4.0
S006    NaN
S007    5.0
S008    NaN
S009    2.0
S010    NaN
S011    8.0
S012    NaN
S013    3.0
S014    NaN
S015    4.0
S016    NaN
S017    NaN
S018    NaN
S019    NaN
S020    4.0
S021    NaN
S022    NaN
S023    NaN
S024    NaN
Name: adjusted_attendance, dtype: float64




student_id
S001    7.0
S002    3.0
S003    5.0
S004    6.0
S005    4.0
S006    4.0
S007    5.0
S008    6.0
S009    2.0
S010    2.0
S011    8.0
S012    2.0
S013    3.0
S014    4.0
S015    4.0
S016    2.0
S017    5.0
S018    4.0
S019    1.0
S020    4.0
S021    5.0
S022    5.0
S023    1.0
S024    3.0
Name: adjusted_attendance, dtype: float64

### Task 3

In [4]:
# Inject inconsistent casing and stray whitespace into a few cohort labels to
# simulate messy input. These writes target rows by their integer index label
# and also replace whatever cohort those rows held before.
attendance.loc[1, "cohort"] = " Alpha "
attendance.loc[5, "cohort"] = "BETA"
attendance.loc[9, "cohort"] = " gamma "
attendance.loc[13, "cohort"] = "ALPHA "
attendance.loc[17, "cohort"] = " Beta"

# Show the labels before cleaning. Only the last expression of a cell is
# displayed, so the "before" view has to be printed explicitly.
print("Cohort labels before cleaning:", attendance["cohort"].unique())

# Normalise: trim surrounding whitespace, then lower-case.
attendance["cohort"] = (
    attendance["cohort"]
    .str.strip()
    .str.lower()
)

# Labels after cleaning (displayed as the cell result).
attendance["cohort"].unique()

array(['beta', 'alpha', 'gamma'], dtype=object)

### Task 4

In [5]:
# Students who attended fewer sessions than expected. The subset is copied so
# it is an independent frame rather than a view of `attendance`.
low_attendance = attendance.loc[
    attendance["attended_sessions"].lt(attendance["expected_sessions"])
].copy()

print("Low Attendance DataFrame (first 5 rows):")
display(low_attendance.head(5))


# Mean number of attended sessions within each cohort
cohort_summary = (
    attendance["attended_sessions"]
    .groupby(attendance["cohort"])
    .mean()
)

print("\nAverage attended_sessions by cohort:")
display(cohort_summary)


# Print the cohort labels from the data next to those in the summary so the
# two can be compared by eye
dataset_cohorts = attendance["cohort"].unique()
summary_cohorts = cohort_summary.index.tolist()
print("\nUnique cohorts in dataset:", dataset_cohorts)
print("Cohorts in summary:", summary_cohorts)

Low Attendance DataFrame (first 5 rows):


Unnamed: 0,student_id,cohort,attended_sessions,expected_sessions
1,S002,alpha,3,6
2,S003,alpha,4,6
4,S005,gamma,2,6
5,S006,beta,4,6
6,S007,beta,4,6



Average attended_sessions by cohort:


cohort
alpha    3.125000
beta     3.777778
gamma    3.571429
Name: attended_sessions, dtype: float64


Unique cohorts in dataset: ['beta' 'alpha' 'gamma']
Cohorts in summary: ['alpha', 'beta', 'gamma']


### Task 5

In [6]:
# Flag each student as OK when attended sessions reach or exceed the expected
# number of sessions.
attendance["attendance_ok"] = attendance["attended_sessions"].ge(
    attendance["expected_sessions"]
)

print("Attendance with new column:")
display(attendance.head(5))


# Validation: every row that was selected into `low_attendance` must carry
# attendance_ok == False. Rows are looked up by the index labels the subset
# kept from `attendance`.
validation_check = attendance.loc[low_attendance.index, "attendance_ok"]

assert not validation_check.any(), (
    "Validation failed: Some low attendance students marked as OK"
)

print("Validation passed: All low attendance students have attendance_ok = False.")

Attendance with new column:


Unnamed: 0,student_id,cohort,attended_sessions,expected_sessions,attendance_ok
0,S001,beta,6,6,True
1,S002,alpha,3,6,False
2,S003,alpha,4,6,False
3,S004,beta,6,6,True
4,S005,gamma,2,6,False


Validation passed: All low attendance students have attendance_ok = False.
