In [1]:
import pandas as pd

# raw attendance data 24 rec
attendance_raw = [
    {"student_id": f"S{str(i).zfill(3)}",
     "cohort": ["alpha", "beta", "gamma"][i % 3],
     "attended_sessions": i % 7,
     "expected_sessions": 6}
    for i in range(1, 25)
]

# Load into DataFrame
attendance = pd.DataFrame(attendance_raw)

# Display first five rows
print(attendance.head())

# Confirm structure and data types
attendance.info()

  student_id cohort  attended_sessions  expected_sessions
0       S001   beta                  1                  6
1       S002  gamma                  2                  6
2       S003  alpha                  3                  6
3       S004   beta                  4                  6
4       S005  gamma                  5                  6
<class 'pandas.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   student_id         24 non-null     str  
 1   cohort             24 non-null     str  
 2   attended_sessions  24 non-null     int64
 3   expected_sessions  24 non-null     int64
dtypes: int64(2), str(2)
memory usage: 900.0 bytes


In [2]:
# Set student_id as the index
attendance_indexed = attendance.set_index("student_id")

# Create a Series with excused absences (some IDs do NOT exist)
excused_absences = pd.Series(
    {
        "S002": 1,
        "S004": 2,
        "S006": 1,
        "S009": 1,
        "S012": 2,
        "S015": 1,
        "S018": 1,
        "S021": 2,
        "S024": 1,
        "S030": 2,  # does not exist
        "S031": 1   # does not exist
    }
)

# Add Series to attended_sessions (index alignment happens here)
attendance_indexed["adjusted_attendance"] = (
    attendance_indexed["attended_sessions"] + excused_absences
)

# Show rows with missing values due to non-matching IDs
print(attendance_indexed[attendance_indexed["adjusted_attendance"].isna()][
    ["attended_sessions", "adjusted_attendance"]
])

# Fill missing adjusted_attendance with original attended_sessions
attendance_indexed["adjusted_attendance"] = (
    attendance_indexed["adjusted_attendance"]
    .fillna(attendance_indexed["attended_sessions"])
)

# Display updated column
print(attendance_indexed[["attended_sessions", "adjusted_attendance"]].head())

            attended_sessions  adjusted_attendance
student_id                                        
S001                        1                  NaN
S003                        3                  NaN
S005                        5                  NaN
S007                        0                  NaN
S008                        1                  NaN
S010                        3                  NaN
S011                        4                  NaN
S013                        6                  NaN
S014                        0                  NaN
S016                        2                  NaN
S017                        3                  NaN
S019                        5                  NaN
S020                        6                  NaN
S022                        1                  NaN
S023                        2                  NaN
            attended_sessions  adjusted_attendance
student_id                                        
S001                        1  

In [3]:
# Introduce inconsistencies
attendance_indexed.loc["S003", "cohort"] = " Alpha "
attendance_indexed.loc["S007", "cohort"] = "BETA"
attendance_indexed.loc["S010", "cohort"] = " gamma "

# Normalize cohort column
attendance_indexed["cohort"] = (
    attendance_indexed["cohort"]
    .str.strip()
    .str.lower()
)

# Confirm cleaning worked
print(attendance_indexed["cohort"].unique())

<StringArray>
['beta', 'gamma', 'alpha']
Length: 3, dtype: str


In [4]:
# Filter students with low attendance
low_attendance = attendance_indexed[
    attendance_indexed["attended_sessions"] < attendance_indexed["expected_sessions"]
]

# Average attended sessions by cohort
attendance_summary = (
    attendance_indexed
    .groupby("cohort")["attended_sessions"]
    .mean()
)

# Print results
print(low_attendance.head())
print(attendance_summary)

           cohort  attended_sessions  expected_sessions  adjusted_attendance
student_id                                                                  
S001         beta                  1                  6                  1.0
S002        gamma                  2                  6                  3.0
S003        alpha                  3                  6                  3.0
S004         beta                  4                  6                  6.0
S005        gamma                  5                  6                  5.0
cohort
alpha    3.000000
beta     2.714286
gamma    2.888889
Name: attended_sessions, dtype: float64


In [6]:
attendance_indexed["attendance_ok"] = (
    attendance_indexed["attended_sessions"] >= attendance_indexed["expected_sessions"]
)

low_attendance = attendance_indexed[
    attendance_indexed["attended_sessions"] < attendance_indexed["expected_sessions"]
]

print(low_attendance["attendance_ok"].unique())

[False]
