# Task 1: Build and load the dataset
Create a list of dictionaries called attendance_raw with exactly 24 records. Each record must include:

student_id in the format S001 to S024
cohort as one of ["alpha", "beta", "gamma"]
attended_sessions as an integer between 0 and 6
expected_sessions as the integer 6
Then load the list into a DataFrame named attendance. Print the first five rows and call info() to confirm the structure and data types.

 

In [365]:
import numpy as np
import pandas as pd
import random

In [366]:
rng = np.random.default_rng(42)

cohort = ["alpha", "beta", "gamma"]

attendance_raw = []


for i in range(1,25):
    
 records = { 'student_id': f"S00{i}" if i < 10  else f"S0{i}" ,
                   'cohort': rng.choice(cohort),
                   'attended_sessions': rng.integers(0,7),
                   'expected_sessions': 6}
 attendance_raw.append(records)


                   



In [367]:
attendance_raw

[{'student_id': 'S001',
  'cohort': np.str_('alpha'),
  'attended_sessions': np.int64(5),
  'expected_sessions': 6},
 {'student_id': 'S002',
  'cohort': np.str_('beta'),
  'attended_sessions': np.int64(3),
  'expected_sessions': 6},
 {'student_id': 'S003',
  'cohort': np.str_('beta'),
  'attended_sessions': np.int64(6),
  'expected_sessions': 6},
 {'student_id': 'S004',
  'cohort': np.str_('alpha'),
  'attended_sessions': np.int64(4),
  'expected_sessions': 6},
 {'student_id': 'S005',
  'cohort': np.str_('alpha'),
  'attended_sessions': np.int64(0),
  'expected_sessions': 6},
 {'student_id': 'S006',
  'cohort': np.str_('beta'),
  'attended_sessions': np.int64(6),
  'expected_sessions': 6},
 {'student_id': 'S007',
  'cohort': np.str_('gamma'),
  'attended_sessions': np.int64(5),
  'expected_sessions': 6},
 {'student_id': 'S008',
  'cohort': np.str_('gamma'),
  'attended_sessions': np.int64(5),
  'expected_sessions': 6},
 {'student_id': 'S009',
  'cohort': np.str_('beta'),
  'attended_se

In [368]:
attendance = pd.DataFrame(attendance_raw)
attendance

Unnamed: 0,student_id,cohort,attended_sessions,expected_sessions
0,S001,alpha,5,6
1,S002,beta,3,6
2,S003,beta,6,6
3,S004,alpha,4,6
4,S005,alpha,0,6
5,S006,beta,6,6
6,S007,gamma,5,6
7,S008,gamma,5,6
8,S009,beta,0,6
9,S010,gamma,3,6


In [369]:
attendance.iloc[:5,:]

Unnamed: 0,student_id,cohort,attended_sessions,expected_sessions
0,S001,alpha,5,6
1,S002,beta,3,6
2,S003,beta,6,6
3,S004,alpha,4,6
4,S005,alpha,0,6


In [370]:
attendance.info()

<class 'pandas.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   student_id         24 non-null     str  
 1   cohort             24 non-null     str  
 2   attended_sessions  24 non-null     int64
 3   expected_sessions  24 non-null     int64
dtypes: int64(2), str(2)
memory usage: 900.0 bytes


# Task 2: Set an index and validate alignment
Set student_id as the index and store the result in attendance_indexed. Create a Series named excused_absences with at least 10 student IDs (some IDs must not exist in the DataFrame). Add this Series to attended_sessions to create a new column adjusted_attendance. Confirm that rows without matching IDs become missing in adjusted_attendance. Then fill missing values in adjusted_attendance with the original attended_sessions and show the updated column.



In [371]:
attendance_indexed =  attendance.set_index('student_id')
attendance_indexed

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S001,alpha,5,6
S002,beta,3,6
S003,beta,6,6
S004,alpha,4,6
S005,alpha,0,6
S006,beta,6,6
S007,gamma,5,6
S008,gamma,5,6
S009,beta,0,6
S010,gamma,3,6


In [372]:
excused_absences = pd.Series({"S001": 1,
        "S003": 1,
        "S005": 2,
        "S007": 1,
        "S010": 1,
        "S012": 2,
        "S015": 1,
        "S018": 1,
        "S024": 2,
        "S030": 1, 
        "S999": 2,})

In [373]:
excused_absences

S001    1
S003    1
S005    2
S007    1
S010    1
S012    2
S015    1
S018    1
S024    2
S030    1
S999    2
dtype: int64

In [374]:
attendance_indexed['adjusted_attendance'] = attendance_indexed['attended_sessions'] + excused_absences

In [375]:
attendance_indexed

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions,adjusted_attendance
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S001,alpha,5,6,6.0
S002,beta,3,6,
S003,beta,6,6,7.0
S004,alpha,4,6,
S005,alpha,0,6,2.0
S006,beta,6,6,
S007,gamma,5,6,6.0
S008,gamma,5,6,
S009,beta,0,6,
S010,gamma,3,6,4.0


In [376]:
missing_rows = attendance_indexed['adjusted_attendance'].isna()
missing_rows

student_id
S001    False
S002     True
S003    False
S004     True
S005    False
S006     True
S007    False
S008     True
S009     True
S010    False
S011     True
S012    False
S013     True
S014     True
S015    False
S016     True
S017     True
S018    False
S019     True
S020     True
S021     True
S022     True
S023     True
S024    False
Name: adjusted_attendance, dtype: bool

In [377]:
attendance_indexed2 = attendance_indexed.copy()
    
attendance_indexed2['adjusted_attendance'] = attendance_indexed2['adjusted_attendance'].fillna(attendance_indexed2['attended_sessions'])
attendance_indexed2

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions,adjusted_attendance
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S001,alpha,5,6,6.0
S002,beta,3,6,3.0
S003,beta,6,6,7.0
S004,alpha,4,6,4.0
S005,alpha,0,6,2.0
S006,beta,6,6,6.0
S007,gamma,5,6,6.0
S008,gamma,5,6,5.0
S009,beta,0,6,0.0
S010,gamma,3,6,4.0


In [378]:
attendance_indexed2['adjusted_attendance']

student_id
S001    6.0
S002    3.0
S003    7.0
S004    4.0
S005    2.0
S006    6.0
S007    6.0
S008    5.0
S009    0.0
S010    4.0
S011    2.0
S012    8.0
S013    4.0
S014    5.0
S015    4.0
S016    1.0
S017    3.0
S018    1.0
S019    5.0
S020    4.0
S021    5.0
S022    2.0
S023    6.0
S024    8.0
Name: adjusted_attendance, dtype: float64

# Task 3: Clean and normalize categories
Introduce a small inconsistency by modifying a few cohort values to include extra whitespace and inconsistent casing. Then write pandas code to normalize the cohort column by stripping whitespace and converting to lowercase. After cleaning, display the unique cohorts to confirm that the inconsistencies are resolved.

In [379]:
inconsistent_cohort = attendance_indexed2

inconsistent_cohort

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions,adjusted_attendance
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S001,alpha,5,6,6.0
S002,beta,3,6,3.0
S003,beta,6,6,7.0
S004,alpha,4,6,4.0
S005,alpha,0,6,2.0
S006,beta,6,6,6.0
S007,gamma,5,6,6.0
S008,gamma,5,6,5.0
S009,beta,0,6,0.0
S010,gamma,3,6,4.0


In [380]:
inconsistent_cohort.iloc[2,0] = ' BeTa'
inconsistent_cohort.iloc[8,0] = '  AlphA   '
inconsistent_cohort.iloc[13,0] = 'GamMa   '


In [381]:
inconsistent_cohort

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions,adjusted_attendance
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S001,alpha,5,6,6.0
S002,beta,3,6,3.0
S003,BeTa,6,6,7.0
S004,alpha,4,6,4.0
S005,alpha,0,6,2.0
S006,beta,6,6,6.0
S007,gamma,5,6,6.0
S008,gamma,5,6,5.0
S009,AlphA,0,6,0.0
S010,gamma,3,6,4.0


In [382]:
consistent_cohort = inconsistent_cohort.copy()
consistent_cohort['cohort'] = inconsistent_cohort['cohort'].str.strip()
consistent_cohort['cohort'] = consistent_cohort['cohort'].str.lower()

consistent_cohort.head(5)

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions,adjusted_attendance
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S001,alpha,5,6,6.0
S002,beta,3,6,3.0
S003,beta,6,6,7.0
S004,alpha,4,6,4.0
S005,alpha,0,6,2.0


In [383]:
consistent_cohort['cohort'].unique()

<StringArray>
['alpha', 'beta', 'gamma']
Length: 3, dtype: str

# Task 4: Filter and compute summaries
Filter the DataFrame to students where attended_sessions is below expected_sessions. Store the result in low_attendance. Compute the average attended_sessions by cohort using groupby. Print the summary and verify that cohorts in the summary match the cleaned cohorts.

In [384]:
df = consistent_cohort

In [385]:
low_attendance = df[(df['attended_sessions']) < (df['expected_sessions'])]

In [386]:
low_attendance

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions,adjusted_attendance
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S001,alpha,5,6,6.0
S002,beta,3,6,3.0
S004,alpha,4,6,4.0
S005,alpha,0,6,2.0
S007,gamma,5,6,6.0
S008,gamma,5,6,5.0
S009,alpha,0,6,0.0
S010,gamma,3,6,4.0
S011,beta,2,6,2.0
S013,gamma,4,6,4.0


In [387]:
avg_attendance_sessions_bycohort = df.groupby('cohort')['attended_sessions'].mean()

In [388]:
print(avg_attendance_sessions_bycohort)

cohort
alpha    3.666667
beta     3.857143
gamma    3.625000
Name: attended_sessions, dtype: float64


In [389]:
 assert set(consistent_cohort['cohort']) == set(avg_attendance_sessions_bycohort.index)

In [390]:
avg_attendance_sessions_bycohort.index

Index(['alpha', 'beta', 'gamma'], dtype='str', name='cohort')

# Task 5: Add a derived field and validate it
Create a new column attendance_ok that is True when attended_sessions is at least expected_sessions, otherwise False. Use a boolean comparison rather than a loop. Then validate the column by confirming that every row in low_attendance has attendance_ok equal to False.



In [391]:
df['attendance_ok'] = df['attended_sessions'] >= df['expected_sessions']

In [392]:
df

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions,adjusted_attendance,attendance_ok
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
S001,alpha,5,6,6.0,False
S002,beta,3,6,3.0,False
S003,beta,6,6,7.0,True
S004,alpha,4,6,4.0,False
S005,alpha,0,6,2.0,False
S006,beta,6,6,6.0,True
S007,gamma,5,6,6.0,False
S008,gamma,5,6,5.0,False
S009,alpha,0,6,0.0,False
S010,gamma,3,6,4.0,False


In [393]:
low_attendance['attendance_ok'] = low_attendance['attended_sessions'] >= low_attendance['expected_sessions']
low_attendance

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions,adjusted_attendance,attendance_ok
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
S001,alpha,5,6,6.0,False
S002,beta,3,6,3.0,False
S004,alpha,4,6,4.0,False
S005,alpha,0,6,2.0,False
S007,gamma,5,6,6.0,False
S008,gamma,5,6,5.0,False
S009,alpha,0,6,0.0,False
S010,gamma,3,6,4.0,False
S011,beta,2,6,2.0,False
S013,gamma,4,6,4.0,False


In [394]:
low_attendance.loc[:, "attendance_ok"] = (
    low_attendance["attended_sessions"] >= low_attendance["expected_sessions"]
)
low_attendance

Unnamed: 0_level_0,cohort,attended_sessions,expected_sessions,adjusted_attendance,attendance_ok
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
S001,alpha,5,6,6.0,False
S002,beta,3,6,3.0,False
S004,alpha,4,6,4.0,False
S005,alpha,0,6,2.0,False
S007,gamma,5,6,6.0,False
S008,gamma,5,6,5.0,False
S009,alpha,0,6,0.0,False
S010,gamma,3,6,4.0,False
S011,beta,2,6,2.0,False
S013,gamma,4,6,4.0,False


In [395]:
assert (~low_attendance["attendance_ok"]).all()


In [396]:
# all(low_attendance['attendance_ok'] == False) we can also use this,but in pandas previous is more preferable