In [28]:
import pandas as pd
from pathlib import Path
import itertools
import random
from mlxtend.frequent_patterns import apriori, association_rules

In [29]:
lessons_required = pd.read_csv("data/bronze/bronze_lessons_required.csv")

student_subject = pd.read_csv("data/bronze/bronze_student_subject.csv")
student_grade = pd.read_csv("data/bronze/bronze_student_grade.csv")

teacher_subject = pd.read_csv("data/bronze/bronze_teacher_subject.csv")
teacher_grade = pd.read_csv("data/bronze/bronze_teacher_grade.csv")

period_interval = pd.read_csv("data/bronze/bronze_period_interval.csv")
weekdays = pd.read_csv("data/raw/weekdays.csv")

grades = pd.read_csv("data/bronze/bronze_grades.csv")

In [30]:
days = weekdays['weekday'].tolist()
slots = period_interval.index.tolist()  # 0-8

all_slots = list(itertools.product(days, slots))  # total 45 slots

In [31]:
lessons_required = (
    lessons_required.groupby(['num_lessons', 'grade'])['subject']
    .agg([
        ('subject', lambda x: ', '.join(sorted(x.unique())))
    ])
    .reset_index()
)

In [32]:
# Group subjects for each student into a single string + count
student_subject_list = (
    student_subject.groupby('student_id')['subject']
    .agg([
        ('subjects', lambda x: ', '.join(sorted(x.unique()))),
        ('total_subjects', lambda x: x.nunique())
    ])
    .reset_index()
)

student_subject_combination = student_subject_list[['subjects']].drop_duplicates()
# Define categories
language_set = {'afrikaans', 'english', 'isizulu'}
math_set = {'math', 'maths-lit'}

# Function to classify subjects
def split_subjects(subj_str):
    subj_list = [s.strip() for s in subj_str.split(',')]
    languages = [s for s in subj_list if s in language_set]
    maths = [s for s in subj_list if s in math_set]
    others = [s for s in subj_list if s not in language_set and s not in math_set]
    return pd.Series({
        'languages': ', '.join(languages),
        'maths': ', '.join(maths),
        'others': ', '.join(others)
    })

# Apply to the DataFrame
student_subject_combination[['languages', 'maths', 'others']] = student_subject_combination['subjects'].apply(split_subjects)
student_subject_combination = student_subject_combination.drop_duplicates().reset_index(drop=True)

In [34]:
student_subject_list

Unnamed: 0,student_id,subjects,total_subjects
0,2001,"accn, afrikaans, math, phsc",4
1,2002,"accn, afrikaans, bstd, econ, english, lo, math",7
2,2003,"accn, afrikaans, bstd, econ, english, lo, math...",7
3,2004,"accn, bstd, econ, math",4
4,2005,"accn, bstd, math",3
...,...,...,...
456,2457,"maths-lit, tourism",2
457,2458,tourism,1
458,2459,"math, tourism",2
459,2460,vsla,1


In [6]:
# Explode lessons_required subjects
lessons_required_exp = lessons_required.assign(
    subject=lessons_required['subject'].str.split(', ')
).explode('subject')

# Merge with student_subject to get student grade + lessons
student_lessons = student_subject.merge(
    student_grade, on='student_id', how='left'
).merge(
    lessons_required_exp, left_on=['grade', 'subject'], right_on=['grade','subject'], how='left'
)

# Merge with teacher_subject to get teacher allocation
student_lessons = student_lessons.merge(
    teacher_subject, on='subject', how='left'
)

In [7]:
# Drop subjects that don't have lesson requirements
student_lessons = student_lessons.dropna(subset=['num_lessons'])

# Ensure integer type
student_lessons['num_lessons'] = student_lessons['num_lessons'].astype(int)

# Expand num_lessons for easier scheduling
student_lessons_expanded = student_lessons.loc[
    student_lessons.index.repeat(student_lessons['num_lessons'])
].copy()

student_lessons_expanded['lesson_id'] = range(len(student_lessons_expanded))

In [9]:
grade_timetables = {}

for grade in grades['grade']:
    # Empty timetable: periods as rows, days as columns
    timetable = pd.DataFrame(index=slots, columns=days)
    grade_timetables[grade] = timetable

In [20]:
# Track teacher assignments to avoid clashes
teacher_schedule = {t: set() for t in teacher_subject['teacher_id'].unique()}

for grade in grades['grade']:
    lessons = student_lessons_expanded[student_lessons_expanded['grade']==grade].copy()
    timetable = grade_timetables[grade]

    for _, row in lessons.iterrows():
        subject = row['subject']
        teacher = row['teacher_id']

        # Shuffle slots for randomness
        random.shuffle(all_slots)
        for day, slot in all_slots:
            # Check if slot is free for teacher & timetable
            if timetable.loc[slot, day] is pd.NA or pd.isna(timetable.loc[slot, day]) or timetable.loc[slot, day] is None:
                if (day, slot) not in teacher_schedule[teacher]:
                    # Assign
                    timetable.loc[slot, day] = subject
                    teacher_schedule[teacher].add((day, slot))
                    break

    grade_timetables[grade] = timetable

In [23]:
teacher_timetables = {t: pd.DataFrame(index=slots, columns=days) for t in teacher_subject['teacher_id'].unique()}

for teacher, lessons in teacher_schedule.items():
    for day, slot in lessons:
        # Find the subject assigned
        for grade, timetable in grade_timetables.items():
            if timetable.loc[slot, day] is not None and teacher_subject[teacher_subject['teacher_id']==teacher]['subject'].values[0] == timetable.loc[slot, day]:
                teacher_timetables[teacher].loc[slot, day] = timetable.loc[slot, day]

In [22]:
grade_timetables[11]

Unnamed: 0,MON,TUE,WED,THU,FRI
0,math,accn,bstd,accn,accn
1,phsc,phsc,accn,phsc,accn
2,phsc,math,accn,math,lo
3,math,accn,math,phsc,math
4,phsc,math,math,phsc,math
5,phsc,bstd,lo,math,lo
6,accn,accn,bstd,accn,accn
7,bstd,math,phsc,math,math
8,accn,phsc,phsc,phsc,lo


In [26]:
import pandas as pd
import itertools
import random

# Step 1: Generate all slots with labels
days = ['Mon','Tue','Wed','Thu','Fri']
slots = period_interval.reset_index()

all_slots = []
for day in days:
    for i, row in slots.iterrows():
        timeslot_label = f"{day}-{row['start_time']}-{row['end_time']}"
        all_slots.append(timeslot_label)

# Step 2: Prepare lessons (drop missing lesson counts)
student_lessons = student_lessons.dropna(subset=['num_lessons'])
student_lessons['num_lessons'] = student_lessons['num_lessons'].astype(int)

student_lessons_expanded = student_lessons.loc[
    student_lessons.index.repeat(student_lessons['num_lessons'])
].copy()
student_lessons_expanded['lesson_id'] = range(len(student_lessons_expanded))

# Step 3: Allocate lessons greedily
teacher_schedule = {t: set() for t in teacher_subject['teacher_id'].unique()}
allocations = []  # final long-format result

for grade in sorted(student_lessons['grade'].unique()):
    lessons = student_lessons_expanded[student_lessons_expanded['grade']==grade].copy()
    
    for _, row in lessons.iterrows():
        subject = row['subject']
        teacher = row['teacher_id']
        class_name = f"Grade {grade} - {subject}"
        
        random.shuffle(all_slots)
        for ts in all_slots:
            if ts not in teacher_schedule[teacher]:  # teacher is free
                allocations.append((ts, class_name))
                teacher_schedule[teacher].add(ts)
                break

# Step 4: Save as CSV
allocations_df = pd.DataFrame(allocations, columns=['timeslot','class'])
allocations_df.to_csv("timetable.csv", index=False)
print("✅ Timetable saved as timetable.csv")


✅ Timetable saved as timetable.csv


In [27]:
allocations_df

Unnamed: 0,timeslot,class
0,Wed-11:55:00-12:40:00,Grade 10 - accn
1,Tue-12:40:00-13:25:00,Grade 10 - accn
2,Wed-14:10:00-14:50:00,Grade 10 - accn
3,Thu-07:45:00-08:30:00,Grade 10 - accn
4,Wed-10:00:00-10:45:00,Grade 10 - accn
...,...,...
1615,Fri-14:10:00-14:50:00,Grade 12 - geog
1616,Mon-12:40:00-13:25:00,Grade 12 - geog
1617,Mon-07:45:00-08:30:00,Grade 12 - geog
1618,Tue-09:15:00-10:00:00,Grade 12 - geog
