# Badge Attendance Dataset – DataFrame Version

In [2]:
import pandas as pd
import random
from datetime import datetime,timedelta
from faker import Faker

# Seed every random source used by this notebook so that a fresh
# "Restart Kernel -> Run All" regenerates exactly the same dataset.
# `random` drives the department/year/badge/time choices; Faker drives names.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Shared Faker instance used to produce synthetic person names.
fake = Faker()

In [None]:
%pip install pandas

In [3]:
# Column layouts of the three tables built in this notebook.
STUDENT_COLUMNS = ["student_id", "badge_id", "name", "department", "year_or_sem"]
TEACHER_COLUMNS = ["teacher_id", "badge_id", "name", "department"]
BADGE_EVENT_COLUMNS = ["badge_id", "person_type", "event_time", "event_type"]

# Empty, schema-only frames: zero rows, just the column names above.
dim_students_df = pd.DataFrame(columns=STUDENT_COLUMNS)
dim_teachers_df = pd.DataFrame(columns=TEACHER_COLUMNS)
src_badge_events_df = pd.DataFrame(columns=BADGE_EVENT_COLUMNS)


In [4]:
NUM_STUDENTS = 2000
departments = [
    "Engineering",
    "Science",
    "Business",
    "Arts",
    "Medicine",
    "Law",
]
years = [
    "1st Year",
    "2nd Year",
    "3rd Year",
    "4th Year",
]

# One record per student. `student_id` is 1-based, while the numeric part of
# `badge_id` uses the 0-based position (student 1 carries BADGE000000).
# Name, department and year are drawn at random for every record.
students_data = [
    {
        "student_id": idx + 1,
        "badge_id": f"BADGE{idx:06d}",
        "name": fake.name(),
        "department": random.choice(departments),
        "year_or_sem": random.choice(years),
    }
    for idx in range(NUM_STUDENTS)
]

# Build the student dimension table from the records in a single step.
dim_students_df = pd.DataFrame(data=students_data)

print("Students:", len(dim_students_df.index))


Students: 2000


In [5]:
NUM_TEACHERS = 150

# One record per teacher. `teacher_id` is 1-based, while the numeric part of
# `badge_id` uses the 0-based position (teacher 1 carries TEACHER00000).
# Departments are drawn from the same `departments` list used for students.
teachers_data = [
    {
        "teacher_id": idx + 1,
        "badge_id": f"TEACHER{idx:05d}",
        "name": fake.name(),
        "department": random.choice(departments),
    }
    for idx in range(NUM_TEACHERS)
]

# Build the teacher dimension table from the records in a single step.
dim_teachers_df = pd.DataFrame(data=teachers_data)

print("Teachers:", len(dim_teachers_df.index))



Teachers: 150


In [6]:
# Plain Python lists of badge ids, one list per population, taken from the
# `badge_id` column of each dimension table.
student_badges, teacher_badges = (
    frame["badge_id"].tolist()
    for frame in (dim_students_df, dim_teachers_df)
)




In [7]:
TARGET_EVENTS = 1_000_000
events = []

start_date = datetime(2024, 9, 1)
NUM_DAYS = 220
WORK_START_HOUR = 8
WORK_END_HOUR = 17

# Every pass emits one IN/OUT pair, so TARGET_EVENTS // 2 visits are generated
# (the list never grows past TARGET_EVENTS).
for pair_no in range(TARGET_EVENTS // 2):
    # Random calendar day inside the NUM_DAYS window; `start_date` is at
    # midnight, so `day` is midnight as well.
    day = start_date + timedelta(days=random.randint(0, NUM_DAYS - 1))

    # About 85% of visits are assigned to a student badge, the rest to a
    # teacher badge.
    is_student = random.random() < 0.85
    person_type = "student" if is_student else "teacher"
    badge = random.choice(student_badges if is_student else teacher_badges)

    # Entry time: any minute between WORK_START_HOUR:00 and
    # (WORK_END_HOUR - 1):59.
    in_hour = random.randint(WORK_START_HOUR, WORK_END_HOUR - 1)
    in_time = day + timedelta(hours=in_hour, minutes=random.randint(0, 59))

    # Exit time: 1-6 hours after the entry hour, with the hour capped at
    # WORK_END_HOUR - 1 and an independently drawn minute.
    out_hour = min(in_hour + random.randint(1, 6), WORK_END_HOUR - 1)
    out_time = day + timedelta(hours=out_hour, minutes=random.randint(0, 59))

    # The cap can place the exit at or before the entry; in that case fall
    # back to a fixed one-hour stay.
    if out_time <= in_time:
        out_time = in_time + timedelta(hours=1)

    # Fields common to both rows of the pair.
    shared = {"badge_id": badge, "person_type": person_type}
    events.extend([
        {**shared, "event_time": in_time, "event_type": "IN"},
        {**shared, "event_time": out_time, "event_type": "OUT"},
    ])


In [8]:
# Turn the accumulated event records into the source events table in one step.
src_badge_events_df = pd.DataFrame(data=events)

print("Total events:", len(src_badge_events_df.index))


Total events: 1000000


In [9]:
# Row count per event type; events are generated in IN/OUT pairs, so the two
# counts should match.
event_type_col = src_badge_events_df["event_type"]
event_type_col.value_counts()


event_type
IN     500000
OUT    500000
Name: count, dtype: int64

In [13]:
# Preview the first few events recorded for one specific student badge.
is_sample_badge = src_badge_events_df["badge_id"] == "BADGE000123"
src_badge_events_df.loc[is_sample_badge].head()


Unnamed: 0,badge_id,person_type,event_time,event_type
2718,BADGE000123,student,2024-11-05 09:12:00,IN
2719,BADGE000123,student,2024-11-05 15:45:00,OUT
5514,BADGE000123,student,2024-09-29 12:38:00,IN
5515,BADGE000123,student,2024-09-29 15:40:00,OUT
8488,BADGE000123,student,2025-03-25 09:59:00,IN


In [14]:
# Preview the first few events that belong to teachers.
is_teacher_event = src_badge_events_df["person_type"] == "teacher"
src_badge_events_df.loc[is_teacher_event].head()


Unnamed: 0,badge_id,person_type,event_time,event_type
12,TEACHER00064,teacher,2025-03-05 14:48:00,IN
13,TEACHER00064,teacher,2025-03-05 16:31:00,OUT
20,TEACHER00063,teacher,2024-11-18 14:18:00,IN
21,TEACHER00063,teacher,2024-11-18 16:12:00,OUT
34,TEACHER00128,teacher,2025-01-02 14:46:00,IN


In [15]:
# Number of event rows per person type (student vs. teacher).
events_by_person_type = src_badge_events_df.groupby("person_type")
events_by_person_type.size()


person_type
student    849886
teacher    150114
dtype: int64