# Badge Attendance Dataset – DataFrame Version

In [1]:
import pandas as pd
import random
from datetime import datetime,timedelta
from faker import Faker

# Shared Faker generator; used further down to produce synthetic student names.
fake = Faker()

In [2]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.1-cp313-cp313-win_amd64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---- ----------------------------------- 1.3/11.0 MB 7.5 MB/s eta 0:00:02
   ----------- ---------------------------- 3.1/11.0 MB 7.8 MB/s eta 0:00:02
   ------------------- -------------------- 5.2/11.0 MB 8.3 MB/s eta 0:00:01
   ------------------------- -------------- 7.1/11.0 MB 8.4 MB/s eta 0:00:01
   --------------------------------- ------ 9.2/11.0 MB 8.7 MB/s eta 0:00:01
   ---------------------------------------  10.7/11.0 MB 8.8 MB/s eta 0:00:01
   ---------------------------------------- 11.0/11.0 MB 8.4 MB/s  0:00:01
Downloading numpy-2.4.1-cp313-cp

In [2]:
# Master data: one row per student. Created empty here, carrying only the schema.
students_df = pd.DataFrame(
    columns=["student_id", "badge_id", "name", "department", "year_or_sem"]
)

# Transactional data: one row per badge event. Also starts out empty.
badge_events_df = pd.DataFrame(
    columns=["event_id", "badge_id", "event_time", "event_type"]
)


In [3]:
# Schema check: at this point both frames are expected to be empty, so the
# output only confirms the column names of each frame.
students_df.head(), badge_events_df.head()


(Empty DataFrame
 Columns: [student_id, badge_id, name, department, year_or_sem]
 Index: [],
 Empty DataFrame
 Columns: [event_id, badge_id, event_time, event_type]
 Index: [])

In [4]:
NUM_STUDENTS = 2000
SEED = 42  # fixed seed so the synthetic students are reproducible across runs

departments = ["Engineering", "Science", "Business", "Arts", "Medicine", "Law"]
years = ["1st Year", "2nd Year", "3rd Year", "4th Year"]

# Seed both sources of randomness used below: the stdlib generator
# (department / year choice) and the Faker instance (names).
random.seed(SEED)
fake.seed_instance(SEED)

# One record per student.
# Note: student_id is 1-based while the badge number is 0-based, so
# student_id 1 carries BADGE000000.
students_data = [
    {
        "student_id": i + 1,
        "badge_id": f"BADGE{i:06d}",
        "name": fake.name(),
        "department": random.choice(departments),
        "year_or_sem": random.choice(years),
    }
    for i in range(NUM_STUDENTS)
]

# Build the frame directly from the records instead of concatenating onto the
# existing students_df. Re-running this cell now replaces the data rather than
# appending another NUM_STUDENTS rows with duplicate IDs, and it avoids
# concatenating with an empty frame (deprecated by pandas for dtype inference).
students_df = pd.DataFrame(
    students_data,
    columns=["student_id", "badge_id", "name", "department", "year_or_sem"],
)


In [5]:
# Preview the first few generated student records.
students_df.head()


Unnamed: 0,student_id,badge_id,name,department,year_or_sem
0,1,BADGE000000,Felicia Merritt,Business,3rd Year
1,2,BADGE000001,Jeffrey Ray,Business,1st Year
2,3,BADGE000002,Lisa Villanueva,Business,2nd Year
3,4,BADGE000003,Jacob Hall,Arts,3rd Year
4,5,BADGE000004,Christopher Mathis,Medicine,4th Year


In [6]:
# Row-count check: expected to equal NUM_STUDENTS (2000) after one run of the
# generation cell.
len(students_df)



2000

In [7]:
from datetime import datetime, timedelta
import random

# Generate synthetic IN/OUT badge events until TARGET_ROWS events exist.
#
# Each pass of the outer loop simulates one randomly chosen day: a random
# subset of students attends, and each attending student gets a random number
# of IN/OUT session pairs within working hours.
#
# NOTE(review): days are drawn with replacement, so the same date can be
# simulated more than once and a student may exceed SESSIONS_PER_DAY_MAX on
# that date. Sessions for one student are drawn independently and may overlap.
# The "+1 hour" fallback for OUT can land after WORK_END_HOUR. Confirm whether
# these are acceptable for the downstream analysis.

# TARGET
TARGET_ROWS = 1_000_000
BATCH_SIZE = 50_000   # upper bound on events buffered in one loop iteration
EVENTS_SEED = 42      # fixed seed so the generated events are reproducible

NUM_DAYS = 220
STUDENTS_PER_DAY_MIN = 800
STUDENTS_PER_DAY_MAX = 1600
SESSIONS_PER_DAY_MIN = 2
SESSIONS_PER_DAY_MAX = 4
WORK_START_HOUR = 8
WORK_END_HOUR = 17

EVENT_COLUMNS = ["event_id", "badge_id", "event_time", "event_type"]

start_date = datetime(2024, 9, 1)

badge_ids = students_df["badge_id"].tolist()

random.seed(EVENTS_SEED)

event_id = 1
total_rows = 0
batch_no = 0

# Per-iteration frames are collected here and concatenated once at the end.
# Growing badge_events_df with pd.concat inside the loop re-copies every
# existing row on each pass, and a re-run of the cell would append onto the
# previous result (duplicate event_ids).
batch_frames = []

while total_rows < TARGET_ROWS:

    events_batch = []

    day = random.randint(0, NUM_DAYS - 1)
    current_date = start_date + timedelta(days=day)

    students_today = random.sample(
        badge_ids,
        random.randint(STUDENTS_PER_DAY_MIN, STUDENTS_PER_DAY_MAX)
    )

    for badge in students_today:

        if total_rows >= TARGET_ROWS:
            break

        sessions = random.randint(
            SESSIONS_PER_DAY_MIN,
            SESSIONS_PER_DAY_MAX
        )

        for _ in range(sessions):

            if total_rows >= TARGET_ROWS:
                break

            # IN: any minute from WORK_START_HOUR:00 to (WORK_END_HOUR - 1):59
            in_hour = random.randint(WORK_START_HOUR, WORK_END_HOUR - 1)
            in_min = random.randint(0, 59)

            in_time = current_date.replace(
                hour=in_hour, minute=in_min,
                second=0, microsecond=0
            )

            events_batch.append({
                "event_id": event_id,
                "badge_id": badge,
                "event_time": in_time,
                "event_type": "IN"
            })
            event_id += 1
            total_rows += 1

            # Only reachable when TARGET_ROWS is odd: stop on an unpaired IN.
            if total_rows >= TARGET_ROWS:
                break

            # OUT: 1-6 hours after the IN hour, capped at the last working hour
            out_hour = min(in_hour + random.randint(1, 6), WORK_END_HOUR - 1)
            out_min = random.randint(0, 59)

            out_time = current_date.replace(
                hour=out_hour, minute=out_min,
                second=0, microsecond=0
            )

            # The cap above can place OUT at or before IN; fall back to a
            # fixed one-hour session in that case.
            if out_time <= in_time:
                out_time = in_time + timedelta(hours=1)

            events_batch.append({
                "event_id": event_id,
                "badge_id": badge,
                "event_time": out_time,
                "event_type": "OUT"
            })
            event_id += 1
            total_rows += 1

            if len(events_batch) >= BATCH_SIZE:
                break

        if len(events_batch) >= BATCH_SIZE:
            break

    batch_frames.append(pd.DataFrame(events_batch, columns=EVENT_COLUMNS))

    batch_no += 1
    print(f"Batch {batch_no} inserted — Total rows: {total_rows}")

# Single concatenation; fall back to an empty, correctly-shaped frame when no
# batch was produced (e.g. TARGET_ROWS == 0), since pd.concat rejects an empty list.
if batch_frames:
    badge_events_df = pd.concat(batch_frames, ignore_index=True)
else:
    badge_events_df = pd.DataFrame(columns=EVENT_COLUMNS)

print("\nDONE — Final rows:", len(badge_events_df))


  badge_events_df = pd.concat(


Batch 1 inserted — Total rows: 6260
Batch 2 inserted — Total rows: 12286
Batch 3 inserted — Total rows: 18896
Batch 4 inserted — Total rows: 27718
Batch 5 inserted — Total rows: 33970
Batch 6 inserted — Total rows: 39318
Batch 7 inserted — Total rows: 48018
Batch 8 inserted — Total rows: 55498
Batch 9 inserted — Total rows: 64922
Batch 10 inserted — Total rows: 72936
Batch 11 inserted — Total rows: 77824
Batch 12 inserted — Total rows: 83788
Batch 13 inserted — Total rows: 93184
Batch 14 inserted — Total rows: 98278
Batch 15 inserted — Total rows: 103216
Batch 16 inserted — Total rows: 110908
Batch 17 inserted — Total rows: 120358
Batch 18 inserted — Total rows: 125764
Batch 19 inserted — Total rows: 134354
Batch 20 inserted — Total rows: 140180
Batch 21 inserted — Total rows: 148560
Batch 22 inserted — Total rows: 154772
Batch 23 inserted — Total rows: 163584
Batch 24 inserted — Total rows: 169406
Batch 25 inserted — Total rows: 177140
Batch 26 inserted — Total rows: 185730
Batch 27 i

In [8]:
# Preview the first few generated badge events.
badge_events_df.head()

Unnamed: 0,event_id,badge_id,event_time,event_type
0,1,BADGE001633,2024-11-07 10:44:00,IN
1,2,BADGE001633,2024-11-07 15:52:00,OUT
2,3,BADGE001633,2024-11-07 09:00:00,IN
3,4,BADGE001633,2024-11-07 14:08:00,OUT
4,5,BADGE001633,2024-11-07 14:41:00,IN


In [9]:
# Count events per type. IN and OUT are generated in pairs, so the two counts
# should match when TARGET_ROWS is even.
badge_events_df["event_type"].value_counts()


event_type
IN     500000
OUT    500000
Name: count, dtype: int64

In [10]:
# Spot-check the events recorded for a single badge (BADGE000123 as a sample).
badge_events_df.loc[badge_events_df["badge_id"].eq("BADGE000123")].head()


Unnamed: 0,event_id,badge_id,event_time,event_type
3320,3321,BADGE000123,2024-11-07 16:08:00,IN
3321,3322,BADGE000123,2024-11-07 17:08:00,OUT
3322,3323,BADGE000123,2024-11-07 08:20:00,IN
3323,3324,BADGE000123,2024-11-07 09:48:00,OUT
3324,3325,BADGE000123,2024-11-07 15:22:00,IN


In [11]:
# Preview only the entry ("IN") events.
badge_events_df.loc[badge_events_df["event_type"].eq("IN")].head()


Unnamed: 0,event_id,badge_id,event_time,event_type
0,1,BADGE001633,2024-11-07 10:44:00,IN
2,3,BADGE001633,2024-11-07 09:00:00,IN
4,5,BADGE001633,2024-11-07 14:41:00,IN
6,7,BADGE001633,2024-11-07 08:50:00,IN
8,9,BADGE001812,2024-11-07 15:06:00,IN
