# Badge Attendance Dataset – DataFrame Version

In [None]:
import pandas as pd
import random
from datetime import datetime,timedelta
from faker import Faker

# Shared Faker instance; later cells call fake.name() to produce
# synthetic student names.
fake=Faker()

In [None]:
%pip install pandas

In [None]:
# Empty schema placeholders: each frame starts with its column layout
# only and zero rows.

# Student roster columns (one row per student).
student_columns = ["student_id", "badge_id", "name", "department", "year_or_sem"]
students_df = pd.DataFrame(columns=student_columns)

# Badge swipe log columns (one row per IN/OUT event).
badge_event_columns = ["event_id", "badge_id", "event_time", "event_type"]
badge_events_df = pd.DataFrame(columns=badge_event_columns)


In [None]:
# Preview both frames as a tuple. When the cells are run in order, neither
# frame has been populated yet, so only the column headers are shown.
students_df.head(), badge_events_df.head()


In [None]:
# Number of synthetic students to generate.
NUM_STUDENTS = 2000

# Value pools sampled uniformly at random for each student.
departments = ["Engineering", "Science", "Business", "Arts", "Medicine", "Law"]
years = ["1st Year", "2nd Year", "3rd Year", "4th Year"]

# One record per student. student_id is 1-based, while the badge number is
# derived from the 0-based index (student 1 carries BADGE000000).
students_data = [
    {
        "student_id": i + 1,
        "badge_id": f"BADGE{i:06d}",
        "name": fake.name(),
        "department": random.choice(departments),
        "year_or_sem": random.choice(years),
    }
    for i in range(NUM_STUDENTS)
]

# Build the frame directly from the records instead of concatenating onto
# the empty placeholder frame:
#   * concatenating with an empty, object-dtype frame is deprecated in
#     pandas >= 2.1 and makes the resulting column dtypes version-dependent;
#   * re-running this cell now regenerates the roster instead of appending
#     a second copy of every student.
# The explicit column list keeps the schema stable even if NUM_STUDENTS is 0.
students_df = pd.DataFrame(
    students_data,
    columns=["student_id", "badge_id", "name", "department", "year_or_sem"],
)


In [None]:
# Show the first five rows of the student roster as a sanity check.
students_df.head()


In [None]:
# Number of rows in the roster; expected to equal NUM_STUDENTS when the
# generation cell has been run exactly once.
len(students_df)



In [None]:
from datetime import datetime, timedelta
import random

# ---------------------------------------------------------------------------
# Generate synthetic badge IN/OUT events for the students in `students_df`.
#
# Each pass of the outer loop simulates one randomly chosen day: a random
# subset of badges attends, and every attending badge gets a few IN/OUT
# sessions within working hours. Passes repeat until TARGET_ROWS events exist.
#
# The result replaces `badge_events_df`, so re-running this cell regenerates
# the data set instead of appending duplicates with repeated event_ids.
# ---------------------------------------------------------------------------

# TARGET
TARGET_ROWS = 1_000_000
BATCH_SIZE = 50_000   # upper bound on events buffered in a single batch

NUM_DAYS = 220
SESSIONS_PER_DAY_MIN = 2
SESSIONS_PER_DAY_MAX = 4
WORK_START_HOUR = 8
WORK_END_HOUR = 17

# Bounds on how many distinct badges attend on a simulated day.
MIN_STUDENTS_PER_DAY = 800
MAX_STUDENTS_PER_DAY = 1600

EVENT_COLUMNS = ["event_id", "badge_id", "event_time", "event_type"]

start_date = datetime(2024, 9, 1)

badge_ids = students_df["badge_id"].tolist()

# Without badges no event can ever be produced, so the loop below would
# spin forever; fail fast with a clear message instead.
if TARGET_ROWS > 0 and not badge_ids:
    raise ValueError("students_df has no badge_id values; cannot generate events")

# random.sample() raises if asked for more items than the population holds,
# so clamp the attendance bounds to the roster size.
attend_min = min(MIN_STUDENTS_PER_DAY, len(badge_ids))
attend_max = min(MAX_STUDENTS_PER_DAY, len(badge_ids))

event_id = 1
total_rows = 0
batch_no = 0

# Per-batch frames are collected here and concatenated once at the end.
# Concatenating onto a growing frame inside the loop re-copies every
# previously generated row on each pass.
batch_frames = []

while total_rows < TARGET_ROWS:

    events_batch = []

    # Pick the simulated day and the badges that attend on it.
    day = random.randint(0, NUM_DAYS - 1)
    current_date = start_date + timedelta(days=day)

    students_today = random.sample(
        badge_ids,
        random.randint(attend_min, attend_max)
    )

    for badge in students_today:

        if total_rows >= TARGET_ROWS:
            break

        sessions = random.randint(
            SESSIONS_PER_DAY_MIN,
            SESSIONS_PER_DAY_MAX
        )

        for _ in range(sessions):

            if total_rows >= TARGET_ROWS:
                break

            # IN: any minute from WORK_START_HOUR:00 to (WORK_END_HOUR - 1):59.
            in_hour = random.randint(WORK_START_HOUR, WORK_END_HOUR - 1)
            in_min = random.randint(0, 59)

            in_time = current_date.replace(
                hour=in_hour, minute=in_min,
                second=0, microsecond=0
            )

            events_batch.append({
                "event_id": event_id,
                "badge_id": badge,
                "event_time": in_time,
                "event_type": "IN"
            })
            event_id += 1
            total_rows += 1

            # Stop right after the IN if it was the final requested row.
            if total_rows >= TARGET_ROWS:
                break

            # OUT: 1-6 hours after the IN hour, capped at the last working hour.
            out_hour = min(in_hour + random.randint(1, 6), WORK_END_HOUR - 1)
            out_min = random.randint(0, 59)

            out_time = current_date.replace(
                hour=out_hour, minute=out_min,
                second=0, microsecond=0
            )

            # The hour cap can place OUT at or before IN; in that case force
            # OUT to exactly one hour after IN (this may pass WORK_END_HOUR).
            if out_time <= in_time:
                out_time = in_time + timedelta(hours=1)

            events_batch.append({
                "event_id": event_id,
                "badge_id": badge,
                "event_time": out_time,
                "event_type": "OUT"
            })
            event_id += 1
            total_rows += 1

            if len(events_batch) >= BATCH_SIZE:
                break

        if len(events_batch) >= BATCH_SIZE:
            break

    batch_frames.append(pd.DataFrame(events_batch, columns=EVENT_COLUMNS))

    batch_no += 1
    print(f"Batch {batch_no} inserted — Total rows: {total_rows}")

# Single concatenation of all batches. Fall back to an empty frame with the
# expected schema when nothing was generated (TARGET_ROWS <= 0).
if batch_frames:
    badge_events_df = pd.concat(batch_frames, ignore_index=True)
else:
    badge_events_df = pd.DataFrame(columns=EVENT_COLUMNS)

print("\nDONE — Final rows:", len(badge_events_df))


In [None]:
# Show the first five generated badge events.
badge_events_df.head()

In [None]:
# Count events per event_type value (the generator emits "IN" and "OUT").
badge_events_df["event_type"].value_counts()


In [None]:
# First five events recorded for one sample badge.
badge_events_df.loc[badge_events_df["badge_id"].eq("BADGE000123")].head()


In [None]:
# First five check-in ("IN") events.
badge_events_df.loc[badge_events_df["event_type"].eq("IN")].head()
