In [None]:
%pip install ipython-sql


In [2]:
# Register the %sql / %%sql magics from the `sql` IPython extension.
%load_ext sql


In [3]:
# Connect the SQL magics to the SQLite database file badge_attendance.db
# (relative path, resolved against the kernel's working directory).
%sql sqlite:///badge_attendance.db

In [4]:
import sqlite3
import random
from datetime import datetime, timedelta
from faker import Faker

# Seed both random sources so the synthetic data is reproducible across kernel
# restarts: `random` drives department/date/time choices, Faker drives the names.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
Faker.seed(RANDOM_SEED)

# Shared Faker instance used by the data-generation cells below.
fake = Faker()


In [5]:
# Open a direct sqlite3 connection to the same database file the %sql magics use.
# It is used for bulk inserts via executemany() in the cells below.
DB_FILE = "badge_attendance.db"
conn = sqlite3.connect(DB_FILE)

# One shared cursor for every Python-side statement in this notebook.
cursor = conn.cursor()


In [26]:

%%sql
/* Student dimension table.
   student_id is an auto-incrementing integer primary key.
   badge_id is declared UNIQUE, so a badge can belong to at most one student row.
   IF NOT EXISTS makes this cell safe to re-run. */
CREATE TABLE IF NOT EXISTS dim_students (
    student_id INTEGER PRIMARY KEY AUTOINCREMENT,
    badge_id TEXT UNIQUE,
    name TEXT,
    department TEXT,
    year_or_sem TEXT
);



 * sqlite:///badge_attendance.db
Done.


[]

In [27]:
%%sql
/* Teacher dimension table.
   teacher_id is an auto-incrementing integer primary key.
   badge_id is declared UNIQUE, so a badge can belong to at most one teacher row.
   IF NOT EXISTS makes this cell safe to re-run. */
CREATE TABLE IF NOT EXISTS dim_teachers (
    teacher_id INTEGER PRIMARY KEY AUTOINCREMENT,
    badge_id TEXT UNIQUE,
    name TEXT,
    department TEXT
);


 * sqlite:///badge_attendance.db
Done.


[]

In [7]:
%%sql
/* Raw badge-swipe events.
   badge_id is plain text with no foreign key, so whether a badge belongs to a
   student or a teacher has to be resolved by joining to the dimension tables.
   The generator cell in this notebook writes event_time as an ISO-8601 string
   and event_type as IN or OUT.
   IF NOT EXISTS matches the other tables and keeps this cell re-runnable. */
CREATE TABLE IF NOT EXISTS src_badge_events (
    event_id INTEGER PRIMARY KEY AUTOINCREMENT,
    badge_id TEXT,
    event_time TEXT,
    event_type TEXT
);



 * sqlite:///badge_attendance.db
Done.


[]

In [29]:
NUM_STUDENTS = 2000

departments = ["Engineering", "Science", "Business", "Arts", "Medicine", "Law"]
years = ["1st Year", "2nd Year", "3rd Year", "4th Year"]

# One (badge_id, name, department, year_or_sem) tuple per student.
# Badge IDs are deterministic (BADGE000000, BADGE000001, ...); the name,
# department and year are drawn at random, in that order, for each student.
students_data = [
    (f"BADGE{i:06d}", fake.name(), random.choice(departments), random.choice(years))
    for i in range(NUM_STUDENTS)
]

# badge_id is UNIQUE, so a plain INSERT raises sqlite3.IntegrityError when this
# cell is run a second time. INSERT OR IGNORE skips badges that already exist.
# `with conn` commits on success and rolls back if the bulk insert fails.
with conn:
    cursor.executemany(
        "INSERT OR IGNORE INTO dim_students (badge_id, name, department, year_or_sem) VALUES (?, ?, ?, ?)",
        students_data
    )

# rowcount is the number of rows actually written (0 on a re-run).
print("Students inserted:", cursor.rowcount)



Students inserted: 2000


In [30]:
NUM_TEACHERS = 150

# One (badge_id, name, department) tuple per teacher.
# Badge IDs are deterministic (TEACHER00000, TEACHER00001, ...); the name and
# department are drawn at random. `departments` is the list defined in the
# student-generation cell, so that cell must run first.
teachers_data = [
    (f"TEACHER{i:05d}", fake.name(), random.choice(departments))
    for i in range(NUM_TEACHERS)
]

# badge_id is UNIQUE, so a plain INSERT raises sqlite3.IntegrityError when this
# cell is run a second time. INSERT OR IGNORE skips badges that already exist.
# `with conn` commits on success and rolls back if the bulk insert fails.
with conn:
    cursor.executemany(
        "INSERT OR IGNORE INTO dim_teachers (badge_id, name, department) VALUES (?, ?, ?)",
        teachers_data
    )

# rowcount is the number of rows actually written (0 on a re-run).
print("Teachers inserted:", cursor.rowcount)



Teachers inserted: 150


In [9]:
# Pull every known badge ID into memory so the event generator can sample from them.
# Each row returned by the query is a 1-tuple holding the badge_id.
student_badges = [
    badge_id for (badge_id,) in cursor.execute("SELECT badge_id FROM dim_students;")
]

teacher_badges = [
    badge_id for (badge_id,) in cursor.execute("SELECT badge_id FROM dim_teachers;")
]



In [10]:
# --- Generation parameters -------------------------------------------------
TARGET_EVENTS = 1_000_000   # total rows to write (events are written in IN/OUT pairs)
BATCH_SIZE = 50_000         # rows buffered in memory before each bulk insert

start_date = datetime(2024, 9, 1)

NUM_DAYS = 220              # events fall on start_date .. start_date + NUM_DAYS - 1
WORK_START_HOUR = 8
WORK_END_HOUR = 17
STUDENT_SHARE = 0.85        # probability that a visit belongs to a student badge

INSERT_EVENT_SQL = (
    "INSERT INTO src_badge_events (badge_id, event_time, event_type) VALUES (?, ?, ?)"
)


def _flush_events(rows):
    """Bulk-insert buffered (badge_id, event_time, event_type) rows and commit."""
    cursor.executemany(INSERT_EVENT_SQL, rows)
    conn.commit()


# random.choice() on an empty list raises a bare IndexError from inside the loop;
# fail up front with a message that says what is actually missing.
if not student_badges or not teacher_badges:
    raise RuntimeError(
        "student_badges and teacher_badges must both be non-empty; "
        "populate dim_students and dim_teachers and reload the badge lists first"
    )

events_data = []
total_events = 0
batch_no = 0

# Each iteration produces one visit: an IN event and a matching OUT event.
# NOTE: this cell appends to src_badge_events, so running it again adds
# another TARGET_EVENTS rows on top of the existing ones.
while total_events + 2 <= TARGET_EVENTS:

    date = start_date + timedelta(days=random.randint(0, NUM_DAYS - 1))

    # Pick a student or teacher badge. Only the badge ID is written to the
    # events table, not which kind of person it belongs to.
    if random.random() < STUDENT_SHARE:
        badge = random.choice(student_badges)
    else:
        badge = random.choice(teacher_badges)

    # Badge-in at a random minute between WORK_START_HOUR:00 and (WORK_END_HOUR - 1):59.
    in_hour = random.randint(WORK_START_HOUR, WORK_END_HOUR - 1)
    in_min = random.randint(0, 59)
    in_time = date.replace(hour=in_hour, minute=in_min, second=0, microsecond=0)

    # Badge-out 1-6 hours later, with the hour capped at WORK_END_HOUR - 1.
    out_hour = min(in_hour + random.randint(1, 6), WORK_END_HOUR - 1)
    out_min = random.randint(0, 59)
    out_time = date.replace(hour=out_hour, minute=out_min, second=0, microsecond=0)

    # The cap can leave OUT at or before IN; fall back to a one-hour visit.
    # NOTE: this fallback can land after WORK_END_HOUR (IN at 16:59 gives OUT at 17:59).
    if out_time <= in_time:
        out_time = in_time + timedelta(hours=1)

    events_data.append((badge, in_time.isoformat(), "IN"))
    events_data.append((badge, out_time.isoformat(), "OUT"))
    total_events += 2

    if len(events_data) >= BATCH_SIZE:
        _flush_events(events_data)

        batch_no += 1
        print(f"Batch {batch_no} inserted — Total {total_events}")
        events_data = []

# Write whatever is left over from the final partial batch.
if events_data:
    _flush_events(events_data)

print("DONE — Final total:", total_events)


Batch 1 inserted — Total 50000
Batch 2 inserted — Total 100000
Batch 3 inserted — Total 150000
Batch 4 inserted — Total 200000
Batch 5 inserted — Total 250000
Batch 6 inserted — Total 300000
Batch 7 inserted — Total 350000
Batch 8 inserted — Total 400000
Batch 9 inserted — Total 450000
Batch 10 inserted — Total 500000
Batch 11 inserted — Total 550000
Batch 12 inserted — Total 600000
Batch 13 inserted — Total 650000
Batch 14 inserted — Total 700000
Batch 15 inserted — Total 750000
Batch 16 inserted — Total 800000
Batch 17 inserted — Total 850000
Batch 18 inserted — Total 900000
Batch 19 inserted — Total 950000
Batch 20 inserted — Total 1000000
DONE — Final total: 1000000


In [11]:
%%sql
/* Sanity check. Total number of rows in the raw events table. */
SELECT COUNT(*) FROM src_badge_events;



 * sqlite:///badge_attendance.db
Done.


COUNT(*)
1000000


In [34]:
%%sql
/* Event counts split by student vs teacher.
   src_badge_events has no person_type column, so the type is derived by looking
   the badge up in the two dimension tables. badge_id is UNIQUE in both, so the
   joins cannot duplicate event rows. Badges found in neither table are reported
   as unknown. */
SELECT
    CASE
        WHEN s.badge_id IS NOT NULL THEN 'student'
        WHEN t.badge_id IS NOT NULL THEN 'teacher'
        ELSE 'unknown'
    END AS person_type,
    COUNT(*)
FROM src_badge_events AS e
LEFT JOIN dim_students AS s ON s.badge_id = e.badge_id
LEFT JOIN dim_teachers AS t ON t.badge_id = e.badge_id
GROUP BY person_type;


 * sqlite:///badge_attendance.db
Done.


person_type,COUNT(*)
student,850056
teacher,149944


In [12]:
%%sql
/* Peek at the first five events.
   ORDER BY event_id makes the sample deterministic, because LIMIT without
   ORDER BY may return any five rows. */
SELECT *
FROM src_badge_events
ORDER BY event_id
LIMIT 5;


 * sqlite:///badge_attendance.db
Done.


event_id,badge_id,event_time,event_type
1,BADGE000124,2024-12-17T11:07:00,IN
2,BADGE000124,2024-12-17T16:41:00,OUT
3,BADGE001627,2025-01-13T10:36:00,IN
4,BADGE001627,2025-01-13T12:46:00,OUT
5,BADGE001075,2025-01-02T11:29:00,IN


In [13]:
%%sql
/* Row count per event_type. The generator writes events in IN and OUT pairs,
   so the two counts are expected to match. */
SELECT event_type, COUNT(*)
FROM src_badge_events
GROUP BY event_type;


 * sqlite:///badge_attendance.db
Done.


event_type,COUNT(*)
IN,500000
OUT,500000


In [14]:
%%sql
/* Spot-check the events of a single teacher badge.
   ORDER BY event_id makes the ten rows shown deterministic, because LIMIT
   without ORDER BY may return any ten matching rows. */
SELECT *
FROM src_badge_events
WHERE badge_id = 'TEACHER00069'
ORDER BY event_id
LIMIT 10;


 * sqlite:///badge_attendance.db
Done.


event_id,badge_id,event_time,event_type
949,TEACHER00069,2025-01-27T08:51:00,IN
950,TEACHER00069,2025-01-27T13:58:00,OUT
1493,TEACHER00069,2025-02-10T12:29:00,IN
1494,TEACHER00069,2025-02-10T16:32:00,OUT
2287,TEACHER00069,2024-10-09T09:46:00,IN
2288,TEACHER00069,2024-10-09T12:08:00,OUT
8319,TEACHER00069,2024-10-02T11:02:00,IN
8320,TEACHER00069,2024-10-02T12:27:00,OUT
9251,TEACHER00069,2025-02-08T10:57:00,IN
9252,TEACHER00069,2025-02-08T12:50:00,OUT


In [15]:
%%sql
/* Events per calendar day for the ten earliest days.
   DATE() extracts the day part of the event_time text value. */
SELECT DATE(event_time) AS day,
       COUNT(*) AS total_events
FROM src_badge_events
GROUP BY day
ORDER BY day
LIMIT 10;


 * sqlite:///badge_attendance.db
Done.


day,total_events
2024-09-01,4394
2024-09-02,4458
2024-09-03,4714
2024-09-04,4556
2024-09-05,4642
2024-09-06,4622
2024-09-07,4428
2024-09-08,4480
2024-09-09,4582
2024-09-10,4532


In [5]:
%%sql
/* Calendar dimension table, one row per date.
   The populating query in this notebook fills date_id with the date as a
   YYYYMMDD integer, full_date with the YYYY-MM-DD text, and is_weekend with
   1 for Saturday and Sunday and 0 otherwise.
   IF NOT EXISTS makes this cell safe to re-run. */
CREATE TABLE IF NOT EXISTS dim_calendar (
    date_id INTEGER PRIMARY KEY,
    full_date TEXT,
    year INTEGER,
    month INTEGER,
    day INTEGER,
    day_name TEXT,
    is_weekend INTEGER
);


 * sqlite:///badge_attendance.db
Done.


[]

In [7]:
%%sql
/* Populate dim_calendar with every date from 2024-01-01 through 2025-12-31.
   The recursive CTE emits one row per day. The SELECT derives the YYYYMMDD
   integer key, the date parts, the weekday name and the weekend flag from it.
   strftime with the w specifier returns 0 for Sunday through 6 for Saturday.
   INSERT OR IGNORE keeps the cell re-runnable, because date_id is the primary
   key and a plain INSERT fails on the second run. The explicit column list
   stops the insert depending on the physical column order of the table. */
WITH RECURSIVE dates AS (
    SELECT DATE('2024-01-01') AS d
    UNION ALL
    SELECT DATE(d, '+1 day')
    FROM dates
    WHERE d < DATE('2025-12-31')
)
INSERT OR IGNORE INTO dim_calendar
    (date_id, full_date, year, month, day, day_name, is_weekend)
SELECT
    CAST(strftime('%Y%m%d', d) AS INTEGER) AS date_id,
    d AS full_date,
    CAST(strftime('%Y', d) AS INTEGER) AS year,
    CAST(strftime('%m', d) AS INTEGER) AS month,
    CAST(strftime('%d', d) AS INTEGER) AS day,
    CASE strftime('%w', d)
        WHEN '0' THEN 'Sunday'
        WHEN '1' THEN 'Monday'
        WHEN '2' THEN 'Tuesday'
        WHEN '3' THEN 'Wednesday'
        WHEN '4' THEN 'Thursday'
        WHEN '5' THEN 'Friday'
        WHEN '6' THEN 'Saturday'
    END AS day_name,
    CASE
        WHEN strftime('%w', d) IN ('0','6') THEN 1
        ELSE 0
    END AS is_weekend
FROM dates;


 * sqlite:///badge_attendance.db
Done.


[]

In [8]:
%%sql
/* Peek at the first ten calendar rows.
   ORDER BY date_id makes the sample deterministic, because LIMIT without
   ORDER BY may return any ten rows. */
SELECT * FROM dim_calendar ORDER BY date_id LIMIT 10;


 * sqlite:///badge_attendance.db
Done.


date_id,full_date,year,month,day,day_name,is_weekend
20240101,2024-01-01,2024,1,1,Monday,0
20240102,2024-01-02,2024,1,2,Tuesday,0
20240103,2024-01-03,2024,1,3,Wednesday,0
20240104,2024-01-04,2024,1,4,Thursday,0
20240105,2024-01-05,2024,1,5,Friday,0
20240106,2024-01-06,2024,1,6,Saturday,1
20240107,2024-01-07,2024,1,7,Sunday,1
20240108,2024-01-08,2024,1,8,Monday,0
20240109,2024-01-09,2024,1,9,Tuesday,0
20240110,2024-01-10,2024,1,10,Wednesday,0


In [9]:
%%sql
/* Sanity check. 2024 is a leap year, so 2024-01-01 through 2025-12-31 should
   give 366 + 365 = 731 rows. */
SELECT COUNT(*) FROM dim_calendar;


 * sqlite:///badge_attendance.db
Done.


COUNT(*)
731


In [10]:
%%sql
/* Number of weekday rows (is_weekend = 0) versus weekend rows (is_weekend = 1). */
SELECT is_weekend, COUNT(*)
FROM dim_calendar
GROUP BY is_weekend;


 * sqlite:///badge_attendance.db
Done.


is_weekend,COUNT(*)
0,523
1,208
