In [1]:
import pandas as pd
import numpy as np
import holidays
import mysql.connector
import os
import random
from faker import Faker
from faker_education import SchoolProvider
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
Faker.seed(1607)
random.seed(1607)
fake = Faker('pt_BR')
fake.add_provider(SchoolProvider)

In [3]:
# create a date dimension table and populate it 20 years ago to today
date_dim_columns = ['date', 'year', 'month', 'day', 'weekday', 'day_of_year', 'is_holiday', 'is_weekend']
date_dim_df = pd.DataFrame(columns=date_dim_columns)
date_dim_df['date'] = pd.date_range(start='20020101', end='20221231')
date_dim_df['year'] = date_dim_df['date'].dt.year
date_dim_df['month'] = date_dim_df['date'].dt.month
date_dim_df['day'] = date_dim_df['date'].dt.day
date_dim_df['weekday'] = date_dim_df['date'].dt.weekday
date_dim_df['day_of_year'] = date_dim_df['date'].dt.dayofyear
date_dim_df['is_holiday'] = date_dim_df['date'].apply(lambda x: x in holidays.BR(state='SP', years=x.year))
date_dim_df['is_weekend'] = date_dim_df['date'].dt.weekday.isin([5, 6])
# date_dim_df.to_csv('date_dim.csv', index=False)
date_dim_df.head()

Unnamed: 0,date,year,month,day,weekday,day_of_year,is_holiday,is_weekend
0,2002-01-01,2002,1,1,1,1,True,False
1,2002-01-02,2002,1,2,2,2,False,False
2,2002-01-03,2002,1,3,3,3,False,False
3,2002-01-04,2002,1,4,4,4,False,False
4,2002-01-05,2002,1,5,5,5,False,True


In [4]:
cnx = mysql.connector.connect(user=os.environ['MYSQL_USER'],
                            password=os.environ['MYSQL_PASSWORD'],
                            host=os.environ['MYSQL_HOST'],
                            database=os.environ['MYSQL_DATABASE'],
                            auth_plugin='mysql_native_password')

In [5]:
# create table with the same structure as the df date_dim_df, with date_id auto incremented, and insert data from the df date_dim_df
cursor = cnx.cursor()
cursor.execute('DROP TABLE IF EXISTS date_dim')
cursor.execute('CREATE TABLE date_dim (date_id INT AUTO_INCREMENT PRIMARY KEY, date DATE, year INT, month INT, day INT, weekday INT, day_of_year INT, is_holiday BOOLEAN, is_weekend BOOLEAN)')
cursor.executemany('INSERT INTO date_dim (date, year, month, day, weekday, day_of_year, is_holiday, is_weekend) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)', date_dim_df.values.tolist())
cnx.commit()
cursor.close()

True

In [6]:
# check if the table was created and populated correctly, print the first 10 rows
cursor = cnx.cursor()
cursor.execute('SELECT * FROM date_dim LIMIT 10')
for row in cursor:
    print(row)
cursor.close()

(1, datetime.date(2002, 1, 1), 2002, 1, 1, 1, 1, 1, 0)
(2, datetime.date(2002, 1, 2), 2002, 1, 2, 2, 2, 0, 0)
(3, datetime.date(2002, 1, 3), 2002, 1, 3, 3, 3, 0, 0)
(4, datetime.date(2002, 1, 4), 2002, 1, 4, 4, 4, 0, 0)
(5, datetime.date(2002, 1, 5), 2002, 1, 5, 5, 5, 0, 1)
(6, datetime.date(2002, 1, 6), 2002, 1, 6, 6, 6, 0, 1)
(7, datetime.date(2002, 1, 7), 2002, 1, 7, 0, 7, 0, 0)
(8, datetime.date(2002, 1, 8), 2002, 1, 8, 1, 8, 0, 0)
(9, datetime.date(2002, 1, 9), 2002, 1, 9, 2, 9, 0, 0)
(10, datetime.date(2002, 1, 10), 2002, 1, 10, 3, 10, 0, 0)


True

In [7]:
# we are modelling a network of 1000 schools from the Alan Turing Group, each with a different number of students, a different location, and different teachers
SCHOOLS = 1000
school_dim_columns = ['school_name', 'school_district', 'school_level', 'school_state', 'max_students']
school_dim_df = pd.DataFrame(columns=school_dim_columns)
schools = [fake.school_object() for _ in range(SCHOOLS)]
school_dim_df['school_name'] = [school['school'] for school in schools]
school_dim_df['max_students'] = [random.randint(10, 100) * 10 for _ in range(SCHOOLS)]
school_dim_df['school_district'] = [school['district'] for school in schools]
school_dim_df['school_level'] = [random.choice(['Elementary', 'Middle', 'High']) for school in schools]
school_dim_df['school_state'] = [school['state'] for school in schools]
# school_dim_df.to_csv('school_dim.csv', index=False)
school_dim_df.head()

Unnamed: 0,school_name,school_district,school_level,school_state,max_students
0,FERNANDINA BEACH HIGH SCHOOL,NASSAU,High,FL,890
1,H D STAPLES EL,JOSHUA ISD,Middle,TX,990
2,Sacopee Valley High Sch,RSU 55/MSAD 55,Middle,ME,260
3,James Island Charter High,Charleston 01,Elementary,SC,520
4,Chatham Correspondence,Chatham School District,Middle,AK,290


In [8]:
# insert the data from the df school_dim_df into the table school_dim
cursor = cnx.cursor()
cursor.execute('DROP TABLE IF EXISTS school_dim')
cursor.execute('CREATE TABLE school_dim (school_id INT AUTO_INCREMENT PRIMARY KEY, school_name VARCHAR(255), school_district VARCHAR(255), school_level VARCHAR(255), school_state VARCHAR(255), max_students INT)')
cursor.executemany('INSERT INTO school_dim (school_name, school_district, school_level, school_state, max_students) VALUES (%s, %s, %s, %s, %s)', school_dim_df.values.tolist())
cnx.commit()

In [9]:
# check if the table was created and populated correctly, print the first 10 rows
cursor = cnx.cursor()
cursor.execute('SELECT * FROM school_dim LIMIT 10')
for row in cursor:
    print(row)
cursor.close()


(1, 'FERNANDINA BEACH HIGH SCHOOL', 'NASSAU', 'High', 'FL', 890)
(2, 'H D STAPLES EL', 'JOSHUA ISD', 'Middle', 'TX', 990)
(3, 'Sacopee Valley High Sch', 'RSU 55/MSAD 55', 'Middle', 'ME', 260)
(4, 'James Island Charter High', 'Charleston 01', 'Elementary', 'SC', 520)
(5, 'Chatham Correspondence', 'Chatham School District', 'Middle', 'AK', 290)
(6, 'Wilkins Elementary', 'Platteville School District', 'High', 'WI', 180)
(7, 'FRONTIER ELEMENTARY SCHOOL', 'JOINT SCHOOL DISTRICT NO. 2', 'Middle', 'ID', 460)
(8, 'Downeast School', 'Bangor Public Schools', 'Elementary', 'ME', 240)
(9, 'Northside Elementary School', 'Coweta County', 'High', 'GA', 350)
(10, 'Hollywood Elementary', "St. Mary's County Public Schools", 'Middle', 'MD', 380)


True

In [10]:
# Math,  'Science', 'History', 'English', 'Art', 'Music', 'PE', 'Social Studies', 'Foreign Language', 'Technology'
school_subjects = [
    {'school_subject':'Math',
    'classes_per_week': 5,
    'code': 'MTH'},
    {'school_subject':'Science',
    'classes_per_week': 5,
    'code': 'SCI'},
    {'school_subject':'History',
    'classes_per_week': 5,
    'code': 'HIS'},
    {'school_subject':'English',
    'classes_per_week': 5,
    'code': 'ENG'},
    {'school_subject':'Art',
    'classes_per_week': 2,
    'code': 'ART'},
    {'school_subject':'Music',
    'classes_per_week': 2,
    'code': 'MUS'},
    {'school_subject':'PE',
    'classes_per_week': 3,
    'code': 'PE'},
    {'school_subject':'Social Studies',
    'classes_per_week': 4,
    'code': 'SOC'},
    {'school_subject':'Foreign Language',
    'classes_per_week': 3,
    'code': 'FL'},
    {'school_subject':'Technology',
    'classes_per_week': 2,
    'code': 'TEC'}
]
school_subject_dim_df = pd.DataFrame(data=school_subjects)
# school_subject_dim_df.to_csv('school_subject_dim.csv', index=False)
school_subject_dim_df

Unnamed: 0,school_subject,classes_per_week,code
0,Math,5,MTH
1,Science,5,SCI
2,History,5,HIS
3,English,5,ENG
4,Art,2,ART
5,Music,2,MUS
6,PE,3,PE
7,Social Studies,4,SOC
8,Foreign Language,3,FL
9,Technology,2,TEC


In [11]:
# insert the data from the df school_subject_dim_df into the table school_subject_dim
cursor = cnx.cursor()
cursor.execute('DROP TABLE IF EXISTS school_subject_dim')
cursor.execute('CREATE TABLE school_subject_dim (school_subject_id INT AUTO_INCREMENT PRIMARY KEY, school_subject VARCHAR(255), classes_per_week INT, code CHAR(3))')
cursor.executemany('INSERT INTO school_subject_dim (school_subject, classes_per_week, code) VALUES (%s, %s, %s)', school_subject_dim_df.values.tolist())
cnx.commit()

In [12]:
# check if the table was created and populated correctly, print the first 10 rows
cursor = cnx.cursor()
cursor.execute('SELECT * FROM school_subject_dim LIMIT 10')
for row in cursor:
    print(row)
cursor.close()

(1, 'Math', 5, 'MTH')
(2, 'Science', 5, 'SCI')
(3, 'History', 5, 'HIS')
(4, 'English', 5, 'ENG')
(5, 'Art', 2, 'ART')
(6, 'Music', 2, 'MUS')
(7, 'PE', 3, 'PE')
(8, 'Social Studies', 4, 'SOC')
(9, 'Foreign Language', 3, 'FL')
(10, 'Technology', 2, 'TEC')


True

In [13]:
# for each school, we generate a number of teachers proportional to the max_students, and for each teacher we a starting and an end year, with minimum of 5 years, so that at least 10 teachers are teaching at each school at each time
number_of_teachers = [random.randint(1, int(school_dim_df['max_students'][i])) for i in range(SCHOOLS)]
teachers = []
for i in range(SCHOOLS):
    for j in range(number_of_teachers[i]):
        profile = fake.profile()
        teachers.append({'school_id': i + 1,
                            'teacher_name': profile['name'],
                            'sex': profile['sex'],
                            'birthdate': profile['birthdate'],
                            'email': profile['mail'],
                         'start_year': random.randint(2000, 2019),
                         'end_year': random.randint(2005, 2024)})
teacher_dim_df = pd.DataFrame(data=teachers)
# teacher_dim_df.to_csv('teacher_dim.csv', index=False)
teacher_dim_df.head()

Unnamed: 0,school_id,teacher_name,sex,birthdate,email,start_year,end_year
0,1,Alexia Lima,F,1981-04-02,milena71@hotmail.com,2019,2011
1,1,Nicole Costela,F,2016-11-02,cardosobrenda@gmail.com,2003,2008
2,1,Brenda Freitas,F,1934-07-14,barbara12@yahoo.com.br,2016,2021
3,1,Leonardo Moraes,M,1958-03-18,ana-sophia78@gmail.com,2007,2015
4,1,Sr. João Felipe Rocha,M,1922-03-26,lvieira@ig.com.br,2016,2011


In [14]:
# insert the data from the df teacher_dim_df into the table teacher_dim, but only the start and end year, the school_id will be generated automatically
cursor = cnx.cursor()
cursor.execute('DROP TABLE IF EXISTS teacher_dim')
cursor.execute('CREATE TABLE teacher_dim (teacher_id INT AUTO_INCREMENT PRIMARY KEY, teacher_name VARCHAR(255), sex CHAR(1), birthdate DATE, email VARCHAR(255), start_year INT, end_year INT)')
cursor.executemany('INSERT INTO teacher_dim (teacher_name, sex, birthdate, email, start_year, end_year) VALUES (%s, %s, %s, %s, %s, %s)', teacher_dim_df[['teacher_name', 'sex', 'birthdate', 'email', 'start_year', 'end_year']].values.tolist())
cnx.commit()

In [15]:
# check if the table was created and populated correctly, print the first 10 rows
cursor = cnx.cursor()
cursor.execute('SELECT * FROM teacher_dim LIMIT 10')
for row in cursor:
    print(row)
cursor.close()

(1, 'Alexia Lima', 'F', datetime.date(1981, 4, 2), 'milena71@hotmail.com', 2019, 2011)
(2, 'Nicole Costela', 'F', datetime.date(2016, 11, 2), 'cardosobrenda@gmail.com', 2003, 2008)
(3, 'Brenda Freitas', 'F', datetime.date(1934, 7, 14), 'barbara12@yahoo.com.br', 2016, 2021)
(4, 'Leonardo Moraes', 'M', datetime.date(1958, 3, 18), 'ana-sophia78@gmail.com', 2007, 2015)
(5, 'Sr. João Felipe Rocha', 'M', datetime.date(1922, 3, 26), 'lvieira@ig.com.br', 2016, 2011)
(6, 'Dr. Benício Vieira', 'M', datetime.date(1952, 8, 24), 'udias@ig.com.br', 2006, 2024)
(7, 'Ana Laura Pereira', 'F', datetime.date(1972, 4, 22), 'portopedro-miguel@bol.com.br', 2006, 2006)
(8, 'Evelyn Ferreira', 'F', datetime.date(1948, 11, 7), 'vitoriada-cruz@ig.com.br', 2002, 2019)
(9, 'João Guilherme da Paz', 'M', datetime.date(1914, 12, 31), 'diasrenan@uol.com.br', 2004, 2010)
(10, 'Dr. Antônio Rodrigues', 'M', datetime.date(1960, 10, 28), 'da-rosajoao-felipe@gmail.com', 2016, 2017)


True

In [16]:
# create a table called type_df with different types of classes, either 'ONLINE' or 'IN PERSON'
class_type_df = pd.DataFrame(data=[{'type': 'ONLINE'}, {'type': 'IN PERSON'}])
# class_type_df.to_csv('type_dim.csv', index=False)
class_type_df

Unnamed: 0,type
0,ONLINE
1,IN PERSON


In [17]:
# insert the data from the df class_type_df into the table class_type_dim
cursor = cnx.cursor()
cursor.execute('DROP TABLE IF EXISTS class_type_dim')
cursor.execute('CREATE TABLE class_type_dim (type_id INT AUTO_INCREMENT PRIMARY KEY, type CHAR(20))')
cursor.executemany('INSERT INTO class_type_dim (type) VALUES (%s)', class_type_df.values.tolist())
cnx.commit()

In [18]:
#  check if the table was created and populated correctly
cursor = cnx.cursor()
cursor.execute('SELECT * FROM class_type_dim')
for row in cursor:
    print(row)
cursor.close()


(1, 'ONLINE')
(2, 'IN PERSON')


True

In [19]:
# create a table called classroom with different codes for each school
classroom = []
for i in range(SCHOOLS):
    for j in range(school_dim_df['max_students'][i] // random.randint(30,50)):
        classroom.append({'school_id': i + 1,
                          'classroom_code': fake.unique.bothify(text='???###')})
classroom_df = pd.DataFrame(data=classroom)
# classroom_df.to_csv('classroom_dim.csv', index=False)
classroom_df.head()


Unnamed: 0,school_id,classroom_code
0,1,csb997
1,1,Ybz612
2,1,PCa319
3,1,FOU108
4,1,Pqa664


In [20]:
# insert the data from the df classroom_df into the table classroom_dim, but only the classroom_code
cursor = cnx.cursor()
cursor.execute('DROP TABLE IF EXISTS classroom_dim')
cursor.execute('CREATE TABLE classroom_dim (classroom_id INT AUTO_INCREMENT PRIMARY KEY, classroom_code CHAR(6))')
cursor.executemany('INSERT INTO classroom_dim (classroom_code) VALUES (%s)', classroom_df[['classroom_code']].values.tolist())
cnx.commit()

In [21]:
# check if the table was created and populated correctly, print the first 10 rows
cursor = cnx.cursor()
cursor.execute('SELECT * FROM classroom_dim LIMIT 10')
for row in cursor:
    print(row)
cursor.close()

(1, 'csb997')
(2, 'Ybz612')
(3, 'PCa319')
(4, 'FOU108')
(5, 'Pqa664')
(6, 'aLD808')
(7, 'ktO002')
(8, 'FYo650')
(9, 'Okq469')
(10, 'kBb301')


True

In [23]:
# for each year from 2000 to 2022, and for each school, create a list of classes with random.randint(30,50) students
# so that the total number of students is not greater than the max_students of the school
classes = []
CLASS_LEVEL_OPTIONS = {
    'High': ['Elementary', 'Middle', 'High'],
    'Middle': ['Elementary', 'Middle'],
    'Elementary': ['Elementary']
}
for i in range(SCHOOLS):
    for j in range(2000, 2023):
        max_students = school_dim_df['max_students'][i]
        number_of_classes = max_students // 50
        for k in range(number_of_classes):
            classes.append({'school_id': i + 1,
                            'year': j,
                            'class_code': fake.unique.bothify(text='???###'),
                            'students': random.randint(30, 50),
                            'class_period': random.choice(['Morning', 'Afternoon', 'Evening']),
                            'class_level': random.choice(CLASS_LEVEL_OPTIONS[school_dim_df['school_level'][i]])
                            })
classes_df = pd.DataFrame(data=classes)
# classes_df.to_csv('classes_dim.csv', index=False)
classes_df.head()

Unnamed: 0,school_id,year,class_code,students,class_period,class_level
0,1,2000,FXz103,48,Morning,High
1,1,2000,puA234,48,Morning,Middle
2,1,2000,vrg139,39,Morning,High
3,1,2000,TxZ209,50,Afternoon,Elementary
4,1,2000,Xpr474,36,Evening,High


In [24]:
# insert the data from the df classes_df into the table classes_dim, but only the class_code, students, class_period and class_level
cursor = cnx.cursor()
cursor.execute('DROP TABLE IF EXISTS classes_dim')
cursor.execute('CREATE TABLE classes_dim (class_id INT AUTO_INCREMENT PRIMARY KEY, class_code CHAR(6), students INT, class_period CHAR(20), class_level CHAR(20))')
cursor.executemany('INSERT INTO classes_dim (class_code, students, class_period, class_level) VALUES (%s, %s, %s, %s)', classes_df[['class_code', 'students', 'class_period', 'class_level']].values.tolist())
cnx.commit()

In [25]:
# check if the table was created and populated correctly, print the first 10 rows
cursor = cnx.cursor()
cursor.execute('SELECT * FROM classes_dim LIMIT 10')
for row in cursor:
    print(row)
cursor.close()

(1, 'FXz103', 48, 'Morning', 'High')
(2, 'puA234', 48, 'Morning', 'Middle')
(3, 'vrg139', 39, 'Morning', 'High')
(4, 'TxZ209', 50, 'Afternoon', 'Elementary')
(5, 'Xpr474', 36, 'Evening', 'High')
(6, 'Wpt601', 44, 'Morning', 'Elementary')
(7, 'aKn096', 48, 'Afternoon', 'High')
(8, 'mBE865', 35, 'Afternoon', 'Middle')
(9, 'Xcf050', 47, 'Morning', 'Middle')
(10, 'CUw638', 33, 'Morning', 'High')


True

In [26]:
# count the number of rows in each table of the escola database
cursor = cnx.cursor()
cursor.execute('SELECT COUNT(*) FROM date_dim')
for row in cursor:
    print('date_dim:', row[0])
cursor.execute('SELECT COUNT(*) FROM school_dim')
for row in cursor:
    print('school_dim:', row[0])
cursor.execute('SELECT COUNT(*) FROM school_subject_dim')
for row in cursor:
    print('school_subject_dim:', row[0])
cursor.execute('SELECT COUNT(*) FROM teacher_dim')
for row in cursor:
    print('teacher_dim:', row[0])
cursor.execute('SELECT COUNT(*) FROM class_type_dim')
for row in cursor:
    print('class_type_dim:', row[0])
cursor.execute('SELECT COUNT(*) FROM classroom_dim')
for row in cursor:
    print('classroom_dim:', row[0])
cursor.execute('SELECT COUNT(*) FROM classes_dim')
for row in cursor:
    print('classes_dim:', row[0])
cursor.close()


date_dim: 7670
school_dim: 1000
school_subject_dim: 10
teacher_dim: 273386
class_type_dim: 2
classroom_dim: 13663
classes_dim: 243731


True