In [53]:
import pandas as pd
import holidays
import mysql.connector
import os
from faker import Faker
from faker_education import SchoolProvider
from dotenv import load_dotenv
load_dotenv()

True

In [54]:
Faker.seed(1607)
fake = Faker('pt_BR')
fake.add_provider(SchoolProvider)

In [55]:
# create a date dimension table and populate it 20 years ago to today
date_dim_columns = ['date', 'year', 'month', 'day', 'weekday', 'day_of_year', 'is_holiday', 'is_weekend']
date_dim_df = pd.DataFrame(columns=date_dim_columns)
date_dim_df['date'] = pd.date_range(start='20020101', end='20221231')
date_dim_df['year'] = date_dim_df['date'].dt.year
date_dim_df['month'] = date_dim_df['date'].dt.month
date_dim_df['day'] = date_dim_df['date'].dt.day
date_dim_df['weekday'] = date_dim_df['date'].dt.weekday
date_dim_df['day_of_year'] = date_dim_df['date'].dt.dayofyear
date_dim_df['is_holiday'] = date_dim_df['date'].apply(lambda x: x in holidays.BR(state='SP', years=x.year))
date_dim_df['is_weekend'] = date_dim_df['date'].dt.weekday.isin([5, 6])
# date_dim_df.to_csv('date_dim.csv', index=False)
date_dim_df.head()

Unnamed: 0,date,year,month,day,weekday,day_of_year,is_holiday,is_weekend
0,2002-01-01,2002,1,1,1,1,True,False
1,2002-01-02,2002,1,2,2,2,False,False
2,2002-01-03,2002,1,3,3,3,False,False
3,2002-01-04,2002,1,4,4,4,False,False
4,2002-01-05,2002,1,5,5,5,False,True


In [56]:
cnx = mysql.connector.connect(user=os.environ['MYSQL_USER'],
                            password=os.environ['MYSQL_PASSWORD'],
                            host=os.environ['MYSQL_HOST'],
                            database=os.environ['MYSQL_DATABASE'],
                            auth_plugin='mysql_native_password')

In [57]:
# create table with the same structure as the df date_dim_df, with date_id auto incremented, and insert data from the df date_dim_df
cursor = cnx.cursor()
cursor.execute('DROP TABLE IF EXISTS date_dim')
cursor.execute('CREATE TABLE date_dim (date_id INT AUTO_INCREMENT PRIMARY KEY, date DATE, year INT, month INT, day INT, weekday INT, day_of_year INT, is_holiday BOOLEAN, is_weekend BOOLEAN)')
cursor.executemany('INSERT INTO date_dim (date, year, month, day, weekday, day_of_year, is_holiday, is_weekend) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)', date_dim_df.values.tolist())
cnx.commit()
cursor.close()

True

In [58]:
# check if the table was created and populated correctly, print the first 10 rows
cursor = cnx.cursor()
cursor.execute('SELECT * FROM date_dim LIMIT 10')
for row in cursor:
    print(row)
cursor.close()

(1, datetime.date(2002, 1, 1), 2002, 1, 1, 1, 1, 1, 0)
(2, datetime.date(2002, 1, 2), 2002, 1, 2, 2, 2, 0, 0)
(3, datetime.date(2002, 1, 3), 2002, 1, 3, 3, 3, 0, 0)
(4, datetime.date(2002, 1, 4), 2002, 1, 4, 4, 4, 0, 0)
(5, datetime.date(2002, 1, 5), 2002, 1, 5, 5, 5, 0, 1)
(6, datetime.date(2002, 1, 6), 2002, 1, 6, 6, 6, 0, 1)
(7, datetime.date(2002, 1, 7), 2002, 1, 7, 0, 7, 0, 0)
(8, datetime.date(2002, 1, 8), 2002, 1, 8, 1, 8, 0, 0)
(9, datetime.date(2002, 1, 9), 2002, 1, 9, 2, 9, 0, 0)
(10, datetime.date(2002, 1, 10), 2002, 1, 10, 3, 10, 0, 0)


True

In [61]:
# we are modelling a network of 1000 schools from the Alan Turing Group, each with a different number of students, a different location, and different teachers
SCHOOLS = 1000
school_dim_columns = ['school_name', 'school_district', 'school_level', 'school_state', 'school_students']
school_dim_df = pd.DataFrame(columns=school_dim_columns)
schools = [fake.school_object() for _ in range(SCHOOLS)]
school_dim_df['school_name'] = [school['school'] for school in schools]
school_dim_df['school_district'] = [school['district'] for school in schools]
school_dim_df['school_level'] = [school['level'] for school in schools]
school_dim_df['school_state'] = [school['state'] for school in schools]
school_dim_df['school_students'] = pd.Series(pd.np.random.randint(low=100, high=1000, size=SCHOOLS))
# school_dim_df.to_csv('school_dim.csv', index=False)
school_dim_df.head()

  school_dim_df['school_students'] = pd.Series(pd.np.random.randint(low=100, high=1000, size=SCHOOLS))


Unnamed: 0,school_name,school_district,school_level,school_state,school_students
0,CATOOSA HS,CATOOSA,High,OK,316
1,Churchill Elem School,CUSD 308,Elementary,IL,705
2,DEAUVILLE GARDENS EAST ELEMENTARY SCHOOL,COPIAGUE UNION FREE SCHOOL DISTRICT,Elementary,NY,429
3,G.H. REID ELEM,RICHMOND CITY PBLC SCHS,Elementary,VA,327
4,BADGER ELEMENTARY,BADGER PUBLIC SCHOOL DISTRICT,Elementary,MN,462
