# Data Engineering Capstone Project


In [18]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# Do all imports and installs here
import sys, os
import logging
import pandas as pd
from pathlib import Path
from typing import Iterable
from IPython import display as ICD

In [20]:
src_path: str = "../src"
sys.path.append(src_path)
logging.getLogger().setLevel(logging.INFO)

In [21]:
from utils.io import process_config
from utils.aws import create_s3_bucket
from utils.spark import create_spark_session
from data.tables import ON_LOAD_TABLES_SCHEMA, ON_LOAD_TABLES_FILES

In [22]:
user_config, dl_config = (
    process_config(Path(os.getcwd()).parent.joinpath("_user.cfg")),
    process_config(Path(os.getcwd()).parent.joinpath("dl.cfg")),
)
spark = create_spark_session(user_config, dl_config)
s3_bucket_prefix = dl_config.get("S3", "BUCKET_NAME")

---

## 1. Preview raw data


In [23]:
for table_name, table_schema in ON_LOAD_TABLES_SCHEMA.items():
    table_paths = ON_LOAD_TABLES_FILES[table_name]
    table_df = spark.read.csv(
        (
            str(table_paths)
            if not isinstance(table_paths, Iterable)
            else [str(p) for p in table_paths]
        ),
        schema=ON_LOAD_TABLES_SCHEMA[table_name],
        header=True,
    )

    n_elem = table_df.count()
    table_df_preview = spark.createDataFrame(
        table_df.take(5),
        schema=ON_LOAD_TABLES_SCHEMA[table_name],
    ).toPandas()

    print(f"First 5 rows of {table_name}:")
    print(f"Columns: {table_df.columns}.")
    ICD.display(table_df_preview)
    print(f"The full table contains a total of {n_elem} records\n\n")

                                                                                

First 5 rows of i94_immigration:
Columns: ['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port', 'arrdate', 'i94mode', 'i94addr', 'depdate', 'i94bir', 'i94visa', 'count', 'dtadfile', 'visapost', 'occup', 'entdepa', 'entdepd', 'entdepu', 'matflag', 'biryear', 'dtaddto', 'gender', 'insnum', 'airline', 'admnum', 'fltno', 'visatype'].


Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,...,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,1.0,2016.0,7.0,254.0,276.0,LOS,20636.0,1.0,CA,20640.0,...,,M,1978.0,9282016.0,M,,OZ,63092900000.0,202,WT
1,2.0,2016.0,7.0,140.0,140.0,NYC,20636.0,1.0,NY,20657.0,...,,M,1971.0,9282016.0,F,,DL,63092900000.0,9858,WT
2,3.0,2016.0,7.0,135.0,135.0,ORL,20636.0,1.0,FL,20657.0,...,,M,2006.0,9282016.0,M,,VS,63092900000.0,71,WT
3,4.0,2016.0,7.0,124.0,124.0,TAM,20636.0,1.0,FL,20645.0,...,,M,1999.0,9282016.0,M,,LH,63092900000.0,482,WT
4,5.0,2016.0,7.0,130.0,130.0,LOS,20636.0,1.0,CA,20662.0,...,,M,2015.0,9282016.0,M,,SU,63092900000.0,106,WT


The full table contains a total of 40790529 records


First 5 rows of us_demographics:
Columns: ['City', 'State', 'Median Age', 'Male Population', 'Female Population', 'Total Population', 'Number of Veterans', 'Foreign-born', 'Average Household Size', 'State Code', 'Race', 'Count'].


Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.799999,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.599998,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


The full table contains a total of 2891 records


First 5 rows of airport_codes:
Columns: ['ident', 'type', 'name', 'elevation_ft', 'continent', 'iso_country', 'iso_region', 'municipality', 'gps_code', 'iata_code', 'local_code', 'coordinates'].


Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


The full table contains a total of 55075 records




                                                                                

First 5 rows of world_temperature:
Columns: ['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City', 'Country', 'Latitude', 'Longitude'].


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1744-04-01,5.788,3.624,Århus,Denmark,57.05N,10.33E
2,1744-05-01,10.644,1.283,Århus,Denmark,57.05N,10.33E
3,1744-06-01,14.051,1.347,Århus,Denmark,57.05N,10.33E
4,1744-07-01,16.082001,1.396,Århus,Denmark,57.05N,10.33E


The full table contains a total of 8235082 records




In [24]:
from etl import (
    extract_dim_cities,
    extract_dim_airports,
    extract_fact_temps,
    extract_fact_us_demogr,
)
import pyspark.sql.functions as F

In [25]:
us_demographics_path = f"s3a://{s3_bucket_prefix}/clean/us_demographics"
airport_codes_path = f"s3a://{s3_bucket_prefix}/clean/airport_codes"
s3_save_path = f"s3a://{s3_bucket_prefix}/star/dim_cities"

dim_cities = extract_dim_cities(
    spark, us_demographics_path, airport_codes_path, s3_save_path
)

INFO:root:dim_cities has 12069 records                                          
                                                                                

In [26]:
airport_codes_path = f"s3a://{s3_bucket_prefix}/clean/airport_codes"
dim_cities_path = f"s3a://{s3_bucket_prefix}/star/dim_cities"
s3_save_path = f"s3a://{s3_bucket_prefix}/star/dim_airports"

dim_airports = extract_dim_airports(
    spark, airport_codes_path, dim_cities_path, s3_save_path
)

INFO:root:dim_airports has 11951 records                                        
                                                                                

In [27]:
world_temperature_path = f"s3a://{s3_bucket_prefix}/clean/world_temperature"
dim_cities_path = f"s3a://{s3_bucket_prefix}/star/dim_cities"
s3_save_path = f"s3a://{s3_bucket_prefix}/star/fact_temps"

fact_temps = extract_fact_temps(
    spark, world_temperature_path, dim_cities_path, s3_save_path
)

INFO:root:fact_temps has 246 records                                            
                                                                                

In [28]:
us_demographics_path = f"s3a://{s3_bucket_prefix}/clean/us_demographics"
dim_cities_path = f"s3a://{s3_bucket_prefix}/star/dim_cities"
s3_save_path = f"s3a://{s3_bucket_prefix}/star/fact_us_demogr"

fact_us_demogr = extract_fact_us_demogr(
    spark, us_demographics_path, dim_cities_path, s3_save_path
)

INFO:root:fact_us_demogr has 596 records                                        
                                                                                

In [29]:
fact_us_demogr.toPandas()

                                                                                

Unnamed: 0,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,american_indian_and_alaska_native,asian,black_or_african_american,hispanic_or_latino,white,city_id
0,29.100000,47293.0,51045.0,98338,3647.0,4706.0,2.67,261.0,2733.0,42331.0,2475,52603.0,443
1,41.400002,75358.0,74363.0,149721,6056.0,55158.0,2.65,1010.0,4696.0,34916.0,53247,107916.0,1945
2,31.000000,50792.0,50091.0,100883,4294.0,11480.0,2.75,2449.0,2202.0,2856.0,39271,92874.0,1567
3,32.799999,88385.0,94000.0,182385,18896.0,12589.0,2.46,2201.0,8275.0,79245.0,15780,95151.0,10942
4,35.299999,49363.0,60453.0,109816,7242.0,19875.0,3.30,2327.0,16041.0,9259.0,32944,77700.0,1147
...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,24.200001,40015.0,27348.0,67363,8252.0,3732.0,2.51,1741.0,4204.0,13253.0,11947,51245.0,6368
592,31.700001,64018.0,63643.0,127661,5206.0,5396.0,2.54,1183.0,2776.0,39862.0,6556,85282.0,4276
593,37.200001,63316.0,66186.0,129502,4724.0,38552.0,3.17,1262.0,6573.0,27225.0,40817,90896.0,1935
594,36.900002,35527.0,39023.0,74550,3537.0,5715.0,2.67,603.0,2788.0,22179.0,6653,46362.0,6364


---

## 2. Run ETL pipeline


Create S3 bucket for clean data


In [30]:
assert create_s3_bucket(user_config, dl_config), "Error creating S3 bucket."

INFO:root:Bucket cupm-de-capstone already exists.
INFO:root:Available buckets: [s3.Bucket(name='cupm-de-capstone')]


Trigger Airflow DAG here ([reference](https://stackoverflow.com/questions/60055151/how-to-trigger-an-airflow-dag-run-from-within-a-python-script))


---

## 3. Run analytics queries on dimensional tables


---

### Step 2: Explore and Assess the Data

#### Explore the Data

Identify data quality issues, like missing values, duplicate data, etc.

#### Cleaning Steps

Document steps necessary to clean the data


In [31]:
# Performing cleaning tasks here

---

### Step 3: Define the Data Model

#### 3.1 Conceptual Data Model

Map out the conceptual data model and explain why you chose that model

#### 3.2 Mapping Out Data Pipelines

List the steps necessary to pipeline the data into the chosen data model


---

### Step 4: Run Pipelines to Model the Data

#### 4.1 Create the data model

Build the data pipelines to create the data model.


In [32]:
# Write code here

#### 4.2 Data Quality Checks

Explain the data quality checks you'll perform to ensure the pipeline ran as expected. These could include:

- Integrity constraints on the relational database (e.g., unique key, data type, etc.)
- Unit tests for the scripts to ensure they are doing the right thing
- Source/Count checks to ensure completeness

Run Quality Checks


In [33]:
# Perform quality checks here

#### 4.3 Data dictionary

Create a data dictionary for your data model. For each field, provide a brief description of what the data is and where it came from. You can include the data dictionary in the notebook or in a separate file.


---

### Step 5: Complete Project Write Up

- Clearly state the rationale for the choice of tools and technologies for the project.
- Propose how often the data should be updated and why.
- Write a description of how you would approach the problem differently under the following scenarios:
- The data was increased by 100x.
- The data populates a dashboard that must be updated on a daily basis by 7am every day.
- The database needed to be accessed by 100+ people.
