# Data Engineering Capstone Project


In [None]:
# Load IPython's autoreload extension and set it to mode 2, so that imported
# modules are re-imported before each cell executes and edits to the project
# sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Do all imports and installs here
import sys, os
import logging
import pandas as pd
from pandas_profiling import ProfileReport
from pathlib import Path
from typing import Iterable
from IPython import display as ICD

In [None]:
# Raise the root logger to INFO so informational messages are shown.
logging.getLogger().setLevel(logging.INFO)

# Add the source directory to the import path (presumably where the `utils`
# and `data` packages imported later in this notebook live).
src_path: str = "../src"
sys.path.append(src_path)

In [None]:
from utils.io import process_config
from utils.aws import create_s3_bucket
from utils.spark import create_spark_session
from data.tables import (
    ON_LOAD_TABLES_SCHEMA,
    ON_LOAD_TABLES_FILES,
    STAR_EXTRACT_TABLES_ARGS,
)

In [None]:
# Both configuration files are read from the directory one level above the
# current working directory.
project_root = Path.cwd().parent
user_config = process_config(project_root / "_user.cfg")
dl_config = process_config(project_root / "dl.cfg")

# Build the Spark session from both configurations and read the S3 bucket
# name from the data-lake configuration.
spark = create_spark_session(user_config, dl_config)
s3_bucket_prefix = dl_config.get("S3", "BUCKET_NAME")

---

## 1. Preview raw data


In [None]:
# Preview every raw input table: load its CSV file(s) with the declared
# schema, display the first five rows and report the total record count.
for table_name, table_schema in ON_LOAD_TABLES_SCHEMA.items():
    table_paths = ON_LOAD_TABLES_FILES[table_name]

    # A single location may be given as a `str` or a path-like object. `str`
    # is itself iterable, so it has to be singled out explicitly; otherwise it
    # would be split into one bogus "path" per character. Any other
    # non-iterable value is still stringified, as before.
    if isinstance(table_paths, (str, os.PathLike)) or not isinstance(
        table_paths, Iterable
    ):
        csv_source = str(table_paths)
    else:
        csv_source = [str(p) for p in table_paths]

    table_df = spark.read.csv(csv_source, schema=table_schema, header=True)

    # `count()` scans the whole table; the preview only fetches five rows and
    # keeps the schema of `table_df`, so no intermediate DataFrame is needed.
    n_elem = table_df.count()
    table_df_preview = table_df.limit(5).toPandas()

    print(f"First 5 rows of {table_name}:")
    print(f"Columns: {table_df.columns}.")
    ICD.display(table_df_preview)
    print(f"The full table contains a total of {n_elem} records\n\n")

---

## 2. Run ETL pipeline to extract STAR dimensional tables


Create S3 bucket to store all results


In [None]:
# Create the bucket outside of an `assert`: assertions are removed when Python
# runs with `-O`, which would skip the call itself and leave the bucket
# uncreated. A falsy return value is treated as failure, as it was before.
if not create_s3_bucket(user_config, dl_config):
    raise RuntimeError("Error creating S3 bucket.")

#### Run Airflow DAG (`capstone_etl`) now.


---

## 3. Run analytics queries on dimensional tables


In [None]:
# Directory that receives one HTML profiling report per dimensional table.
profiling_path = Path("../data") / "profiling_reports"
# `parents=True` also creates `../data` when it is missing; without it,
# `mkdir` raises FileNotFoundError on a fresh checkout.
profiling_path.mkdir(parents=True, exist_ok=True)

### 3.1. Data profiling of dimensional tables

WARNING: Profiling loads each table fully into memory. Skip tables with tens of millions of rows unless enough memory is available.


In [None]:
# Write one HTML profiling report per STAR table. Each table is read from its
# parquet location and converted to a pandas DataFrame, so it must fit in
# memory.
for table_name, table_args in STAR_EXTRACT_TABLES_ARGS.items():
    parquet_location = table_args["op_kwargs"]["s3_save_path"]
    star_table = spark.read.parquet(parquet_location).toPandas()

    report_file = profiling_path.joinpath(f"{table_name}.html")
    ProfileReport(star_table).to_file(report_file)

### 3.2. Example queries using combinations of dimensional tables


#### Do immigrants prefer destinations with higher or lower population?


Interpretation...

#### Do immigrants prefer destinations with higher or lower temperature?


Interpretation...

#### Do immigrants prefer destinations with more or fewer airports?


Interpretation...

---


#### 4.3 Data dictionary

Create a data dictionary for your data model. For each field, provide a brief description of what the data is and where it came from. You can include the data dictionary in the notebook or in a separate file.


---

### Step 5: Complete Project Write Up

- Clearly state the rationale for the choice of tools and technologies for the project.
- Propose how often the data should be updated and why.
- Write a description of how you would approach the problem differently under the following scenarios:
  - The data was increased by 100x.
  - The data populates a dashboard that must be updated on a daily basis by 7am every day.
  - The database needed to be accessed by 100+ people.
