# Data Engineering Capstone Project


In [None]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell
# executes, so edits to the project's source are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Do all imports and installs here
import sys, os
import logging
import pandas as pd
from pandas_profiling import ProfileReport
from pathlib import Path
from typing import Iterable
from IPython import display as ICD

In [None]:
# Put the project's source directory on the import path so the local
# packages imported further down can be resolved from this notebook.
src_path: str = "../src"
sys.path.append(src_path)

# Let INFO-level records through the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

In [None]:
from utils.io import process_config
from utils.aws import create_s3_bucket
from utils.spark import create_spark_session
from data.tables import (
    ON_LOAD_TABLES_SCHEMA,
    ON_LOAD_TABLES_FILES,
    STAR_EXTRACT_TABLES_ARGS,
)

In [None]:
# Both configuration files sit one directory above the current working
# directory of the notebook.
project_root = Path(os.getcwd()).parent
user_config = process_config(project_root.joinpath("_user.cfg"))
dl_config = process_config(project_root.joinpath("dl.cfg"))

# The Spark session is built from both configurations.
spark = create_spark_session(user_config, dl_config)

# BUCKET_NAME entry stored under "S3" in dl_config.
s3_bucket_prefix = dl_config.get("S3", "BUCKET_NAME")

---

## 1. Preview raw data


In [None]:
# Load every raw CSV table with its declared schema, then show its first five
# rows, its column names and its total number of records.
for table_name, table_schema in ON_LOAD_TABLES_SCHEMA.items():
    table_paths = ON_LOAD_TABLES_FILES[table_name]

    # A single location (str or path-like object) is passed on as one string;
    # any other iterable is treated as a collection of locations.
    # `str` has to be tested explicitly: it is itself an Iterable, so relying
    # on the Iterable check alone would split a string path into characters.
    is_single_path = isinstance(
        table_paths, (str, os.PathLike)
    ) or not isinstance(table_paths, Iterable)
    if is_single_path:
        csv_paths = str(table_paths)
    else:
        csv_paths = [str(p) for p in table_paths]

    table_df = spark.read.csv(csv_paths, schema=table_schema, header=True)

    n_elem = table_df.count()
    # Only five rows are brought to the driver and converted to pandas.
    table_df_preview = spark.createDataFrame(
        table_df.take(5),
        schema=table_schema,
    ).toPandas()

    print(f"First 5 rows of {table_name}:")
    print(f"Columns: {table_df.columns}.")
    ICD.display(table_df_preview)
    print(f"The full table contains a total of {n_elem} records\n\n")

---

## 2. Run ETL pipeline to extract STAR dimensional tables


Create an S3 bucket to store all results.


In [None]:
# The creation call is kept outside of an `assert` statement: assertions are
# removed when Python runs with -O, which would skip the call itself and
# leave the notebook without a bucket.
if not create_s3_bucket(user_config, dl_config):
    raise RuntimeError("Error creating S3 bucket.")

#### Run Airflow DAG (`capstone_etl`) now.
![Capstone DAG](../images/capstone_dag.png)


---

## 3. Run analytics queries on dimensional tables


In [None]:
# Directory that receives one HTML profiling report per table.
profiling_path = Path("../data").joinpath("profiling_reports")
# parents=True also creates "../data" when it is missing, instead of failing
# with FileNotFoundError; exist_ok=True keeps the cell safe to re-run.
profiling_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Read each dimensional table back from the parquet location recorded in its
# extraction arguments, keyed by table name.
star_tables = {}
for star_name, star_args in STAR_EXTRACT_TABLES_ARGS.items():
    parquet_path = star_args["op_kwargs"]["s3_save_path"]
    star_tables[star_name] = spark.read.parquet(parquet_path)

### 3.1. Data profiling of dimensional tables

WARNING: Profiling loads each table into memory as a pandas DataFrame. Avoid it for tables with tens of millions of rows unless enough memory is available.


In [None]:
# Write an HTML profiling report for every table except "fact_immigration".
# NOTE(review): that table is presumably skipped because of its size (see the
# memory warning above this cell) - confirm.
for table_name, table_df in star_tables.items():
    if table_name != "fact_immigration":
        # The whole table is materialised as a pandas DataFrame here.
        pandas_table = table_df.toPandas()
        report_file = profiling_path.joinpath(f"{table_name}.html")
        ProfileReport(pandas_table).to_file(report_file)

### 3.2. Example queries using combinations of dimensional tables


#### Do immigrants prefer destinations with higher or lower population?


In [None]:
# Number of fact_immigration rows per city_id, joined with the demographics
# rows of the same city. Rows without a total_population are dropped and only
# the two columns needed for the correlation are kept.
immigration_df = star_tables["fact_immigration"]
demogr_df = star_tables["fact_us_demogr"]

arrivals_per_city = immigration_df.groupBy("city_id").count()
same_city = immigration_df["city_id"] == demogr_df["city_id"]

df = (
    arrivals_per_city.join(demogr_df, same_city)
    .dropna(subset=["total_population"])
    .select(["count", "total_population"])
)

In [None]:
# Correlation between the per-city row count and total_population.
df.corr("count", "total_population")

There is a high positive correlation between the population size of a city and the number of immigrants it attracts.


#### Do immigrants prefer destinations with higher or lower temperature?


In [None]:
# Number of fact_immigration rows per city_id, joined with the temperature
# rows of the same city. Rows without an avg_temperature are dropped and only
# the two columns needed for the correlation are kept.
immigration_df = star_tables["fact_immigration"]
temps_df = star_tables["fact_temps"]

arrivals_per_city = immigration_df.groupBy("city_id").count()
same_city = immigration_df["city_id"] == temps_df["city_id"]

df = (
    arrivals_per_city.join(temps_df, same_city)
    .dropna(subset=["avg_temperature"])
    .select(["count", "avg_temperature"])
)

In [None]:
# Correlation between the per-city row count and avg_temperature.
df.corr("count", "avg_temperature")

There is no correlation between the number of immigrants and the average temperature of a city.


#### Do immigrants prefer destinations with more or fewer airports?


In [None]:
# Number of fact_immigration rows per city_id, joined with the number of
# dim_airports rows of the same city (exposed as "airports_count").
immigration_df = star_tables["fact_immigration"]
airports_df = star_tables["dim_airports"]

arrivals_per_city = immigration_df.groupBy("city_id").count()
airports_per_city = (
    airports_df.groupBy("city_id")
    .count()
    .withColumnRenamed("count", "airports_count")
)
same_city = immigration_df["city_id"] == airports_df["city_id"]

df = arrivals_per_city.join(airports_per_city, same_city).select(
    ["count", "airports_count"]
)

In [None]:
# Correlation between the per-city row count and airports_count.
df.corr("count", "airports_count")

There is a low, positive correlation between the number of immigrants and the number of airports in the receiving city.