# Data Engineering Capstone Project


In [None]:
# Load IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed.
%autoreload 2

In [None]:
# Do all imports and installs here
import sys, os
import pandas as pd
from pathlib import Path

In [None]:
# Location of the project's source tree, relative to the notebook's working
# directory. It is put on the import path so modules under it can be imported
# (presumably this is where the `utils` package lives -- confirm).
src_path: str = "../src"

# Only register the path once: notebook cells are frequently re-run, and an
# unconditional append would add a duplicate sys.path entry on every run.
if src_path not in sys.path:
    sys.path.append(src_path)

In [None]:
from utils.io import process_config
from utils.spark import create_spark_session

In [None]:
# Folder holding the raw datasets, relative to the working directory.
data_path: Path = Path("../data")

# Both configuration files are read from the parent of the current working
# directory: first the user config, then the data-lake config.
config_dir = Path(os.getcwd()).parent
user_config = process_config(config_dir / "_user.cfg")
dl_config = process_config(config_dir / "dl.cfg")

# Start the Spark session from the two parsed configurations.
spark = create_spark_session(user_config, dl_config)


---

## 1. Data preview and exploration


### 1.1. I94 Immigration Data


In [None]:
# Load the I94 immigration sample; the first CSV column becomes the index.
i94_sample_file = data_path / "i94_inmigration_data_2016" / "data_sample.csv.bz2"
i94_df = pd.read_csv(i94_sample_file, index_col=0)

# Show the available columns, then render the frame as the cell output.
print(i94_df.columns)
i94_df

### 1.2. World Temperature Data


In [None]:
# Load the city-level temperature data; the first CSV column becomes the index.
temp_file = data_path / "global_land_temperature_by_city.csv.bz2"
temp_df = pd.read_csv(temp_file, index_col=0)

# Discard rows that have no average-temperature reading.
temp_df = temp_df.dropna(subset=["AverageTemperature"])

# Show the available columns, then render the frame as the cell output.
print(temp_df.columns)
temp_df

In [None]:
# Keep only the temperature rows whose country is the United States.
is_united_states = temp_df["Country"] == "United States"
temp_df.loc[is_united_states]

### 1.3. U.S. City Demographic Data


In [None]:
# Load the U.S. city demographics data with the default integer index.
us_dem_file = data_path / "us_cities_demographics.csv.bz2"
us_dem_df = pd.read_csv(us_dem_file)

# Show the available columns, then render the frame as the cell output.
print(us_dem_df.columns)
us_dem_df

### 1.4. Airport Codes


In [None]:
# Load the airport codes table; the first CSV column becomes the index.
airport_file = data_path / "airport_codes.csv.bz2"
airp_df = pd.read_csv(airport_file, index_col=0)

# Show the available columns, then render the frame as the cell output.
print(airp_df.columns)
airp_df

US Airports

In [None]:
# Keep only the airports whose ISO country code is "US".
is_us_airport = airp_df["iso_country"] == "US"
airp_df.loc[is_us_airport]

---

### Step 2: Explore and Assess the Data

#### Explore the Data

Identify data quality issues, like missing values, duplicate data, etc.

#### Cleaning Steps

Document steps necessary to clean the data


In [None]:
# Performing cleaning tasks here

---

### Step 3: Define the Data Model

#### 3.1 Conceptual Data Model

Map out the conceptual data model and explain why you chose that model

#### 3.2 Mapping Out Data Pipelines

List the steps necessary to pipeline the data into the chosen data model


---

### Step 4: Run Pipelines to Model the Data

#### 4.1 Create the data model

Build the data pipelines to create the data model.


In [None]:
# Write code here

#### 4.2 Data Quality Checks

Explain the data quality checks you'll perform to ensure the pipeline ran as expected. These could include:

- Integrity constraints on the relational database (e.g., unique key, data type, etc.)
- Unit tests for the scripts to ensure they are doing the right thing
- Source/Count checks to ensure completeness

Run Quality Checks


In [None]:
# Perform quality checks here

#### 4.3 Data dictionary

Create a data dictionary for your data model. For each field, provide a brief description of what the data is and where it came from. You can include the data dictionary in the notebook or in a separate file.


---

### Step 5: Complete Project Write Up

- Clearly state the rationale for the choice of tools and technologies for the project.
- Propose how often the data should be updated and why.
- Write a description of how you would approach the problem differently under the following scenarios:
  - The data was increased by 100x.
  - The data populates a dashboard that must be updated by 7 a.m. every day.
  - The database needed to be accessed by 100+ people.
