# US Immigration 
### Data Engineering Capstone Project
#### Sample Data Analysis queries

In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType, StringType
from pyspark.sql.functions import col, sum, udf, when, round
from pyspark.sql.functions import monotonically_increasing_id

In [2]:
# create the spark session
spark = SparkSession.builder.\
config("spark.jars.repositories", "https://repos.spark-packages.org/").\
config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11").\
enableHiveSupport().getOrCreate()

#### Import data frames

In [3]:
data_storage = 'data_warehouse/'

In [4]:
fact_i94 = spark.read.parquet('{}fact_i94'.format(data_storage))

In [5]:
dim_city = spark.read.parquet('{}dim_city'.format(data_storage))
dim_state = spark.read.parquet('{}dim_state'.format(data_storage))
dim_country = spark.read.parquet('{}dim_country'.format(data_storage))
dim_visacat = spark.read.parquet('{}dim_visacat'.format(data_storage))
dim_travelmode = spark.read.parquet('{}dim_travelmode'.format(data_storage))
dim_demographics = spark.read.parquet('{}dim_demographics'.format(data_storage))

In [6]:
# create temporary views for SQL queries
dim_country.createOrReplaceTempView('country')
dim_state.createOrReplaceTempView('states')
dim_city.createOrReplaceTempView('city')
dim_visacat.createOrReplaceTempView('visacat')

In [7]:
fact_i94.createOrReplaceTempView('fact_i94_table')

In [8]:
dim_demographics.createOrReplaceTempView('city_demo')

#### basic data analysis
---

Select a table with the count of the three visa categories per city (port of entry) combined with median age, average household size, and total population.

In [9]:
query = """
        WITH cte AS (
            SELECT c.city AS city, COUNT(v.visacat) AS visacatcount, v.visacat 
            FROM fact_i94_table AS f 
            JOIN visacat AS v ON f.i94visa = v.i94visa
            JOIN city AS c ON f.i94port = c.city_code
            GROUP BY c.city, v.visacat
            ORDER BY c.city
        )
        SELECT cte.*, cd.median_age, cd.average_household_size, cd.total_population 
        FROM cte
        JOIN city_demo AS cd ON UPPER(cd.city) = cte.city
        ORDER BY cd.total_population DESC
"""
i94 = spark.sql(query)
i94.show()

+------------+------------+--------+----------+----------------------+----------------+
|        city|visacatcount| visacat|median_age|average_household_size|total_population|
+------------+------------+--------+----------+----------------------+----------------+
|    NEW YORK|       12421| Student|      36.0|                  2.68|         8550405|
|    NEW YORK|       69867|Business|      36.0|                  2.68|         8550405|
|    NEW YORK|      538234|Pleasure|      36.0|                  2.68|         8550405|
| LOS ANGELES|      354203|Pleasure|      35.0|                  2.86|         3971896|
| LOS ANGELES|       51170|Business|      35.0|                  2.86|         3971896|
| LOS ANGELES|       10035| Student|      35.0|                  2.86|         3971896|
|     CHICAGO|      122085|Pleasure|      34.2|                  2.53|         2720556|
|     CHICAGO|       51854|Business|      34.2|                  2.53|         2720556|
|     CHICAGO|        9648| Stud

---