In [None]:
# Import PySpark
from pyspark.sql import SparkSession

# Build the SparkSession used by every cell below; getOrCreate() hands back
# an already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Capstone Analytics Samples")
    .getOrCreate()
)

In [None]:
# Read the fact table and the three dimension tables from their Parquet
# directories under tables/. The paths are read in the same order as the
# names on the left-hand side.
fact_immigration, dim_immigrant_person, dim_city, dim_time = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        'tables/fact_immigration',
        'tables/dim_immigrant_person',
        'tables/dim_city',
        'tables/dim_time',
    )
)

Create temporary views for the queries

In [None]:
# Register each DataFrame as a temporary view so the SQL cells can refer to
# it by name. Re-running this cell simply replaces the existing views.
for view_name, table_df in (
    ("fact_immigration_table", fact_immigration),
    ("immigrant_table", dim_immigrant_person),
    ("dim_city_table", dim_city),
    ("dim_time_table", dim_time),
):
    table_df.createOrReplaceTempView(view_name)

In [None]:
# One distinct row per immigration record, paired with the city it is joined
# to and that city's foreign_born value.
#
# Two fixes relative to the earlier version of this query:
#   * The join key is the fact table's port code (i94port), matching the
#     i94port = city_code join used by the arrivals-by-weekday query. The old
#     condition compared cicid (the column aliased immigrant_id) with a city
#     code. NOTE(review): confirm i94port is the intended key.
#   * The GROUP BY was removed. It listed only city_name and foreign_born while
#     the SELECT also returned cicid, i94cit and city_code un-aggregated, which
#     Spark rejects at analysis time. SELECT DISTINCT already deduplicates.
foreign_born_inhabitants = spark.sql("""
            SELECT DISTINCT
                f.cicid as immigrant_id,
                f.i94cit,
                c.city_code,
                c.city_name,
                c.foreign_born
            FROM dim_city_table c
            JOIN fact_immigration_table f
                ON f.i94port = c.city_code
""")

In [None]:
# Number of fact rows per day_of_week for year 2016, month 2, ordered by
# day_of_week. Both joins are INNER, so a row is counted only when its
# i94port matches a city_code in dim_city_table and its arrival_ts matches a
# ts in dim_time_table.
arrivals_by_weekday = spark.sql(
    sqlQuery="""
    SELECT t.day_of_week, COUNT(*) as count
    FROM fact_immigration_table i
    INNER JOIN dim_city_table c ON i.i94port = c.city_code
    INNER JOIN dim_time_table t ON i.arrival_ts = t.ts
    WHERE t.year=2016 AND t.month=2
    GROUP BY t.day_of_week
    ORDER BY t.day_of_week
"""
)

In [None]:
# Ten cities with the most fact rows for year 2016, month 2, highest count
# first.
#
# Fixes relative to the earlier version of this query:
#   * FROM now names the registered temp view (fact_immigration_table); plain
#     "fact_immigration" is only a Python variable, not a view Spark SQL knows.
#   * The city dimension is aliased "c" to match the c.* column references
#     (it was aliased "p", leaving "c" undefined).
#   * The join uses i94port = city_code, the same key as the
#     arrivals-by-weekday query, instead of port_id, which no other query
#     references. NOTE(review): confirm i94port is the intended key.
#   * SELECT returns city_name so that it matches the GROUP BY; the previous
#     c.port_state was neither grouped nor aggregated.
most_used_ports = spark.sql("""
    SELECT c.city_code, c.city_name, COUNT(*) as count
    FROM fact_immigration_table i
    INNER JOIN dim_city_table c ON i.i94port = c.city_code
    INNER JOIN dim_time_table t ON i.arrival_ts = t.ts
    WHERE t.year=2016 AND t.month=2
    GROUP BY c.city_code, c.city_name
    ORDER BY count DESC
    LIMIT 10
    """)