# US Cities Demographics
## 1. ETL Process

In [1]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import first, count, when, col, coalesce, lit, abs as absolute

In [2]:
# Build (or reuse) the Spark session for this notebook. The hadoop-aws
# package is pulled in so that s3a:// paths can be used further down.
spark = (
    SparkSession.builder
    .appName('us_cities_demographics')
    .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0')
    .getOrCreate()
)

Enter AWS credentials below and uncomment if data are to be stored in S3:

In [3]:
# Optional S3A configuration, left commented out on purpose. Uncomment the
# statements below and replace the <enter ...> placeholders before reading
# from or writing to s3a:// paths.
# NOTE(review): prefer supplying the keys from the environment (e.g. via
# os.environ) rather than pasting them here, so they are never saved with
# the notebook.
# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('fs.s3a.access.key', <enter access key>)

# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('fs.s3a.secret.key', <enter secret key>)
    
# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('fs.s3a.endpoint', 's3.amazonaws.com')

# NOTE(review): committer algorithm version '2' is presumably chosen to
# speed up output commits on S3 - confirm it is still wanted.
# spark.sparkContext \
#      ._jsc \
#      .hadoopConfiguration().set('mapreduce.fileoutputcommitter.algorithm.version', '2')

Read data from .csv file to Spark DataFrame.

In [4]:
# The source file is semicolon-separated and carries a header row; no schema
# is inferred here, so every column is loaded as a string.
path = 'datasets/us-cities-demographics.csv'
df = (
    spark.read
    .option('header', True)
    .option('sep', ';')
    .csv(path)
)

All columns are cast from strings to their respective types. City names are converted to upper case and every occurrence of SAINT is replaced with ST (e.g. SAINT PAUL becomes ST PAUL) to match the city names in the us_immigration table.

In [5]:
# Expose the raw DataFrame to Spark SQL so the next cell can query it by name.
df.createOrReplaceTempView(name='us_cities_demographics')

In [6]:
# Normalise the raw CSV columns: rename them to snake_case, cast the numeric
# fields, and upper-case the city name while replacing every 'SAINT'
# substring with 'ST'.
# The query reads the 'us_cities_demographics' temp view, so the cell that
# registers that view must have been run first.
df = spark.sql("""
    SELECT REPLACE(UPPER(City), 'SAINT', 'ST') AS city,
           `State Code` AS state_code,
           State AS state,
           CAST(`Total Population` AS INT) AS total_population,
           CAST(`Male Population` AS INT) AS male_population,
           CAST(`Female Population` AS INT) AS female_population,
           CAST(`Median Age` AS DOUBLE) AS median_age,
           CAST(`Average Household Size` AS DOUBLE) AS average_household_size,
           CAST(`Number of Veterans` AS INT) AS number_of_veterans,
           CAST(`Foreign-born` AS INT) AS foreign_born,
           Race AS race,
           CAST(Count AS INT) AS count
      FROM us_cities_demographics
""")

To transform the data into the desired form, race is used as a pivot. As a result, each row now represents a unique city where the population is not only split by gender, but also by race.

In [7]:
# Every column except the pivot key ('race') and its value ('count') identifies
# a city row. The loop variable is deliberately not called `col`, which is
# the name of the imported pyspark function.
pivot_inputs = ('race', 'count')
group_by_cols = [name for name in df.columns if name not in pivot_inputs]

# One output row per group, one output column per distinct race value.
grouped = df.groupBy(*group_by_cols)
df = grouped.pivot('race').agg(first('count'))

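To account for the case where a particular race demographic does not exist in a city, nulls are converted to 0. Note that the columns 'Asian' and 'White' must not be dropped: column names are matched case-insensitively, so writing the new columns 'asian' and 'white' overwrites the original pivot columns in place, and dropping 'Asian' or 'White' afterwards would remove the new columns as well.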

In [8]:
# (pivot column name, snake_case column name), in the order they are written.
race_renames = [
    ('American Indian and Alaska Native', 'american_indian_and_alaska_native'),
    ('Asian',                             'asian'),
    ('Black or African-American',         'black_or_african_american'),
    ('Hispanic or Latino',                'hispanic_or_latino'),
    ('White',                             'white'),
]

# A race missing for a city comes out of the pivot as null; store 0 instead.
for pivot_name, snake_name in race_renames:
    df = df.withColumn(snake_name, coalesce(df[pivot_name], lit(0)))

# 'Asian' and 'White' were overwritten in place by the loop above, so only the
# three pivot columns whose new name really differs are left to remove.
df = df.drop(
    'American Indian and Alaska Native',
    'Black or African-American',
    'Hispanic or Latino',
)

In [9]:
# Sort by state code first, then by city name within each state.
sort_keys = ['state_code', 'city']
df = df.orderBy(sort_keys)

Save DataFrame as .parquet in s3a://us-immigration-project or locally.

In [10]:
# Both writes are disabled by default; uncomment exactly one of them.
# In each call 'overwrite' is the save mode, so an existing output directory
# is replaced.

# write to S3 bucket
# (needs the S3A credentials cell near the top of the notebook to be enabled)
# output_data = 's3a://us-immigration-project/'
# df.write.parquet(os.path.join(output_data, 'us_cities_demographics'), 'overwrite')

# for local write
# df.write.parquet('./output/us_cities_demographics', 'overwrite')

## 2. Dataset Info

DataFrame Schema:

In [11]:
# Print each column's name, data type and nullability for the final DataFrame.
df.printSchema()

root
 |-- city: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- state: string (nullable = true)
 |-- total_population: integer (nullable = true)
 |-- male_population: integer (nullable = true)
 |-- female_population: integer (nullable = true)
 |-- median_age: double (nullable = true)
 |-- average_household_size: double (nullable = true)
 |-- number_of_veterans: integer (nullable = true)
 |-- foreign_born: integer (nullable = true)
 |-- asian: integer (nullable = false)
 |-- white: integer (nullable = false)
 |-- american_indian_and_alaska_native: integer (nullable = false)
 |-- black_or_african_american: integer (nullable = false)
 |-- hispanic_or_latino: integer (nullable = false)



Sample DataFrame records:

In [12]:
# Two records are enough to illustrate the layout; the vertical format keeps
# the wide rows readable.
df.show(2, truncate=False, vertical=True)

-RECORD 0---------------------------------------
 city                              | ANCHORAGE  
 state_code                        | AK         
 state                             | Alaska     
 total_population                  | 298695     
 male_population                   | 152945     
 female_population                 | 145750     
 median_age                        | 32.2       
 average_household_size            | 2.77       
 number_of_veterans                | 27492      
 foreign_born                      | 33258      
 asian                             | 36825      
 white                             | 212696     
 american_indian_and_alaska_native | 36339      
 black_or_african_american         | 23107      
 hispanic_or_latino                | 27261      
-RECORD 1---------------------------------------
 city                              | BIRMINGHAM 
 state_code                        | AL         
 state                             | Alabama    
 total_population   

DataFrame rows and columns:

In [13]:
# Report the DataFrame shape as "rows: <n> columns: <m>".
print('rows: {} columns: {}'.format(df.count(), len(df.columns)))

rows: 596 columns: 15


## 3. Data Quality Check

Check for nulls in each column:

In [14]:
# One aggregate per column: when() yields a non-null value only for rows where
# the column is null, and count() counts exactly those non-null values.
null_counts = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df.columns
]
df.select(null_counts).show(truncate=False, vertical=True)

-RECORD 0--------------------------------
 city                              | 0   
 state_code                        | 0   
 state                             | 0   
 total_population                  | 0   
 male_population                   | 1   
 female_population                 | 1   
 median_age                        | 0   
 average_household_size            | 8   
 number_of_veterans                | 7   
 foreign_born                      | 7   
 asian                             | 0   
 white                             | 0   
 american_indian_and_alaska_native | 0   
 black_or_african_american         | 0   
 hispanic_or_latino                | 0   



Check that Total Population = Male Population + Female Population in all cities:

In [15]:
# Number of cities where male + female differs from the total population.
# A row whose male_population or female_population is null makes the
# comparison null, so the filter skips it and it is not counted as a mismatch.
gender_gap = df.male_population + df.female_population - df.total_population
df.where(gender_gap != 0).count()

0

Check that Total Population = Asian + White + American Indian and Alaska Native + Black + Hispanic in all cities:

In [16]:
# Sum of the five per-race populations for each city.
race_total = (
    df['asian']
    + df['white']
    + df['american_indian_and_alaska_native']
    + df['black_or_african_american']
    + df['hispanic_or_latino']
)

# Number of cities where that sum differs from the total population.
df.where(race_total - df['total_population'] != 0).count()

596

In none of the 596 cities does Total Population equal the sum of the per-race populations. One can speculate that the race demographics come from a different census.

In [17]:
# Absolute gap between the summed per-race populations and the total.
race_gap = absolute(
    df['asian']
    + df['white']
    + df['american_indian_and_alaska_native']
    + df['black_or_african_american']
    + df['hispanic_or_latino']
    - df['total_population']
)

# Count the cities where the gap exceeds 10% of the total population.
ten_percent_of_total = 0.1 * df['total_population']
df.where(race_gap > ten_percent_of_total).count()

415

In most cases this difference is greater than 10% of the total population.

## 4. Primary Key & Columns Related to Other Datasets

Total Cities in DataFrame:

In [18]:
# Count the distinct (city, state) pairs. Without distinct() this would just
# repeat the row count and could not show whether (city, state) is unique;
# a result equal to the DataFrame's row count means the pair can serve as the
# primary key. Sorting is unnecessary for a count and has been dropped.
df.select('city', 'state').distinct().count()

596

Display all cities and the state each belongs to:

In [19]:
# 'state' is a secondary sort key so that cities sharing a name always appear
# in the same order across runs.
cities = df.select('city', 'state').orderBy('city', 'state')

# Show every row: the limit is computed rather than hard-coded, so the listing
# is not silently truncated if the input data changes.
cities.show(cities.count(), truncate=False)

+-----------------------------------------------+--------------------+
|city                                           |state               |
+-----------------------------------------------+--------------------+
|ABILENE                                        |Texas               |
|AKRON                                          |Ohio                |
|ALAFAYA                                        |Florida             |
|ALAMEDA                                        |California          |
|ALBANY                                         |Georgia             |
|ALBANY                                         |New York            |
|ALBUQUERQUE                                    |New Mexico          |
|ALEXANDRIA                                     |Virginia            |
|ALHAMBRA                                       |California          |
|ALLEN                                          |Pennsylvania        |
|ALLEN                                          |Texas               |
|AMARI

Some cities have the same name but are in different states.

Total states in DataFrame:

In [20]:
# Number of distinct (state, state_code) pairs; row order does not affect a
# count, so no sort is applied here.
state_pairs = df.select('state', 'state_code').distinct()
state_pairs.count()

49

Display all states and state codes:

In [21]:
# Distinct (state, state_code) pairs in alphabetical order of state name.
states = df.select('state', 'state_code').distinct().orderBy('state')

# Show every row: the limit is computed rather than hard-coded, so the listing
# is not silently truncated if the input data changes.
states.show(states.count(), truncate=False)

+--------------------+----------+
|state               |state_code|
+--------------------+----------+
|Alabama             |AL        |
|Alaska              |AK        |
|Arizona             |AZ        |
|Arkansas            |AR        |
|California          |CA        |
|Colorado            |CO        |
|Connecticut         |CT        |
|Delaware            |DE        |
|District of Columbia|DC        |
|Florida             |FL        |
|Georgia             |GA        |
|Hawaii              |HI        |
|Idaho               |ID        |
|Illinois            |IL        |
|Indiana             |IN        |
|Iowa                |IA        |
|Kansas              |KS        |
|Kentucky            |KY        |
|Louisiana           |LA        |
|Maine               |ME        |
|Maryland            |MD        |
|Massachusetts       |MA        |
|Michigan            |MI        |
|Minnesota           |MN        |
|Mississippi         |MS        |
|Missouri            |MO        |
|Montana      