

# Importing Modules

In [1]:
import re

import numpy as np
import pandas as pd

import findspark

# findspark.init() has to run BEFORE pyspark is imported for the first time:
# its purpose is to point Python at the given Spark installation so that
# `import pyspark` resolves against it.
# NOTE(review): this is a machine-specific absolute path; consider reading it
# from the SPARK_HOME environment variable so the notebook runs elsewhere.
findspark.init('D:\\Spark\\')

import pyspark
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf, trim
from pyspark.sql.types import StringType, IntegerType

# Creating SparkSession

In [2]:
# Build (or reuse) a local-mode Spark session for this notebook
spark = (
    SparkSession.builder
    .master('local')
    .appName('Udacity')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook displays the session object created above
spark

# Loading and Inspecting Dataframe

In [4]:
# The source file is semicolon-separated and has a header row
df = (
    spark.read
    .option('delimiter', ';')
    .option('header', True)
    .csv('data/us-cities-demographics.csv')
)

In [5]:
# Print column names and types; no schema is supplied or inferred on load,
# so every column is read as a string
df.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Median Age: string (nullable = true)
 |-- Male Population: string (nullable = true)
 |-- Female Population: string (nullable = true)
 |-- Total Population: string (nullable = true)
 |-- Number of Veterans: string (nullable = true)
 |-- Foreign-born: string (nullable = true)
 |-- Average Household Size: string (nullable = true)
 |-- State Code: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Count: string (nullable = true)



In [6]:
# Preview the first rows of the raw frame
df.show()

+----------------+--------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+
|            City|         State|Median Age|Male Population|Female Population|Total Population|Number of Veterans|Foreign-born|Average Household Size|State Code|                Race| Count|
+----------------+--------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+
|   Silver Spring|      Maryland|      33.8|          40601|            41862|           82463|              1562|       30908|                   2.6|        MD|  Hispanic or Latino| 25924|
|          Quincy| Massachusetts|      41.0|          44129|            49500|           93629|              4147|       32935|                  2.39|        MA|               White| 58723|
|          Hoover|       Alabama|      38.5|      

In [7]:
# A city appears once per race, with only Race/Count differing between rows;
# New York is used here as an example
df.filter(f.col('City') == 'New York').show()

+--------+--------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+-------+
|    City|   State|Median Age|Male Population|Female Population|Total Population|Number of Veterans|Foreign-born|Average Household Size|State Code|                Race|  Count|
+--------+--------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+-------+
|New York|New York|      36.0|        4081698|          4468707|         8550405|            156961|     3212500|                  2.68|        NY|               White|3835726|
|New York|New York|      36.0|        4081698|          4468707|         8550405|            156961|     3212500|                  2.68|        NY|               Asian|1304564|
|New York|New York|      36.0|        4081698|          4468707|         8550405|            156961|     3212500|  

# Cleaning Data 

+ ~~Pivot of Race; thus remove duplicates~~ 
+ ~~Replace nulls with zero~~ 
+ ~~Join both frames~~
+ ~~Drop Columns~~

In [8]:
# Build one row per (City, State) with one integer column per race:
#   1. pivot the Race values into columns, summing Count for each
#   2. add the 'State_City' join key
#   3. drop the original City/State columns (only the key is needed)
#   4. races with no row for a city come out null -> replace with 0
col_to_drop = ['City', 'State']
df_grp = (
    df.groupBy('City', 'State')
    .pivot('Race')
    .agg(f.sum('Count').cast(IntegerType()))
    .withColumn('StateCity', f.concat(f.col('State'), f.lit('_'), f.col('City')))
    .drop(*col_to_drop)
    .fillna(0)
)

In [9]:
# Add the same 'State_City' key to the original frame
df = df.withColumn('StateCity', f.concat(f.col('State'), f.lit('_'), f.col('City')))
# Left-join the pivoted race counts on that key, then discard the right-hand
# copy of the key so only one StateCity column remains
key_matches = df.StateCity == df_grp.StateCity
df = df.join(df_grp, key_matches, how='left').drop(df_grp.StateCity)

In [10]:
# Drop the helper join key and the long-format Race/Count columns, which the
# pivoted per-race columns now replace
col_to_drop = ['StateCity', 'Race', 'Count']
# The left join kept one row per (city, race). With Race and Count removed
# those rows are identical, so collapse them to a single row per city;
# otherwise every city is written out several times.
df = df.drop(*col_to_drop).dropDuplicates()

In [11]:
# Preview the frame after the join and column drops
df.show()

+----------------+--------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+---------------------------------+------+-------------------------+------------------+------+
|            City|         State|Median Age|Male Population|Female Population|Total Population|Number of Veterans|Foreign-born|Average Household Size|State Code|American Indian and Alaska Native| Asian|Black or African-American|Hispanic or Latino| White|
+----------------+--------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+---------------------------------+------+-------------------------+------------------+------+
|   Silver Spring|      Maryland|      33.8|          40601|            41862|           82463|              1562|       30908|                   2.6|        MD|                             1084|  8841|                    21330|       

In [15]:
# Write the result as CSV with a header row, overwriting any previous output
df.write.mode('overwrite').option('header', True).csv('processed/demographics.csv')