In [8]:
import pandas as pd
import pyspark
import os 
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf      # sf = spark functions
import pyspark.sql.types as st          # st = spark types
import datetime as dt

# Reuse the active Spark session if there is one; otherwise start a new one
# whose master is set to 'local'.
spark = (
    SparkSession.builder
    .master('local')
    .getOrCreate()
)

# Alternative without an explicit master setting:
# spark = SparkSession.builder.getOrCreate()

In [6]:
# Location of the raw CSV inputs, relative to the notebook's working directory.
data_dir = './data'

cpi_file = 'US-CPI.csv'
unemp_file = 'USUnemployment.csv'

# Explicit DDL schemas so Spark does not need to infer column types.
# NOTE(review): both key columns are declared as `date`. The pandas cell in this
# notebook splits `Yearmon` on "-" with the year as the LAST part, and reads
# `Year` as a plain integer, so neither looks like an ISO `yyyy-MM-dd` value.
# Confirm these columns parse as intended (a `dateFormat=` option or a
# string/int type may be needed) before relying on them.
c_schema = 'Yearmon date, CPI float'
u_schema = 'Year date, Jan float ,Feb float ,Mar float,Apr float ,May float,Jun float ,Jul float ,Aug float ,Sep float,Oct float,Nov float,Dec float'

# Spark's CSV `header` option is a boolean flag. The pandas-style `header=0`
# is forwarded as the string "0", which is not a valid boolean for that option,
# so `header=True` is used to skip the header row of each file.
# printSchema() below only echoes the user-supplied schema; it does not read data.
cpi_df = spark.read.csv(
    os.path.join(data_dir, cpi_file),
    schema=c_schema,
    header=True,
    enforceSchema=True,
)
unemp_df = spark.read.csv(
    os.path.join(data_dir, unemp_file),
    schema=u_schema,
    header=True,
    enforceSchema=True,
)

cpi_df.printSchema()
unemp_df.printSchema()

root
 |-- Yearmon: date (nullable = true)
 |-- CPI: float (nullable = true)

root
 |-- Year: date (nullable = true)
 |-- Jan: float (nullable = true)
 |-- Feb: float (nullable = true)
 |-- Mar: float (nullable = true)
 |-- Apr: float (nullable = true)
 |-- May: float (nullable = true)
 |-- Jun: float (nullable = true)
 |-- Jul: float (nullable = true)
 |-- Aug: float (nullable = true)
 |-- Sep: float (nullable = true)
 |-- Oct: float (nullable = true)
 |-- Nov: float (nullable = true)
 |-- Dec: float (nullable = true)



In [9]:
# Lower-case every column name in both frames, then rename the CPI key column
# so both frames share a `year` column.
cpi_df = cpi_df.toDF(*[name.lower() for name in cpi_df.columns])
cpi_df = cpi_df.withColumnRenamed('yearmon', 'year')

unemp_df = unemp_df.toDF(*[name.lower() for name in unemp_df.columns])

# Average unemployment rate per year (one row per year, one column per month).
# Built from native column expressions instead of a Python UDF:
#   * months that are null are skipped rather than raising inside the UDF,
#     which mirrors pandas' `mean(axis=1)` used in the pandas cell;
#   * a row with no months at all yields null instead of a division by zero.
month_cols = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

month_total = sum(
    (sf.coalesce(sf.col(m).cast('double'), sf.lit(0.0)) for m in month_cols),
    sf.lit(0.0),
)
months_present = sum(
    (sf.col(m).isNotNull().cast('int') for m in month_cols),
    sf.lit(0),
)

unemp_df = unemp_df.withColumn(
    'avg_unemp_rate_per_year',
    sf.when(months_present > 0, month_total / months_present).cast('float'),
)

# TODO: average CPI per year is not computed on the Spark side yet. A grouped
# aggregate is needed for that, e.g.
#   cpi_df.groupBy('year').agg(sf.avg('cpi').alias('avg_cpi_per_year'))
# (grouping on the calendar year once `year` is confirmed to parse as a date).

cpi_df.printSchema()
unemp_df.printSchema()

root
 |-- year: date (nullable = true)
 |-- cpi: float (nullable = true)

root
 |-- year: date (nullable = true)
 |-- jan: float (nullable = true)
 |-- feb: float (nullable = true)
 |-- mar: float (nullable = true)
 |-- apr: float (nullable = true)
 |-- may: float (nullable = true)
 |-- jun: float (nullable = true)
 |-- jul: float (nullable = true)
 |-- aug: float (nullable = true)
 |-- sep: float (nullable = true)
 |-- oct: float (nullable = true)
 |-- nov: float (nullable = true)
 |-- dec: float (nullable = true)
 |-- avg_unemp_rate_per_year: float (nullable = true)



In [30]:
# --- CPI -------------------------------------------------------------------
# Read the CPI file and rename its columns.
cpi = (
    pd.read_csv(os.path.join(data_dir, cpi_file), header=0)
    .rename(columns={'Yearmon': 'year', 'CPI': 'cpi'})
)

# The original `Yearmon` value is split on "-" into three string parts; the
# last part overwrites `year`.
# NOTE(review): the part names 'month' and 'date' follow the original cell;
# confirm the file really is month-day-year and not day-month-year.
cpi[['month', 'date', 'year']] = cpi['year'].str.split('-', expand=True)

# `unemp['year']` is read as int64, so store the CPI year as int64 as well;
# otherwise the two frames cannot be joined/compared on `year`.
cpi['year'] = cpi['year'].astype('int64')

# --- Unemployment ------------------------------------------------------------
# Read the unemployment file and lower-case its column names.
unemp = pd.read_csv(os.path.join(data_dir, unemp_file), header=0)
unemp.columns = unemp.columns.str.lower()

# Average yearly unemployment rate: row-wise mean over the twelve month columns
# (pandas skips missing values by default).
columns = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
unemp['avg_unemp_per_year'] = unemp[columns].mean(axis=1)

# Average yearly CPI. Select the `cpi` column explicitly: the first positional
# argument of GroupBy.mean is `numeric_only`, not a column name.
avg_cpi_per_year = cpi.groupby('year', as_index=False)['cpi'].mean()

unemp.dtypes


year                    int64
jan                   float64
feb                   float64
mar                   float64
apr                   float64
may                   float64
jun                   float64
jul                   float64
aug                   float64
sep                   float64
oct                   float64
nov                   float64
dec                   float64
avg_unemp_per_year    float64
dtype: object

In [None]:
from google.cloud import bigquery

# Assemble the fully qualified table id as "<project>.<dataset>.<table>".
# NOTE(review): dataset_id is an empty string, so the assembled id currently
# has an empty middle segment — fill in the dataset name before using it.
project_id = "team-week-3"
dataset_id = ""
table_name = "titles"
table_id = ".".join((project_id, dataset_id, table_name))

