In [6]:
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.exceptions import NotFound
import pandas as pd
import os      
import datetime as dt

data_dir = 'dsa-airflow/data'

cpi_file = 'US-CPI.csv'
unemp_file = 'USUnemployment.csv'

In [3]:
# read and rename unemployment file
unemp = pd.read_csv(os.path.join(data_dir, unemp_file), header=0)
unemp.columns = unemp.columns.str.lower()


# calculate the avg yearly unemp. rate
columns = ['jan',	'feb',	'mar',	'apr',	'may'	,'jun',	'jul',	'aug',	'sep',	'oct',	'nov',	'dec']
unemp['avg_unemp_per_year']  = unemp[columns].mean(axis=1)

unemp.to_csv(os.path.join(data_dir, 'unemp.csv'), header=True, index=False)

print(unemp.head(5))
print(unemp.dtypes)
# unemp.dtypes


   year  jan  feb  mar  apr  may  jun  jul  aug  sep  oct  nov  dec  \
0  1948  3.4  3.8  4.0  3.9  3.5  3.6  3.6  3.9  3.8  3.7  3.8  4.0   
1  1949  4.3  4.7  5.0  5.3  6.1  6.2  6.7  6.8  6.6  7.9  6.4  6.6   
2  1950  6.5  6.4  6.3  5.8  5.5  5.4  5.0  4.5  4.4  4.2  4.2  4.3   
3  1951  3.7  3.4  3.4  3.1  3.0  3.2  3.1  3.1  3.3  3.5  3.5  3.1   
4  1952  3.2  3.1  2.9  2.9  3.0  3.0  3.2  3.4  3.1  3.0  2.8  2.7   

   avg_unemp_per_year  
0            3.750000  
1            6.050000  
2            5.208333  
3            3.283333  
4            3.025000  
year                    int64
jan                   float64
feb                   float64
mar                   float64
apr                   float64
may                   float64
jun                   float64
jul                   float64
aug                   float64
sep                   float64
oct                   float64
nov                   float64
dec                   float64
avg_unemp_per_year    float64
dtype: ob

In [9]:
# read and rename cpi file / parse_dates=['Yearmon']
cpi  = pd.read_csv(os.path.join(data_dir, cpi_file), header=0)
cpi = cpi.rename(columns={'Yearmon': 'year', 'CPI': 'cpi'})
cpi[['month', 'date', 'year']] = cpi.year.str.split("-", expand=True)

# calculate the avg yearly cpi rate
cpi['avg_cpi_per_year'] = cpi.groupby('year')['cpi'].transform('mean')
cpi[['month','year','date']] = cpi[['month','year','date']].astype('int')

#write cpi to_csv
cpi.to_csv(os.path.join(data_dir, 'cpi.csv'), header=True, index=False)

print(cpi.head(5))
print(cpi.dtypes)

   year  cpi  month  date  avg_cpi_per_year
0  1913  9.8      1     1          9.883333
1  1913  9.8      1     2          9.883333
2  1913  9.8      1     3          9.883333
3  1913  9.8      1     4          9.883333
4  1913  9.7      1     5          9.883333
year                  int64
cpi                 float64
month                 int64
date                  int64
avg_cpi_per_year    float64
dtype: object


In [45]:
# inflation_rate (btw current year and last year in %)
def calculate_inflation(row):
    final_inflation_rate = ((row['difference_cpi_btw_2years']/ row['avg_cpi_shifted']) * 100)
    return final_inflation_rate

In [47]:
# make copy of cpi and drop duplicate
inflation_df = cpi[['year', 'avg_cpi_per_year']].drop_duplicates()

# calculate difference btw 2 rows
inflation_df['difference_cpi_btw_2years'] = inflation_df['avg_cpi_per_year'].diff()
inflation_df['avg_cpi_shifted'] = inflation_df['avg_cpi_per_year'].shift(1) 
inflation_df['final_inflation_rate'] = inflation_df.apply(calculate_inflation, axis=1)

inflation_df

Unnamed: 0,year,avg_cpi_per_year,difference_cpi_btw_2years,avg_cpi_shifted,final_inflation_rate
0,1913,9.883333,,,
12,1914,10.016667,0.133333,9.883333,1.349073
24,1915,10.108333,0.091667,10.016667,0.915141
36,1916,10.883333,0.775000,10.108333,7.666941
48,1917,12.825000,1.941667,10.883333,17.840735
...,...,...,...,...,...
1248,2017,245.119583,5.112417,240.007167,2.130110
1260,2018,251.106833,5.987250,245.119583,2.442583
1272,2019,255.657417,4.550583,251.106833,1.812210
1284,2020,258.811167,3.153750,255.657417,1.233584


In [4]:
from google.cloud import bigquery
from google.oauth2 import service_account

# create full table id
PROJECT_ID = "team-week-3"
DATASET_ID = "tech_stocks_world_events"

# create bigquery client
# key_path = os.path.expanduser("/home/chloe_ycl/.creds/team-week-3.json")
# credentials = service_account.Credentials.from_service_account_file(key_path, 
#                                                               scopes=["https://www.googleapis.com/auth/cloud-platform"])

client = bigquery.Client(project=PROJECT_ID)

print("Successfully created a BiqQuery client")
print(f"Project: {client.project}")

# create dataset
dataset_id = "{}.tech_stocks_world_events".format(client.project)

#construct dataset obj to send to API
dataset = bigquery.Dataset(dataset_id)

#specify the location
dataset.location = "US"

dataset = client.create_dataset(dataset, timeout=30, exists_ok=True) #Make an API request
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

Successfully created a BiqQuery client
Project: team-week-3
Created dataset team-week-3.tech_stocks_world_events


In [5]:
# funtion to load table from dataframes

def load_table(
    df: pd.DataFrame, 
    client: bigquery.Client, 
    table_name: str, 
    schema: bigquery.SchemaField,
    create_disposition: str = 'CREATE_IF_NEEDED', 
    write_disposition: str = 'WRITE_TRUNCATE'
    ) -> None:
    
    job_config = bigquery.LoadJobConfig(
        create_disposition=create_disposition,
        write_disposition=write_disposition,
        schema=schema
    )
    
    job = client.load_table_from_dataframe(df, destination=table_name, job_config=job_config)
    job.result()        # wait for the job to finish



# create our own bigquery schema
CPI_METADATA = {
    'cpi_rates': {
        'table_name': 'cpi_rates',
        'schema' : [
            bigquery.SchemaField("year", "INTEGER", mode="REQUIRED"),
            bigquery.SchemaField("cpi", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("month", "INTEGER", mode="REQUIRED"),
            bigquery.SchemaField("date", "INTEGER", mode="REQUIRED"),
            bigquery.SchemaField("avg_cpi_per_year", "FLOAT", mode="REQUIRED"),]
    }}

UNEMP_METADATA = {
    'unemployment_rates' :{
        'table_name': 'unemployment_rates',
        'schema': [
            bigquery.SchemaField("year", "INTEGER", mode="REQUIRED"),
            bigquery.SchemaField("jan", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("feb", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("mar", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("apr", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("may", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("jun", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("jul", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("aug", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("sep", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("oct", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("nov", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("dec", "FLOAT", mode="REQUIRED"),
            bigquery.SchemaField("avg_unemp_per_year", "FLOAT", mode="REQUIRED")]
    }}

In [7]:
# load to bq
table_name = f"{PROJECT_ID}.{DATASET_ID}.{CPI_METADATA['cpi_rates']['table_name']}"
schema = CPI_METADATA['cpi_rates']['schema']
load_table(cpi, client, table_name, schema)

table_name = f"{PROJECT_ID}.{DATASET_ID}.{UNEMP_METADATA['unemployment_rates']['table_name']}"
schema = UNEMP_METADATA['unemployment_rates']['schema']
load_table(unemp, client, table_name, schema)

In [9]:
# query Stock file from bq
STOCK_DATASET_ID = 'tech_stocks_world_events'

stocks = f""" SELECT * FROM {PROJECT_ID}.{STOCK_DATASET_ID}.stocks """

result = client.query(stocks)

# read to dataframe
df = result.to_dataframe()

# read to csv file
stock_file = 'stocks.csv'
df.to_csv(os.path.join(data_dir, stock_file), header=True, index=False)

# read stocks.csv
stocks_df = pd.read_csv(os.path.join(data_dir, stock_file), header=0)

stocks_df


Unnamed: 0,stock_name,year,month,day,date,open,high,low,close,adj_close,volume,sd_id
0,AAPL,2022,6,1,2022-06-01 00:00:00+00:00,149.899994,151.740005,147.679993,148.710007,148.257782,74286600,AAPL2022-06-01
1,ADBE,2022,6,1,2022-06-01 00:00:00+00:00,428.000000,437.549988,413.790009,418.160004,418.160004,3409900,ADBE2022-06-01
2,AMZN,2022,6,1,2022-06-01 00:00:00+00:00,122.255997,125.179001,120.622498,121.683998,121.683998,127528000,AMZN2022-06-01
3,CRM,2022,6,1,2022-06-01 00:00:00+00:00,178.009995,184.419998,174.369995,176.070007,176.070007,37037400,CRM2022-06-01
4,CSCO,2022,6,1,2022-06-01 00:00:00+00:00,45.549999,45.799999,44.770000,45.230000,44.061939,20666000,CSCO2022-06-01
...,...,...,...,...,...,...,...,...,...,...,...,...
47766,NFLX,2017,1,31,2017-01-31 00:00:00+00:00,140.550003,141.830002,139.699997,140.710007,140.710007,4411600,NFLX2017-01-31
47767,NVDA,2017,1,31,2017-01-31 00:00:00+00:00,27.237499,27.512501,27.049999,27.295000,26.914753,36275600,NVDA2017-01-31
47768,ORCL,2017,1,31,2017-01-31 00:00:00+00:00,40.209999,40.250000,39.669998,40.110001,36.346123,10766700,ORCL2017-01-31
47769,TSLA,2017,1,31,2017-01-31 00:00:00+00:00,16.615999,17.059334,16.513332,16.795334,16.795334,61741500,TSLA2017-01-31


In [None]:
from airflow.sensors.filesystem import FileSensor
from airflow.hooks.filesystem import FSHook


# do transformation for unemployment_df, cpi_df
def cpi_transformation():
    # read and rename cpi file / parse_dates=['Yearmon']
    cpi  = pd.read_csv(os.path.join(data_dir, cpi_file), header=0)
    cpi = cpi.rename(columns={'Yearmon': 'year', 'CPI': 'cpi'})
    cpi[['month', 'date', 'year']] = cpi.year.str.split("-", expand=True)

    # calculate the avg yearly cpi rate
    cpi['avg_cpi_per_year'] = cpi.groupby('year')['cpi'].transform('mean')
    cpi[['month','year','date']] = cpi[['month','year','date']].astype('int')
    print('done')


def unemp_transformation():
    # read and rename unemployment file
    unemp = pd.read_csv(os.path.join(data_dir, unemp_file), header=0)
    unemp.columns = unemp.columns.str.lower()

    # calculate the avg yearly unemp. rate
    columns = ['jan',	'feb',	'mar',	'apr',	'may'	,'jun',	'jul',	'aug',	'sep',	'oct',	'nov',	'dec']
    unemp['avg_unemp_per_year']  = unemp[columns].mean(axis=1)
    print('done')
