Weather data is from https://www.weather.gov/sgx/cliplot

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import os
import datetime
import time

# Directory layout. Every value is a string prefix ending in '/', so the
# pieces can be glued together with plain '+'.
root = "/root/"                # base directory; the two below are appended to it
raw = "Foundation/Raw Data/"   # scraped, unprocessed inputs
clean = "Clean_Data/"          # cleaned outputs

In [54]:
def _read_monthly_report(file_path, location):
  """Parse one scraped monthly climate CSV into a per-day DataFrame.

  The parsing is positional and relies on this layout of the file:
    * cell [0, 1] is a title whose second space-separated token is 'MM/YYYY';
    * columns 1-5 hold the date, observed low/high and normal low/high;
    * inside those columns, row 1 is the header row and the last row holds
      the average of each column.

  Parameters
  ----------
  file_path : str
      Path of the CSV file to read.
  location : str
      Station name written into the 'location' column.

  Returns
  -------
  pandas.DataFrame
      The five report columns plus 'location', 'Month' and 'Year'.
      Values are left exactly as read; no type conversion happens here.
  """
  report = pd.read_csv(file_path)

  # 'MM/YYYY' token of the title -> [month, year]
  month_year = report.iloc[0, 1].split(' ')[1].split('/')

  # grab only the date, observed low/high, and normal low/high
  table = report.iloc[1:, 1:6]

  # Drop the header row (first) and the averages row (last).  .copy() makes
  # this an independent frame, so the column assignments below never write
  # into a slice of `report`.
  days = table.iloc[1:-1].copy()

  # The first row of the table carries the real column names.  Assign them by
  # position so this works whatever labels read_csv gave the columns.
  days.columns = table.iloc[0].tolist()

  days['location'] = location
  days['Month'] = month_year[0]
  days['Year'] = month_year[1]
  return days


dfs = []

# Layout on disk: <weather_folder_path>/<location>/<sub-folder>/<monthly csv>
weather_folder_path = os.path.join(root, raw, 'Scraped_Gov_Weather')
weather_folders = os.listdir(weather_folder_path)

for folder in weather_folders:
  parent_dir = os.path.join(weather_folder_path, folder)
  if not os.path.isdir(parent_dir):
    continue  # stray file next to the location folders

  for child_folder in os.listdir(parent_dir):
    child_dir = os.path.join(parent_dir, child_folder)
    if not os.path.isdir(child_dir):
      continue

    for file_ in os.listdir(child_dir):
      file_path = os.path.join(child_dir, file_)
      if not os.path.isfile(file_path):
        continue

      # the location name comes from the top-level folder
      dfs.append(_read_monthly_report(file_path, location=folder))

# one long frame with a fresh 0..n-1 index
weather_df = pd.concat(dfs, ignore_index=True)
weather_df.head(3)

Unnamed: 0,Date,ObservedLow(F),ObservedHigh(F),NormalLow(F),NormalHigh(F),location,Month,Year
0,1,52,84,45,84,Campo,10,2016
1,2,44,76,45,84,Campo,10,2016
2,3,33,71,45,84,Campo,10,2016


In [55]:
# Shorten the report's column headers to names that are easier to work with
short_names = {
    'ObservedLow(F)': 'Low',
    'ObservedHigh(F)': 'High',
    'NormalLow(F)': 'NormLow',
    'NormalHigh(F)': 'NormHigh',
}
weather_df = weather_df.rename(columns=short_names)

weather_df.head(3)

Unnamed: 0,Date,Low,High,NormLow,NormHigh,location,Month,Year
0,1,52,84,45,84,Campo,10,2016
1,2,44,76,45,84,Campo,10,2016
2,3,33,71,45,84,Campo,10,2016


In [56]:
# Peek at the first three rows of the combined frame
weather_df.iloc[:3]

Unnamed: 0,Date,Low,High,NormLow,NormHigh,location,Month,Year
0,1,52,84,45,84,Campo,10,2016
1,2,44,76,45,84,Campo,10,2016
2,3,33,71,45,84,Campo,10,2016


In [57]:
# Distinct station names in the combined frame (built from a set, so the
# order of the list is arbitrary)
locations = [*set(weather_df['location'])]
locations

['linbergh_field',
 'Oceanside Airport',
 'Campo',
 'San Diego Mont. Field',
 'San Diego Brown Field',
 'Ramona',
 'Carlsbad']

In [58]:
# Summary of every column.  All columns are still non-numeric at this point,
# so describe() reports count / unique / top / freq instead of mean, min, max.
weather_df.describe()

Unnamed: 0,Date,Low,High,NormLow,NormHigh,location,Month,Year
count,25571,25571,25571,25571,25571,25571,25571,25571
unique,31,66,73,37,34,7,12,10
top,22,M,M,44,66,San Diego Brown Field,1,2020
freq,840,1498,1498,1210,3046,3659,2449,2921


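Interesting: the "top" row (third row down) shows that the most frequent value in both the Low and High columns is `M` rather than a temperature (presumably the source's marker for a missing observation).  Let's fix that.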

In [59]:
# Rows whose observed high is the literal string 'M'
is_high_M = weather_df['High'].eq('M')
high_M = weather_df.loc[is_high_M]
high_M.head()

Unnamed: 0,Date,Low,High,NormLow,NormHigh,location,Month,Year
31,1,M,M,39,75,Campo,11,2016
32,2,M,M,39,74,Campo,11,2016
33,3,M,M,39,74,Campo,11,2016
34,4,M,M,39,74,Campo,11,2016
35,5,M,M,38,74,Campo,11,2016


In [60]:
# clean up those Ms!
def _fill_from_normal(observed, normal):
  """Replace 'M' readings in `observed` with the matching value from `normal`.

  Where both the observed and the normal value are 'M', the result is NaN.
  Every other observed value is passed through unchanged.  Both arguments are
  Series sharing the same index; a new Series is returned.
  """
  # keep the observed value unless it is 'M', in which case take the normal one
  filled = observed.where(observed != 'M', normal)
  # anything still 'M' here had no usable normal value either
  return filled.mask(filled == 'M', np.nan)


new_low = _fill_from_normal(weather_df['Low'], weather_df['NormLow'])
new_high = _fill_from_normal(weather_df['High'], weather_df['NormHigh'])

# keep only the identifying columns, then attach the cleaned readings
weather_df = weather_df.loc[:, ['Date', 'location', 'Month', 'Year']].copy()
weather_df['High'] = new_high
weather_df['Low'] = new_low

weather_df.head(3)

Unnamed: 0,Date,location,Month,Year,High,Low
0,1,Campo,10,2016,84,52
1,2,Campo,10,2016,76,44
2,3,Campo,10,2016,71,33


In [61]:
# Sanity check: no row should still have 'M' as its high
weather_df.loc[weather_df['High'].eq('M')]

Unnamed: 0,Date,location,Month,Year,High,Low


In [62]:
# Sanity check: no row should still have 'M' as its low
weather_df.loc[weather_df['Low'].eq('M')]

Unnamed: 0,Date,location,Month,Year,High,Low


In [63]:
# Show any rows whose low ended up as NaN
low_is_nan = weather_df['Low'].isna()
weather_df[low_is_nan]

Unnamed: 0,Date,location,Month,Year,High,Low


In [64]:
# Show any rows whose high ended up as NaN
high_is_nan = weather_df['High'].isna()
weather_df[high_is_nan]

Unnamed: 0,Date,location,Month,Year,High,Low


## Data Prep Continuation

In [65]:
# Cast every column except 'location' to integers
int_columns = ['Date', 'Month', 'Year', 'Low', 'High']
weather_df = weather_df.astype(dict.fromkeys(int_columns, 'int'))

weather_df.dtypes

Date         int64
location    object
Month        int64
Year         int64
High         int64
Low          int64
dtype: object

In [66]:
# Lowest daily low for each (Year, Month, location)
group_keys = ['Year', 'Month', 'location']
min_df = weather_df.groupby(group_keys)[['Low']].min()

min_df.head(14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Low
Year,Month,location,Unnamed: 3_level_1
2012,1,Campo,23
2012,1,Carlsbad,41
2012,1,Oceanside Airport,31
2012,1,Ramona,22
2012,1,San Diego Brown Field,43
2012,1,San Diego Mont. Field,38
2012,1,linbergh_field,43
2012,2,Campo,23
2012,2,Carlsbad,40
2012,2,Oceanside Airport,34


In [67]:
# Last 14 rows of the monthly minimums
min_df.iloc[-14:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Low
Year,Month,location,Unnamed: 3_level_1
2021,10,San Diego Mont. Field,47
2021,10,linbergh_field,50
2021,11,Carlsbad,46
2021,11,Oceanside Airport,37
2021,11,Ramona,32
2021,11,San Diego Brown Field,42
2021,11,San Diego Mont. Field,47
2021,11,linbergh_field,45
2021,12,Carlsbad,40
2021,12,Oceanside Airport,31


In [68]:
# Highest daily high for each (Year, Month, location)
monthly_groups = weather_df.groupby(['Year', 'Month', 'location'])
max_df = monthly_groups[['High']].max()

max_df.head(14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,High
Year,Month,location,Unnamed: 3_level_1
2012,1,Campo,81
2012,1,Carlsbad,81
2012,1,Oceanside Airport,82
2012,1,Ramona,86
2012,1,San Diego Brown Field,77
2012,1,San Diego Mont. Field,85
2012,1,linbergh_field,83
2012,2,Campo,76
2012,2,Carlsbad,80
2012,2,Oceanside Airport,79


In [69]:
# Last 14 rows of the monthly maximums
max_df.iloc[-14:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,High
Year,Month,location,Unnamed: 3_level_1
2021,10,San Diego Mont. Field,95
2021,10,linbergh_field,90
2021,11,Carlsbad,90
2021,11,Oceanside Airport,95
2021,11,Ramona,92
2021,11,San Diego Brown Field,91
2021,11,San Diego Mont. Field,93
2021,11,linbergh_field,92
2021,12,Carlsbad,67
2021,12,Oceanside Airport,69


In [70]:
# Mean of every remaining column for each (Year, Month, location).
# Note that 'Date' (the day of the month) gets averaged too and stays in the
# frame under its original name.
avg_df = (
    weather_df
    .groupby(['Year', 'Month', 'location'])
    .mean()
    .rename(columns={'Low': 'AvgLow', 'High': 'AvgHigh'})
)

avg_df.head(14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date,AvgHigh,AvgLow
Year,Month,location,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012,1,Campo,16.0,65.129032,33.774194
2012,1,Carlsbad,16.0,67.903226,47.322581
2012,1,Oceanside Airport,16.0,68.451613,37.580645
2012,1,Ramona,16.0,70.870968,32.774194
2012,1,San Diego Brown Field,16.0,66.225806,44.0
2012,1,San Diego Mont. Field,16.0,70.032258,44.516129
2012,1,linbergh_field,16.0,67.354839,49.032258
2012,2,Campo,15.0,60.103448,32.413793
2012,2,Carlsbad,15.0,63.896552,47.965517
2012,2,Oceanside Airport,15.0,65.965517,40.482759


In [71]:
# Last 14 rows of the monthly averages
avg_df.iloc[-14:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date,AvgHigh,AvgLow
Year,Month,location,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021,10,San Diego Mont. Field,16.0,76.580645,56.096774
2021,10,linbergh_field,16.0,75.096774,58.419355
2021,11,Carlsbad,15.5,70.533333,54.1
2021,11,Oceanside Airport,15.5,73.633333,46.966667
2021,11,Ramona,15.5,79.466667,40.2
2021,11,San Diego Brown Field,15.5,75.366667,50.533333
2021,11,San Diego Mont. Field,15.5,76.2,53.633333
2021,11,linbergh_field,15.5,73.133333,55.366667
2021,12,Carlsbad,16.0,60.645161,47.580645
2021,12,Oceanside Airport,16.0,63.032258,41.870968


In [72]:
# Put the monthly minimum low and maximum high side by side, matching rows on
# the three shared index levels
merge_keys = ['Year', 'Month', 'location']
final_weather_df = min_df.merge(max_df, on=merge_keys)
final_weather_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Low,High
Year,Month,location,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,1,Campo,23,81
2012,1,Carlsbad,41,81
2012,1,Oceanside Airport,31,82
2012,1,Ramona,22,86
2012,1,San Diego Brown Field,43,77


In [73]:
# Add the monthly averages, turn the index levels back into ordinary columns,
# and discard the averaged day-of-month column
with_averages = final_weather_df.merge(avg_df, on=['Year', 'Month', 'location'])
final_weather_df2 = with_averages.reset_index().drop(columns=['Date'])

final_weather_df2.head(3)

Unnamed: 0,Year,Month,location,Low,High,AvgHigh,AvgLow
0,2012,1,Campo,23,81,65.129032,33.774194
1,2012,1,Carlsbad,41,81,67.903226,47.322581
2,2012,1,Oceanside Airport,31,82,68.451613,37.580645


In [74]:
# Build a 'Month/Year' string (e.g. '1/2012') for every row
date_columns = ['Month','Year']
month_text, year_text = (final_weather_df2[col].astype(str) for col in date_columns)
final_weather_df2['Full_date'] = month_text + '/' + year_text
final_weather_df2.head(3)

Unnamed: 0,Year,Month,location,Low,High,AvgHigh,AvgLow,Full_date
0,2012,1,Campo,23,81,65.129032,33.774194,1/2012
1,2012,1,Carlsbad,41,81,67.903226,47.322581,1/2012
2,2012,1,Oceanside Airport,31,82,68.451613,37.580645,1/2012


In [75]:
# Full_date holds 'M/YYYY' strings such as '1/2012'.  Pass the exact format so
# pandas does not have to guess the layout; with no day in the string, each
# value becomes the first day of its month.
final_weather_df2['Full_date'] = pd.to_datetime(final_weather_df2['Full_date'], format='%m/%Y')
final_weather_df2.dtypes

Year                  int64
Month                 int64
location             object
Low                   int64
High                  int64
AvgHigh             float64
AvgLow              float64
Full_date    datetime64[ns]
dtype: object

In [76]:
clean_data_dir = root + clean

# Create the output folder if it is not there yet, so the write below also
# works on a fresh environment.
os.makedirs(clean_data_dir, exist_ok=True)

# gzip-compressed parquet, written without the DataFrame index
final_weather_df2.to_parquet(clean_data_dir + 'weather.parquet.gzip', compression='gzip', index=False)

In [77]:
# Completion marker for the data-prep stage
print("Weather Data is READY TO ROCK!!!!")

Weather Data is READY TO ROCK!!!!


In [78]:
import boto3
import sagemaker

# Destination bucket: whatever the SageMaker session reports as its default
# (to target a specific bucket instead, assign its name, e.g. bucket = "my-bucket")
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Local file name and the object key it is stored under in the bucket
file_name = 'weather.parquet.gzip'
file_path = 'Clean_Data/' + file_name

# Upload the cleaned weather parquet file to S3
s3_client = boto3.client("s3")
s3_client.upload_file(Filename=clean_data_dir + file_name, Bucket=bucket, Key=file_path)