Weather is from https://www.weather.gov/sgx/cliplot

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import os
import datetime
import time

# directories
clean = 'Clean_Data/'
raw = 'Raw Data/'
root = '/root/'

In [5]:
dfs = []

weather_folder_path = root + raw + '/Scraped_Gov_Weather/'
weather_folders = os.listdir(weather_folder_path)

for folder in weather_folders:
  parent_dir = weather_folder_path + folder

  children_folders = os.listdir(parent_dir)

  for child_folder in children_folders:
    child_dir = parent_dir + '/' + f'{child_folder}'
    
    files = os.listdir(child_dir)

    for file_ in files:
      file_path = child_dir + '/' + file_
      df = pd.read_csv(file_path)


      date = df.iloc[0,1]

      # grab only observed lowh/igh, and normal low/high
      df = df.iloc[1:,1:6]

      # make the first row into the column names
      df.rename(columns={'0': df.iloc[0,0],
              '1': df.iloc[0,1],
              '2': df.iloc[0,2],
              '3': df.iloc[0,3],
              '4': df.iloc[0,4]},
              inplace=True)

      # drop first row of column names
      df = df.iloc[1:,:]

      # remove last row which contains Average for each columnd
      del_last_row = df.shape[0] - 1
      df = df.iloc[0:del_last_row,:]

      # write location name from folder variable
      df['location'] = folder

      # get date and assign to columns
      date_list = date.split(' ')[1]
      df['Month'] = date_list.split('/')[0]
      df['Year'] = date_list.split('/')[1]


      dfs.append(df)

weather_df = pd.concat(dfs)
weather_df = weather_df.reset_index(drop=True)
weather_df.head(3)

Unnamed: 0,Date,ObservedLow(F),ObservedHigh(F),NormalLow(F),NormalHigh(F),location,Month,Year
0,1,52,84,45,84,Campo,10,2016
1,2,44,76,45,84,Campo,10,2016
2,3,33,71,45,84,Campo,10,2016


In [6]:
# change colum names to something easier to work with
weather_df.rename(columns={'ObservedLow(F)': 'Low',
                           'ObservedHigh(F)': 'High',
                           'NormalLow(F)': 'NormLow',
                           'NormalHigh(F)': 'NormHigh'},
                  inplace=True)
                    
weather_df.head(3)

Unnamed: 0,Date,Low,High,NormLow,NormHigh,location,Month,Year
0,1,52,84,45,84,Campo,10,2016
1,2,44,76,45,84,Campo,10,2016
2,3,33,71,45,84,Campo,10,2016


In [7]:
date_columns = ['Date','Month','Year']
weather_df['Full_date'] = weather_df.loc[:,date_columns].apply(lambda x: '/'.join(x.values.astype(str)), axis='columns')

weather_df['Full_date'] = pd.to_datetime(weather_df['Full_date'])
weather_df.dtypes

Date                 object
Low                  object
High                 object
NormLow              object
NormHigh             object
location             object
Month                object
Year                 object
Full_date    datetime64[ns]
dtype: object

In [8]:
weather_df.head(3)

Unnamed: 0,Date,Low,High,NormLow,NormHigh,location,Month,Year,Full_date
0,1,52,84,45,84,Campo,10,2016,2016-01-10
1,2,44,76,45,84,Campo,10,2016,2016-02-10
2,3,33,71,45,84,Campo,10,2016,2016-03-10


In [9]:
locations = list(set(weather_df['location']))
locations

There are 7 within the weather dataset. They are:


['San Diego Mont. Field',
 'Carlsbad',
 'linbergh_field',
 'Oceanside Airport',
 'San Diego Brown Field',
 'Campo',
 'Ramona']

In [10]:
weather_df.describe()

  """Entry point for launching an IPython kernel.


Unnamed: 0,Date,Low,High,NormLow,NormHigh,location,Month,Year,Full_date
count,25571.0,25571,25571,25571.0,25571.0,25571,25571.0,25571.0,25571
unique,31.0,66,73,37.0,34.0,7,12.0,10.0,3653
top,1.0,M,M,44.0,66.0,San Diego Brown Field,1.0,2020.0,2020-10-11 00:00:00
freq,840.0,1498,1498,1210.0,3046.0,3659,2449.0,2921.0,20
first,,,,,,,,,2012-01-01 00:00:00
last,,,,,,,,,2021-12-31 00:00:00


That is interesting, the "top" row (3rd down) has a low and high of M.  Let's fix that.

In [11]:
high_M = weather_df[weather_df['High'] == 'M']
high_M.head()

Unnamed: 0,Date,Low,High,NormLow,NormHigh,location,Month,Year,Full_date
31,1,M,M,39,75,Campo,11,2016,2016-01-11
32,2,M,M,39,74,Campo,11,2016,2016-02-11
33,3,M,M,39,74,Campo,11,2016,2016-03-11
34,4,M,M,39,74,Campo,11,2016,2016-04-11
35,5,M,M,38,74,Campo,11,2016,2016-05-11


In [12]:
# clean up those Ms!
new_low = []
new_high = []

for index, row in weather_df.iterrows():
  
  
  if row['Low'] == 'M':

    if row['NormLow'] == 'M':
      new_low.append(np.nan)
    else:
      new_low.append(row['NormLow'])

  else:
    new_low.append(row['Low'])
  
  
  if row['High'] == 'M':

    if row['NormHigh'] == 'M':
      new_high.append(np.nan)
    else:
      new_high.append(row['NormHigh'])

  else:
    new_high.append(row['High'])


weather_df = weather_df.loc[:,['Date', 'location', 'Month', 'Year', 'Full_date']].copy()
weather_df['High'] = new_high
weather_df['Low'] = new_low

weather_df.head(3)

Unnamed: 0,Date,location,Month,Year,Full_date,High,Low
0,1,Campo,10,2016,2016-01-10,84,52
1,2,Campo,10,2016,2016-02-10,76,44
2,3,Campo,10,2016,2016-03-10,71,33


In [13]:
weather_df[weather_df['High'] == 'M']

Unnamed: 0,Date,location,Month,Year,Full_date,High,Low


In [14]:
weather_df[weather_df['Low'] == 'M']

Unnamed: 0,Date,location,Month,Year,Full_date,High,Low


In [15]:
bool_ = weather_df['Low'].isna()

weather_df.loc[bool_,:]

Unnamed: 0,Date,location,Month,Year,Full_date,High,Low


In [16]:
bool_ = weather_df['High'].isna()

weather_df.loc[bool_,:]

Unnamed: 0,Date,location,Month,Year,Full_date,High,Low


## Data Prep Continuation

In [17]:
weather_df = weather_df.astype({'Date':'int',
                   'Month':'int',
                   'Year':'int',
                   'Low':'int',
                   'High':'int',})

weather_df.dtypes

Date                  int64
location             object
Month                 int64
Year                  int64
Full_date    datetime64[ns]
High                  int64
Low                   int64
dtype: object

In [19]:
min_df = weather_df.groupby(by = ['Year', 'Month', 'location']).min()

min_df = min_df.drop(columns=['Date', 'Full_date', 'High'])

min_df.head(14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Low
Year,Month,location,Unnamed: 3_level_1
2012,1,Campo,23
2012,1,Carlsbad,41
2012,1,Oceanside Airport,31
2012,1,Ramona,22
2012,1,San Diego Brown Field,43
2012,1,San Diego Mont. Field,38
2012,1,linbergh_field,43
2012,2,Campo,23
2012,2,Carlsbad,40
2012,2,Oceanside Airport,34


In [20]:
max_df = weather_df.groupby(by = ['Year', 'Month', 'location']).max()

max_df = max_df.drop(columns=['Date', 'Full_date', 'Low'])

max_df.head(14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,High
Year,Month,location,Unnamed: 3_level_1
2012,1,Campo,81
2012,1,Carlsbad,81
2012,1,Oceanside Airport,82
2012,1,Ramona,86
2012,1,San Diego Brown Field,77
2012,1,San Diego Mont. Field,85
2012,1,linbergh_field,83
2012,2,Campo,76
2012,2,Carlsbad,80
2012,2,Oceanside Airport,79


In [21]:
avg_df = weather_df.groupby(by = ['Year', 'Month', 'location']).mean()

avg_df = avg_df.rename(columns={'High':'AvgHigh',
                       'Low':'AvgLow'})

avg_df.head(14)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date,AvgHigh,AvgLow
Year,Month,location,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012,1,Campo,16.0,65.129032,33.774194
2012,1,Carlsbad,16.0,67.903226,47.322581
2012,1,Oceanside Airport,16.0,68.451613,37.580645
2012,1,Ramona,16.0,70.870968,32.774194
2012,1,San Diego Brown Field,16.0,66.225806,44.0
2012,1,San Diego Mont. Field,16.0,70.032258,44.516129
2012,1,linbergh_field,16.0,67.354839,49.032258
2012,2,Campo,15.0,60.103448,32.413793
2012,2,Carlsbad,15.0,63.896552,47.965517
2012,2,Oceanside Airport,15.0,65.965517,40.482759


In [22]:
final_weather_df = pd.merge(min_df, max_df, on = ['Year', 'Month', 'location'])
final_weather_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Low,High
Year,Month,location,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,1,Campo,23,81
2012,1,Carlsbad,41,81
2012,1,Oceanside Airport,31,82
2012,1,Ramona,22,86
2012,1,San Diego Brown Field,43,77


In [23]:
final_weather_df2 = pd.merge(final_weather_df, avg_df, on = ['Year', 'Month', 'location']).reset_index()
final_weather_df2 = final_weather_df2.drop(columns='Date')

final_weather_df2.head(3)

Unnamed: 0,Year,Month,location,Low,High,AvgHigh,AvgLow
0,2012,1,Campo,23,81,65.129032,33.774194
1,2012,1,Carlsbad,41,81,67.903226,47.322581
2,2012,1,Oceanside Airport,31,82,68.451613,37.580645


In [24]:
clean_data_dir = root + clean

final_weather_df2.to_parquet(clean_data_dir + 'weather.parquet.gzip', compression='gzip')

In [None]:
print('Weather Data is READY TO ROCK!!!!')

In [None]:
import boto3
import sagemaker

# Sagemaker session
sess = sagemaker.Session()

# bucket = "my-bucket"
bucket = sess.default_bucket()

file_name = 'weather.parquet.gzip'
file_path = 'Clean_Data/' + file_name

# Upload flow to S3
s3_client = boto3.client("s3")
s3_client.upload_file(clean_data_dir + file_name, bucket, file_path)