# Wind ETL: Extract, Transform, Load

## Initial Imports

In [1]:
# Import Dependencies
import requests
import pandas as pd
import json
import numpy as np
import datetime

In [2]:
# Import API Key
from config import key

# To create an API key, visit the site below:
#https://www.worldweatheronline.com/developer/my/analytics.aspx?key_id=222419

# Extracting the Wind Weather Data

## Define the Functions

In [3]:
def makeARequest(location, startDate, endDate, yourAPIKey):
    '''
    Make a request to the worldweatheronline local history weather API page.

    Parameters
    ----------
    location : str
        Value sent as the API's ``q`` parameter (the notebook passes a
        "latitude,longitude" string).
    startDate, endDate : str
        Values sent as the ``date`` and ``enddate`` parameters (the notebook
        passes 'YYYY-MM-DD' strings).
    yourAPIKey : str
        World Weather Online API key, sent as the ``key`` parameter.

    Returns
    -------
    dict or None
        The decoded JSON body when the server answers with status 200.
        For any other status the status code is printed and None is returned.
    '''
    # Documentation on how to make an API Request using World Weather Online:
    # https://www.worldweatheronline.com/developer/api/docs/local-city-town-weather-api.aspx

    # Use HTTPS so the API key is not sent in clear text
    baseURL = "https://api.worldweatheronline.com/premium/v1/past-weather.ashx"

    # Let requests build and URL-encode the query string instead of
    # concatenating the raw values by hand
    requestParameters = {
        "q": location,
        "date": startDate,
        "enddate": endDate,
        "tp": "1",         # time interval in hours
        "format": "json",  # output format to return
        "key": yourAPIKey,
    }

    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(baseURL, params=requestParameters, timeout=30)

    # Anything other than 200 is reported and signalled to the caller as None
    if response.status_code != 200:
        print(response.status_code)
        return None

    # Turn the successful response into a JSON object
    return response.json()

In [4]:
def monthlyHistoricalWindDF(firstDayOfMonth, lastDayOfMonth, jsonResponse):
    '''
    Pull wind weather variables from the response JSON and turn into Pandas DataFrame.

    Parameters
    ----------
    firstDayOfMonth, lastDayOfMonth : str
        Inclusive start and end dates in 'YYYY-MM-DD' format. They decide how
        many daily entries are read from the response.
    jsonResponse : dict
        Decoded API response. Entry ``["data"]["weather"][d]`` must hold a
        "date" and an "hourly" list with at least 24 items.

    Returns
    -------
    pandas.DataFrame
        One row per hour (24 per day) with the columns Date, Time,
        Weather_Description, Temperature_F, WindSpeed_mph,
        WindDirection_degrees, WindDirection_compass, WindGust_mph and
        Humidity_percent. Values are kept exactly as found in the JSON.

    Raises
    ------
    IndexError, KeyError
        If the response holds fewer days/hours than the date range implies or
        lacks one of the expected fields.
    '''
    # Create datetime objects from the input strings
    first = datetime.datetime.strptime(firstDayOfMonth, '%Y-%m-%d')
    last = datetime.datetime.strptime(lastDayOfMonth, '%Y-%m-%d')
    # Inclusive number of days in the range. Using the real date difference
    # (not last.day - first.day) keeps this correct when the range crosses a
    # month boundary.
    numberOfDays = (last - first).days + 1

    dailyWeather = jsonResponse["data"]["weather"]

    # Initiate the HourlyHistoricalWeather list
    HourlyHistoricalWeather = []

    # For each day in the range
    for day in range(numberOfDays):
        dayEntry = dailyWeather[day]
        hourlyEntries = dayEntry["hourly"]
        # and each hour of each day
        for hour in range(24):
            hourEntry = hourlyEntries[hour]
            # append the wind weather variables to the HourlyHistoricalWeather list
            HourlyHistoricalWeather.append({
                "Date" : dayEntry["date"],
                "Time" : hourEntry["time"],
                "Weather_Description" : hourEntry["weatherDesc"][0]["value"],
                "Temperature_F" : hourEntry["tempF"],
                "WindSpeed_mph" : hourEntry["windspeedMiles"],
                "WindDirection_degrees" : hourEntry["winddirDegree"],
                "WindDirection_compass" : hourEntry["winddir16Point"],
                "WindGust_mph" : hourEntry["WindGustMiles"],
                "Humidity_percent" : hourEntry["humidity"]
            })

    # Store the wind weather variables in a Pandas DataFrame
    weatherDataFrame = pd.DataFrame(HourlyHistoricalWeather)
    return weatherDataFrame

## Define the Hackberry Wind Farm Location

In [5]:
# Define the Latitude and longitude of Hackberry Wind Farm
latLong = "32.776111,-99.476444"

## API Calls for January 2017 - July 2020

In [6]:
# Define the yearList and monthList variables
    # Making API calls for 2017 - 2020 
yearList = [2017, 2018, 2019, 2020]
monthList = np.arange(1,13,1)

In [7]:
# Collect the last calendar day of every month as a 'YYYY-MM-DD' string
lastDayOfMonth = []

for year in yearList:
    for month in monthList:
        if month == 12:
            # December always ends on the 31st
            monthEnd = datetime.date(year=year, month=12, day=31)
        else:
            # step back one day from the 1st of the following month;
            # this takes care of leap years
            monthEnd = datetime.date(year=year, month=month + 1, day=1) - datetime.timedelta(days=1)
        lastDayOfMonth.append(monthEnd.strftime('%Y-%m-%d'))

# keep the first 43 months only, since the 2020 MWH data only goes through July 31st, 2020
lastDayOfMonth = lastDayOfMonth[:43]

In [8]:
# Build the 'YYYY-MM-DD' string for the 1st of each month of each year,
# then keep the first 43 months only, since the 2020 MWH data only goes
# through July 31st, 2020
firstDayOfMonth = [
    datetime.date(year, month, 1).strftime('%Y-%m-%d')
    for year in yearList
    for month in monthList
][:43]

In [9]:
# Use the start and end dates of each month to make an API call and collect
# the monthly DataFrames. They are concatenated once at the end because
# DataFrame.append was removed in pandas 2.0 and re-copies the data on every call.
monthlyWeatherFrames = []

for monthStart, monthEnd in zip(firstDayOfMonth, lastDayOfMonth):
    responseJson = makeARequest(latLong, monthStart, monthEnd, key)
    # makeARequest returns None when the API does not answer with status 200;
    # stop with a clear message instead of failing later on a None lookup
    if responseJson is None:
        raise RuntimeError(f"Weather API request failed for {monthStart} to {monthEnd}")
    monthlyWeatherFrames.append(monthlyHistoricalWindDF(monthStart, monthEnd, responseJson))

hourlyWeatherDF = pd.concat(monthlyWeatherFrames)

# Display the DataFrame
hourlyWeatherDF.head()

Unnamed: 0,Date,Time,Weather_Description,Temperature_F,WindSpeed_mph,WindDirection_degrees,WindDirection_compass,WindGust_mph,Humidity_percent
0,2017-01-01,0,Partly cloudy,45,5,2,N,9,59
1,2017-01-01,100,Partly cloudy,44,4,118,ESE,8,61
2,2017-01-01,200,Partly cloudy,44,4,234,SW,8,64
3,2017-01-01,300,Partly cloudy,43,4,350,N,7,67
4,2017-01-01,400,Partly cloudy,44,3,235,SW,5,66


In [10]:
# Re-Index the DataFrame with a fresh 0..n-1 index.
# reset_index(drop=True) adapts to however many rows were downloaded,
# instead of relying on a hard-coded 24 hours * 1308 days.
hourlyWeatherDF = hourlyWeatherDF.reset_index(drop=True)

# Display the DataFrame
hourlyWeatherDF

Unnamed: 0,Date,Time,Weather_Description,Temperature_F,WindSpeed_mph,WindDirection_degrees,WindDirection_compass,WindGust_mph,Humidity_percent
0,2017-01-01,0,Partly cloudy,45,5,2,N,9,59
1,2017-01-01,100,Partly cloudy,44,4,118,ESE,8,61
2,2017-01-01,200,Partly cloudy,44,4,234,SW,8,64
3,2017-01-01,300,Partly cloudy,43,4,350,N,7,67
4,2017-01-01,400,Partly cloudy,44,3,235,SW,5,66
...,...,...,...,...,...,...,...,...,...
31387,2020-07-31,1900,Partly cloudy,88,8,104,ESE,11,35
31388,2020-07-31,2000,Partly cloudy,86,8,78,ENE,12,39
31389,2020-07-31,2100,Partly cloudy,84,7,52,NE,13,43
31390,2020-07-31,2200,Partly cloudy,82,7,55,NE,13,47


## Explore the Data

In [11]:
# Check the Data Types
# Returns objects. 
hourlyWeatherDF.dtypes

Date                     object
Time                     object
Weather_Description      object
Temperature_F            object
WindSpeed_mph            object
WindDirection_degrees    object
WindDirection_compass    object
WindGust_mph             object
Humidity_percent         object
dtype: object

In [12]:
# Check the Value Counts
# Returns a count of 24 rows for each date. This is expected since weather data is being extracted for each hour of each day. 
hourlyWeatherDF['Date'].value_counts()

2018-10-20    24
2019-03-09    24
2017-10-08    24
2017-09-07    24
2018-03-26    24
              ..
2020-03-11    24
2020-03-03    24
2018-08-14    24
2019-06-21    24
2019-10-19    24
Name: Date, Length: 1308, dtype: int64

In [13]:
# Check the Value Counts
# Returns a count of 1308 for each of the 24 hourly time values. This is expected since data is being extracted for 1308 days (Jan 2017 - July 2020). 
hourlyWeatherDF['Time'].value_counts()

500     1308
2000    1308
1500    1308
2200    1308
800     1308
1700    1308
1900    1308
2100    1308
200     1308
1400    1308
1800    1308
1600    1308
0       1308
1200    1308
700     1308
2300    1308
400     1308
900     1308
1300    1308
1100    1308
1000    1308
100     1308
600     1308
300     1308
Name: Time, dtype: int64

# Performing Transformation on Wind Weather Data

## Define the Functions

In [14]:
def cleaningDataFrame_datetime(df):
    '''
    Clean the date and time columns and combine into a single Date_Time column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Date' column that pd.to_datetime can parse and a
        'Time' column holding the hour as HHMM-style integers or numeric
        strings (0, 100, ..., 2300). The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Date' and 'Time' and with a trailing
        'Date_Time' datetime column (date plus whole hours).
    '''
    # Convert the Dates to datetime objects
    dates = pd.to_datetime(df['Date'])
    # Convert the Time to integers and divide by 100 to get the hour of the day
    hours = df['Time'].astype(int) // 100
    # Convert the hours to timedeltas so they can be added to the dates.
    # pd.to_timedelta is used because casting with astype('timedelta64[h]')
    # is not supported by pandas 2.x.
    hourOffsets = pd.to_timedelta(hours, unit='h')
    # Drop the Date and Time columns (returns a new frame, so the caller's
    # DataFrame is left untouched) and add the combined Date_Time column
    cleaned = df.drop(columns=['Date', 'Time'])
    cleaned['Date_Time'] = dates + hourOffsets
    return cleaned

In [15]:
def cleaningDataFrame_wind(df):
    '''
    Cast the numeric weather columns of df to integers.

    The columns Temperature_F, WindSpeed_mph, WindDirection_degrees,
    WindGust_mph and Humidity_percent are converted in place; the same
    DataFrame object is returned.
    '''
    integerColumns = (
        'Temperature_F',
        'WindSpeed_mph',
        'WindDirection_degrees',
        'WindGust_mph',
        'Humidity_percent',
    )
    # Replace each column with its integer version, one at a time
    for columnName in integerColumns:
        df[columnName] = df[columnName].astype(int)
    return df

## Clean the Original Hourly Weather DataFrame

In [16]:
# Clean the date and time columns of the original DataFrame
hourlyWeatherDF_cleanedDateTime = cleaningDataFrame_datetime(hourlyWeatherDF)

# Display the DataFrame
hourlyWeatherDF_cleanedDateTime.head()

Unnamed: 0,Weather_Description,Temperature_F,WindSpeed_mph,WindDirection_degrees,WindDirection_compass,WindGust_mph,Humidity_percent,Date_Time
0,Partly cloudy,45,5,2,N,9,59,2017-01-01 00:00:00
1,Partly cloudy,44,4,118,ESE,8,61,2017-01-01 01:00:00
2,Partly cloudy,44,4,234,SW,8,64,2017-01-01 02:00:00
3,Partly cloudy,43,4,350,N,7,67,2017-01-01 03:00:00
4,Partly cloudy,44,3,235,SW,5,66,2017-01-01 04:00:00


In [17]:
# Clean the weather variables 
cleaned_hourlyWeatherDF = cleaningDataFrame_wind(hourlyWeatherDF_cleanedDateTime)

# Print the shape (rows, columns) of the DataFrame
print(cleaned_hourlyWeatherDF.shape)

# Display the DataFrame
cleaned_hourlyWeatherDF.head()

(31392, 8)


Unnamed: 0,Weather_Description,Temperature_F,WindSpeed_mph,WindDirection_degrees,WindDirection_compass,WindGust_mph,Humidity_percent,Date_Time
0,Partly cloudy,45,5,2,N,9,59,2017-01-01 00:00:00
1,Partly cloudy,44,4,118,ESE,8,61,2017-01-01 01:00:00
2,Partly cloudy,44,4,234,SW,8,64,2017-01-01 02:00:00
3,Partly cloudy,43,4,350,N,7,67,2017-01-01 03:00:00
4,Partly cloudy,44,3,235,SW,5,66,2017-01-01 04:00:00


In [18]:
# Check data types to make sure they are correct
cleaned_hourlyWeatherDF.dtypes

Weather_Description              object
Temperature_F                     int32
WindSpeed_mph                     int32
WindDirection_degrees             int32
WindDirection_compass            object
WindGust_mph                      int32
Humidity_percent                  int32
Date_Time                datetime64[ns]
dtype: object

## Export the Cleaned Hourly Weather DataFrame to a CSV File

In [19]:
# # Uncomment to output the cleaned DataFrame as a csv file
# cleaned_hourlyWeatherDF.to_csv(r'../Resources/Output/Cleaned_Hourly_Wind_Weather.csv', index = False)

# Extracting and Transforming the Hackberry Wind Energy Output (MWH) Data

## Define the Functions

In [20]:
def cleanRenewableFarmData(df):
    '''
    This function cleans the raw farm data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw generation data with the columns 'Unit', 'Date' (YYYYMMDD as
        integer or string) and 'Hour Ending' (HHMM-style integer or string,
        e.g. 100 ... 2400; a trailing 'D' such as '0200D' marks the repeated
        daylight-saving hour). The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Unit', 'Date' and 'Hour Ending' and with a
        trailing 'Date_Time' datetime column.
    '''
    # Convert Date (YYYYMMDD) into datetime objects
    dates = pd.to_datetime(df['Date'].astype(str), format='%Y%m%d')
    # Hour Ending as text, without surrounding blanks and without the
    # trailing 'D' that flags the repeated daylight-saving hour
    hourText = df['Hour Ending'].astype(str).str.strip().str.rstrip('D')
    # HHMM -> whole hour (100 -> 1, 0200 -> 2, 2400 -> 24)
    hours = hourText.astype(int) // 100
    # Adjust for midnight = 24:00 (not 00:00): hour 24 wraps to hour 0.
    # NOTE(review): this places the 2400 reading at 00:00 of the SAME date,
    # not the following day - kept as in the original; confirm this is the
    # intended alignment with the weather timestamps.
    hours = hours % 24
    # Drop the columns that are no longer needed (returns a new frame) and
    # add the Hour Ending to the date to create the Date_Time column
    cleaned = df.drop(columns=['Unit', 'Date', 'Hour Ending'])
    cleaned['Date_Time'] = dates + pd.to_timedelta(hours, unit='h')
    return cleaned

## Extracting the Hackberry Wind Farm Energy Output (MWH) Data

In [21]:
# Read in the data and store as Pandas DataFrame
data_2017 = "../Resources/Raw Data/Hackberry_and_Webberville_2017.xlsx"
Hackberry_2017_DF = pd.read_excel(data_2017, sheet_name='Hackberry')

# Print the shape (rows, columns) of the DataFrame
print(Hackberry_2017_DF.shape)

# Display the DataFrame
Hackberry_2017_DF.head()

(8760, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
0,HWF_HWFG1,20170101,100,10.597825
1,HWF_HWFG1,20170101,200,3.640675
2,HWF_HWFG1,20170101,300,3.16025
3,HWF_HWFG1,20170101,400,0.0782
4,HWF_HWFG1,20170101,500,0.0


In [22]:
# Create the pandas DataFrame for all Hackberry Output data, starting from a
# copy of the 2017 data (DataFrame.append was removed in pandas 2.0, and the
# copy keeps Hackberry_2017_DF itself untouched)
HackberryDF = Hackberry_2017_DF.copy()

# Print the shape (rows, columns) of the DataFrame
print(HackberryDF.shape)

# Display the DataFrame
HackberryDF.head()

(8760, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
0,HWF_HWFG1,20170101,100,10.597825
1,HWF_HWFG1,20170101,200,3.640675
2,HWF_HWFG1,20170101,300,3.16025
3,HWF_HWFG1,20170101,400,0.0782
4,HWF_HWFG1,20170101,500,0.0


In [23]:
# Read in the data and store as Pandas DataFrame
data_2018 = "../Resources/Raw Data/Hackberry_and_Webberville_2018.xlsx"
Hackberry_2018_DF = pd.read_excel(data_2018, sheet_name='Hackberry')

# Print the shape (rows, columns) of the DataFrame
print(Hackberry_2018_DF.shape)

# Display the DataFrame
Hackberry_2018_DF.head()

(8760, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
0,HWF_HWFG1,20180101,100,0.0
1,HWF_HWFG1,20180101,200,0.02465
2,HWF_HWFG1,20180101,300,0.003675
3,HWF_HWFG1,20180101,400,0.1995
4,HWF_HWFG1,20180101,500,0.65945


In [24]:
# Add the 2018 Hackberry Data below the rows already in HackberryDF
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
HackberryDF = pd.concat([HackberryDF, Hackberry_2018_DF])

# Print the shape (rows, columns) of the DataFrame
print(HackberryDF.shape)

# Display the DataFrame
HackberryDF.tail()

(17520, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
8755,HWF_HWFG1,20181231,2000,96.7669
8756,HWF_HWFG1,20181231,2100,118.76245
8757,HWF_HWFG1,20181231,2200,124.605975
8758,HWF_HWFG1,20181231,2300,125.2891
8759,HWF_HWFG1,20181231,2400,118.139375


In [25]:
# Read in the data and store as Pandas DataFrame
data_2019_2020 = "../Resources/Raw Data/Hackberry_Generation_2019_2020.csv"
Hackberry_2019_2020_DF = pd.read_csv(data_2019_2020)

# Print the shape (rows, columns) of the DataFrame
print(Hackberry_2019_2020_DF.shape)

# Display the DataFrame
Hackberry_2019_2020_DF

(13871, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
0,HWF_HWFG1,20190101,100,110.487950
1,HWF_HWFG1,20190101,200,72.020225
2,HWF_HWFG1,20190101,300,67.639475
3,HWF_HWFG1,20190101,400,63.718900
4,HWF_HWFG1,20190101,500,61.264250
...,...,...,...,...
13866,HWF_HWFG1,20200731,2000,4.998600
13867,HWF_HWFG1,20200731,2100,16.390275
13868,HWF_HWFG1,20200731,2200,20.637800
13869,HWF_HWFG1,20200731,2300,13.998975


In [26]:
# Add the 2019 and 2020 Hackberry Data below the rows already in HackberryDF
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
HackberryDF = pd.concat([HackberryDF, Hackberry_2019_2020_DF])

# Print the shape (rows, columns) of the DataFrame
print(HackberryDF.shape)

# Display the DataFrame
HackberryDF.tail()

(31391, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
13866,HWF_HWFG1,20200731,2000,4.9986
13867,HWF_HWFG1,20200731,2100,16.390275
13868,HWF_HWFG1,20200731,2200,20.6378
13869,HWF_HWFG1,20200731,2300,13.998975
13870,HWF_HWFG1,20200731,2400,12.494875


In [27]:
# Re-Index the DataFrame with a fresh 0..n-1 index.
# reset_index(drop=True) works for any number of rows, so the hard-coded
# row count (31391) is no longer needed.
HackberryDF = HackberryDF.reset_index(drop=True)

# Display the DataFrame
HackberryDF

Unnamed: 0,Unit,Date,Hour Ending,MWH
0,HWF_HWFG1,20170101,100,10.597825
1,HWF_HWFG1,20170101,200,3.640675
2,HWF_HWFG1,20170101,300,3.160250
3,HWF_HWFG1,20170101,400,0.078200
4,HWF_HWFG1,20170101,500,0.000000
...,...,...,...,...
31386,HWF_HWFG1,20200731,2000,4.998600
31387,HWF_HWFG1,20200731,2100,16.390275
31388,HWF_HWFG1,20200731,2200,20.637800
31389,HWF_HWFG1,20200731,2300,13.998975


## Explore the Data

In [28]:
# Check the data types
HackberryDF.dtypes

Unit            object
Date             int64
Hour Ending     object
MWH            float64
dtype: object

In [29]:
# Check the Value Counts
# The Unit column can be dropped since the values are the same in each row.
HackberryDF['Unit'].value_counts()

HWF_HWFG1    31391
Name: Unit, dtype: int64

In [30]:
# Check the Value Counts
# The value counts are unexpected:
    # 2019-11-03 has 25 rows
    # 2019-03-10 has 23 rows
    # 2020-03-08 has 23 rows
# Observation: 
# Daylight saving in 2019 occurred on 2019-03-10 and 2019-11-03. Daylight saving in 2020 occurred on 2020-03-08 and 2020-11-01. 
HackberryDF['Date'].value_counts()

20191103    25
20181104    25
20171105    25
20191229    24
20180104    24
            ..
20190824    24
20190310    23
20170312    23
20200308    23
20180311    23
Name: Date, Length: 1308, dtype: int64

In [31]:
# Due to Daylight saving, two values for 02:00am were recorded on 2019-11-03 (200 and 0200D).
HackberryDF.loc[(HackberryDF['Date'] == 20191103)].head()

Unnamed: 0,Unit,Date,Hour Ending,MWH
24863,HWF_HWFG1,20191103,100,87.675175
24864,HWF_HWFG1,20191103,200,121.1328
24865,HWF_HWFG1,20191103,0200D,138.596
24866,HWF_HWFG1,20191103,300,144.523925
24867,HWF_HWFG1,20191103,400,149.124275


In [32]:
# Due to Daylight saving, no value was recorded for 02:00am on 2019-03-10. 
HackberryDF.loc[(HackberryDF['Date'] == 20190310)].head()

Unnamed: 0,Unit,Date,Hour Ending,MWH
19152,HWF_HWFG1,20190310,100,1.41865
19153,HWF_HWFG1,20190310,300,0.683675
19154,HWF_HWFG1,20190310,400,2.56275
19155,HWF_HWFG1,20190310,500,15.25375
19156,HWF_HWFG1,20190310,600,99.7529


In [33]:
# Due to Daylight saving, no value was recorded for 02:00am on 2020-03-08. 
HackberryDF.loc[(HackberryDF['Date'] == 20200308)].head()

Unnamed: 0,Unit,Date,Hour Ending,MWH
27888,HWF_HWFG1,20200308,100,0.503525
27889,HWF_HWFG1,20200308,300,0.473275
27890,HWF_HWFG1,20200308,400,0.604575
27891,HWF_HWFG1,20200308,500,0.393175
27892,HWF_HWFG1,20200308,600,0.533425


## Transforming the Hackberry Wind Farm Energy Output (MWH) Data

In [34]:
# Clean the HackberryDF
cleaned_Hackberry_DF = cleanRenewableFarmData(HackberryDF)

# Print the shape (rows, columns) of the DataFrame
print(cleaned_Hackberry_DF.shape)

# Display the DataFrame
cleaned_Hackberry_DF

(31391, 2)


Unnamed: 0,MWH,Date_Time
0,10.597825,2017-01-01 01:00:00
1,3.640675,2017-01-01 02:00:00
2,3.160250,2017-01-01 03:00:00
3,0.078200,2017-01-01 04:00:00
4,0.000000,2017-01-01 05:00:00
...,...,...
31386,4.998600,2020-07-31 20:00:00
31387,16.390275,2020-07-31 21:00:00
31388,20.637800,2020-07-31 22:00:00
31389,13.998975,2020-07-31 23:00:00


In [35]:
# Check the function properly handled daylight savings
# Two different MWH values are displayed for 02:00am on 2019-11-03
cleaned_Hackberry_DF.loc[(cleaned_Hackberry_DF['Date_Time'] == '2019-11-03 02:00:00')]

Unnamed: 0,MWH,Date_Time
24864,121.1328,2019-11-03 02:00:00
24865,138.596,2019-11-03 02:00:00


In [36]:
# Check data types to make sure they are correct
cleaned_Hackberry_DF.dtypes

MWH                 float64
Date_Time    datetime64[ns]
dtype: object

## Export the Cleaned Hackberry Wind Farm Energy Output (MWH) DataFrame to a CSV File

In [37]:
# # Uncomment to output the cleaned DataFrame as a csv file
# cleaned_Hackberry_DF.to_csv(r'../Resources/Output/Cleaned_Hackberry_Generation.csv', index = False)

# Merging Hackberry Energy Output Data with Wind Weather Data

## Define the Functions

In [38]:
def datetimeSplit(df):
    '''
    Add integer Year, Month, Day and Hour columns derived from Date_Time.

    The columns are added to df in place, in that order, and the same
    DataFrame object is returned.
    '''
    # New column name -> datetime component it is taken from
    componentByColumn = {
        'Year': 'year',
        'Month': 'month',
        'Day': 'day',
        'Hour': 'hour',
    }
    for columnName, component in componentByColumn.items():
        # Pull the component out of the datetime values and store it as integer
        df[columnName] = getattr(df['Date_Time'].dt, component).astype(int)
    return df

## Merge the Weather and Energy Output DataFrames

In [39]:
# Merge the cleaned weather data with the cleaned wind farm data using an outer join
HackberryWindMWH = pd.merge(cleaned_hourlyWeatherDF, cleaned_Hackberry_DF, on='Date_Time', how='outer')

# Display the DataFrame
HackberryWindMWH

Unnamed: 0,Weather_Description,Temperature_F,WindSpeed_mph,WindDirection_degrees,WindDirection_compass,WindGust_mph,Humidity_percent,Date_Time,MWH
0,Partly cloudy,45,5,2,N,9,59,2017-01-01 00:00:00,150.477750
1,Partly cloudy,44,4,118,ESE,8,61,2017-01-01 01:00:00,10.597825
2,Partly cloudy,44,4,234,SW,8,64,2017-01-01 02:00:00,3.640675
3,Partly cloudy,43,4,350,N,7,67,2017-01-01 03:00:00,3.160250
4,Partly cloudy,44,3,235,SW,5,66,2017-01-01 04:00:00,0.078200
...,...,...,...,...,...,...,...,...,...
31390,Partly cloudy,88,8,104,ESE,11,35,2020-07-31 19:00:00,10.764125
31391,Partly cloudy,86,8,78,ENE,12,39,2020-07-31 20:00:00,4.998600
31392,Partly cloudy,84,7,52,NE,13,43,2020-07-31 21:00:00,16.390275
31393,Partly cloudy,82,7,55,NE,13,47,2020-07-31 22:00:00,20.637800


## Clean the Merged DataFrame

In [40]:
# Count the Null Values
# Displays null values due to daylight saving: no 2:00 AM MWH value exists on the four spring-forward dates (2017-03-12, 2018-03-11, 2019-03-10 and 2020-03-08)
HackberryWindMWH.isna().sum()

Weather_Description      0
Temperature_F            0
WindSpeed_mph            0
WindDirection_degrees    0
WindDirection_compass    0
WindGust_mph             0
Humidity_percent         0
Date_Time                0
MWH                      4
dtype: int64

In [41]:
# Drop the null values
HackberryWindMWH.dropna(inplace=True)

# Print the shape (rows, columns) of the DataFrame
HackberryWindMWH.shape

(31391, 9)

In [42]:
# Split the Date_Time column into separate columns for year, month, day, and hour
# Splitting the Date_Time column so exploratory analysis can be performed based on month, year, and time features
HackberryWindMWH = datetimeSplit(HackberryWindMWH)

In [43]:
# Re-Order the columns
HackberryWindMWH = HackberryWindMWH[["Date_Time", "Year", "Month", "Day", "Hour", "MWH", "Temperature_F", "Humidity_percent", "WindSpeed_mph", "WindGust_mph", "WindDirection_degrees", "WindDirection_compass", "Weather_Description"]]

In [44]:
# Print the shape (rows, columns) of the DataFrame
print(HackberryWindMWH.shape)

# Display the DataFrame
HackberryWindMWH.head()

(31391, 13)


Unnamed: 0,Date_Time,Year,Month,Day,Hour,MWH,Temperature_F,Humidity_percent,WindSpeed_mph,WindGust_mph,WindDirection_degrees,WindDirection_compass,Weather_Description
0,2017-01-01 00:00:00,2017,1,1,0,150.47775,45,59,5,9,2,N,Partly cloudy
1,2017-01-01 01:00:00,2017,1,1,1,10.597825,44,61,4,8,118,ESE,Partly cloudy
2,2017-01-01 02:00:00,2017,1,1,2,3.640675,44,64,4,8,234,SW,Partly cloudy
3,2017-01-01 03:00:00,2017,1,1,3,3.16025,43,67,4,7,350,N,Partly cloudy
4,2017-01-01 04:00:00,2017,1,1,4,0.0782,44,66,3,5,235,SW,Partly cloudy


In [45]:
# Check the final data types
HackberryWindMWH.dtypes

Date_Time                datetime64[ns]
Year                              int32
Month                             int32
Day                               int32
Hour                              int32
MWH                             float64
Temperature_F                     int32
Humidity_percent                  int32
WindSpeed_mph                     int32
WindGust_mph                      int32
WindDirection_degrees             int32
WindDirection_compass            object
Weather_Description              object
dtype: object

## Export the Final DataFrame (Merged Wind Energy Output and Weather Data) to a CSV File

In [46]:
# Output the final merged DataFrame as a csv file (this file is read back in below for the MongoDB upload)
HackberryWindMWH.to_csv(r'../Resources/Output/Hackberry_Wind_2017-2020_MWH.csv', index = False)

# Load the Data into MongoDB 


## Initial Imports

In [47]:
# Import Dependencies
import config
import pymongo

In [48]:
# Set string variables
DEFAULT_DATABASE = 'wind_solar_data' 
USERNAME = config.USERNAME
PASSWORD = config.PASSWORD

## Connect to MongoDB 

In [49]:
# Create the connection to the database
# NOTE(review): USERNAME/PASSWORD are placed into the URI as-is; PyMongo
# expects them percent-escaped if they contain reserved characters - confirm
# how they are stored in config.
client = pymongo.MongoClient(f"mongodb+srv://{USERNAME}:{PASSWORD}@austin-green-energy.pwzpm.mongodb.net/{DEFAULT_DATABASE}?retryWrites=true&w=majority")
try:
    # server_info() forces a round trip to the server, so connection and
    # authentication problems show up here
    client.server_info()
    print("Mongodb connected")
except pymongo.errors.PyMongoError as connectionError:
    # Only driver errors are handled; anything else (e.g. KeyboardInterrupt)
    # is no longer swallowed by a bare except
    print("The Mongodb failed to connect. Check username/password in connection string.")
    print(f"Details: {connectionError}")

## Uploading the Wind Data to the Database

In [50]:
# Select database
db = client.get_database('wind_solar_data')
# Select collection
collection = db.wind_data

# Pull the csv file from Output folder
wind_data = pd.read_csv('../Resources/Output/Hackberry_Wind_2017-2020_MWH.csv')
# Turn the csv into a list of JSON-style records (one dict per row)
wind_data_json = json.loads(wind_data.to_json(orient='records'))

# Remove what is in the collection currently
# (delete_many/insert_many replace Collection.remove/insert, which are
# deprecated and no longer exist in PyMongo 4)
collection.delete_many({})
# Insert the new JSON data into the database; insert_many rejects an empty
# list, so skip the call when there is nothing to upload
if wind_data_json:
    collection.insert_many(wind_data_json)

## Pull the Data from the Database and Upload into DataFrame

In [51]:
# Select database
db = client.get_database('wind_solar_data')
# Select collection
collection = db.wind_data

# Pull collection into dataframe
wind_df = pd.DataFrame(list(collection.find()))
wind_df