# Solar ETL: Extract, Transform, Load

## Initial Imports

In [1]:
# Import Dependencies
import requests
import pandas as pd
import json
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Import API Key
from config import key

# To create an API key, visit the site below:
#https://www.worldweatheronline.com/developer/my/analytics.aspx?key_id=222419

# Extracting the Solar Weather Data

## Define the Functions

In [3]:
def makeARequest(location, startDate, endDate, yourAPIKey):
    '''
    Make a request to the worldweatheronline local history weather API page.
    '''
    # Documentation on how to make an API Request using World Weather Online:
    # https://www.worldweatheronline.com/developer/api/docs/local-city-town-weather-api.aspx

    baseURL = "http://api.worldweatheronline.com/premium/v1/past-weather.ashx"
    timeInterval = "1" # Specifies the time interval in hours
    outputFormatToReturn = "json" # The output format to return

    # Add request parameters to base URL
    requestURL = f"{baseURL}?q={location}&date={startDate}&enddate={endDate}&tp={timeInterval}&format={outputFormatToReturn}&key={yourAPIKey}"
    
    # Get the webpage using the requests library and requestURL
    response = requests.get(requestURL)

    # If status response code indicates that the request has succeeded
    if response.status_code == 200:
        # Turn the response into a JSON object
        responseJson = response.json()
        return responseJson
    else:
        # Else, print the Error status code 
        return print(response.status_code)

In [4]:
def monthlyHistoricalSolarDF(firstDayOfMonth, lastDayOfMonth, jsonResponse):
    ''' 
    Pull solar weather variables from the response JSON and turn into Pandas DataFrame. 
    '''
    # Create a datetime object from the firstDayOfMonth and lastDayOfMonth input strings
    first = datetime.strptime(firstDayOfMonth, '%Y-%m-%d')
    last = datetime.strptime(lastDayOfMonth, '%Y-%m-%d')
    # Subtract the fist day from the last day of each month 
    numberOfDays = last.day - first.day

    # Initiate the HourlyHistoricalWeather list
    HourlyHistoricalWeather = []
    
    # For each day of the month
    for day in np.arange(0,numberOfDays + 1,1):
        # and each hour of each day
        for hour in np.arange(0,24,1):
            # append the solar weather variables to the HourlyHistoricalWeather list 
            HourlyHistoricalWeather.append({
                "Date" : jsonResponse["data"]["weather"][day]["date"],
                "Time" : jsonResponse["data"]["weather"][day]["hourly"][hour]["time"],
                "Weather_Description" : jsonResponse["data"]["weather"][0]["hourly"][0]["weatherDesc"][0]["value"],
                "Temperature_F" : jsonResponse["data"]["weather"][0]["hourly"][0]["tempF"],
                "Sunhour" : jsonResponse["data"]["weather"][day]["sunHour"],
                "CloudCover_percent" : jsonResponse["data"]["weather"][0]["hourly"][0]["cloudcover"],
                "uvIndex" : jsonResponse["data"]["weather"][0]["hourly"][0]["uvIndex"],
                "Humidity_percent" : jsonResponse["data"]["weather"][day]["hourly"][hour]["humidity"]
            })

    # Store the solar weather variables in a Pandas DataFrame 
    weatherDataFrame = pd.DataFrame(HourlyHistoricalWeather)
    return weatherDataFrame

## Define the Webberville Solar Farm Location

In [5]:
# Define the Latitude and longitude of Webberville Solar Farm
latLong = "30.238333,-97.508611"

## API Calls for January 2019 - December 2019

In [6]:
# January 2019
date = "2019-01-01"
enddate = "2019-01-31"

# Use an API call to get the January 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key) 

# Store the January 2019 solar weather data into a DataFrame
Jan2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)
# Display the DataFrame to check if the functions have been applied
Jan2019DF.head()

Unnamed: 0,Date,Time,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent
0,2019-01-01,0,Clear,43,6.7,0,1,88
1,2019-01-01,100,Clear,43,6.7,0,1,88
2,2019-01-01,200,Clear,43,6.7,0,1,89
3,2019-01-01,300,Clear,43,6.7,0,1,90
4,2019-01-01,400,Clear,43,6.7,0,1,90


In [7]:
# February 2019
date = "2019-02-01"
enddate = "2019-02-28"

# Use an API call to get the February 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the February 2019 solar weather data into a DataFrame
Feb2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [8]:
# March 2019
date = "2019-03-01"
enddate = "2019-03-31"

# Use an API call to get the March 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the March 2019 solar weather data into a DataFrame
March2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [9]:
# April 2019
date = "2019-04-01"
enddate = "2019-04-30"

# Use an API call to get the April 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the April 2019 solar weather data into a DataFrame
April2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [10]:
# May 2019
date = "2019-05-01"
enddate = "2019-05-31"

# Use an API call to get the May 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the May 2019 solar weather data into a DataFrame
May2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [11]:
# June 2019
date = "2019-06-01"
enddate = "2019-06-30"

# Use an API call to get the June 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the June 2019 solar weather data into a DataFrame
June2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [12]:
# July 2019
date = "2019-07-01"
enddate = "2019-07-31"

# Use an API call to get the July 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the July 2019 solar weather data into a DataFrame
July2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [13]:
# August 2019
date = "2019-08-01"
enddate = "2019-08-31"

# Use an API call to get the August 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the August 2019 solar weather data into a DataFrame
Aug2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [14]:
# September 2019
date = "2019-09-01"
enddate = "2019-09-30"

# Use an API call to get the September 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the September 2019 solar weather data into a DataFrame
Sept2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [15]:
# October 2019
date = "2019-10-01"
enddate = "2019-10-31"

# Use an API call to get the October 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the October 2019 solar weather data into a DataFrame
Oct2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [16]:
# November 2019
date = "2019-11-01"
enddate = "2019-11-30"

# Use an API call to get the November 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the November 2019 solar weather data into a DataFrame
Nov2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [17]:
# December 2019
date = "2019-12-01"
enddate = "2019-12-31"

# Use an API call to get the December 2019 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the December 2019 solar weather data into a DataFrame
Dec2019DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

## API Calls for January 2020 - July 2020

In [18]:
# January 2020
date = "2020-01-01"
enddate = "2020-01-31"

# Use an API call to get the January 2020 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the January 2020 solar weather data into a DataFrame
Jan2020DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [19]:
# February 2020
date = "2020-02-01"
enddate = "2020-02-29"

# Use an API call to get the February 2020 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the February 2020 solar weather data into a DataFrame
Feb2020DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [20]:
# March 2020
date = "2020-03-01"
enddate = "2020-03-31"

# Use an API call to get the March 2020 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the March 2020 solar weather data into a DataFrame
March2020DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [21]:
# April 2020
date = "2020-04-01"
enddate = "2020-04-30"

# Use an API call to get the April 2020 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the April 2020 solar weather data into a DataFrame
April2020DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [22]:
# May 2020
date = "2020-05-01"
enddate = "2020-05-31"

# Use an API call to get the May 2020 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the May 2020 solar weather data into a DataFrame
May2020DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [23]:
# June 2020
date = "2020-06-01"
enddate = "2020-06-30"

# Use an API call to get the June 2020 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the June 2020 solar weather data into a DataFrame
June2020DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

In [24]:
# July 2020
date = "2020-07-01"
enddate = "2020-07-31"

# Use an API call to get the July 2020 solar weather data
responseJson = makeARequest(latLong, date, enddate, key)

# Store the July 2020 solar weather data into a DataFrame
July2020DF = monthlyHistoricalSolarDF(date, enddate, responseJson)

## Combine All of the DataFrames into One

In [25]:
# Combine each month into a single DataFrame
hourlyWeatherDF = Jan2019DF.append([Feb2019DF, March2019DF, April2019DF, May2019DF, June2019DF, July2019DF, Aug2019DF, Sept2019DF, Oct2019DF, Nov2019DF, Dec2019DF, Jan2020DF, Feb2020DF, March2020DF, April2020DF, May2020DF, June2020DF, July2020DF]) 

# Re-Index the DataFrame: 24 hours * 578 days 
index = np.arange(0,24*578,1)
hourlyWeatherDF = hourlyWeatherDF.set_index(index)

# Display the DataFrame
hourlyWeatherDF

Unnamed: 0,Date,Time,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent
0,2019-01-01,0,Clear,43,6.7,0,1,88
1,2019-01-01,100,Clear,43,6.7,0,1,88
2,2019-01-01,200,Clear,43,6.7,0,1,89
3,2019-01-01,300,Clear,43,6.7,0,1,90
4,2019-01-01,400,Clear,43,6.7,0,1,90
...,...,...,...,...,...,...,...,...
13867,2020-07-31,1900,Partly cloudy,79,6.9,73,1,58
13868,2020-07-31,2000,Partly cloudy,79,6.9,73,1,62
13869,2020-07-31,2100,Partly cloudy,79,6.9,73,1,66
13870,2020-07-31,2200,Partly cloudy,79,6.9,73,1,71


## Explore the Data

In [26]:
# Check the Data Types
# Returns objects. 
hourlyWeatherDF.dtypes

Date                   object
Time                   object
Weather_Description    object
Temperature_F          object
Sunhour                object
CloudCover_percent     object
uvIndex                object
Humidity_percent       object
dtype: object

In [27]:
# Check the Value Counts
# Returns 24 unique values for each day. This is expected since weather data is being extracted for each hour of each day. 
hourlyWeatherDF['Date'].value_counts()

2019-06-09    24
2020-06-25    24
2019-03-10    24
2019-07-12    24
2019-02-12    24
              ..
2019-10-18    24
2019-04-26    24
2019-05-08    24
2019-08-27    24
2019-05-10    24
Name: Date, Length: 578, dtype: int64

In [28]:
# Check the Value Counts
# Returns 578 unique values for each time. This is expected since data is being extracted for 578 days (Jan 2019 - July 2020). 
hourlyWeatherDF['Time'].value_counts()

1700    578
2000    578
2300    578
2100    578
1500    578
300     578
800     578
1800    578
1000    578
500     578
1900    578
100     578
700     578
1200    578
200     578
900     578
1400    578
1100    578
2200    578
1600    578
600     578
400     578
0       578
1300    578
Name: Time, dtype: int64

# Performing Transformation on Solar Weather Data

## Define the Functions

In [29]:
def cleaningDataFrame_datetime(df):
    '''
    Clean the date and time columns and combine into a single Date_Time column.
    '''
    # Convert the Dates to a datetime object
    df['Date'] = pd.to_datetime(df['Date'])
    # Convert the Time data type to integer in order to perform mathematical operations
    df['Time'] = df['Time'].astype(int)
    # Divide the Time by 100 (which converts the data type to float) and convert data type to integer
    df['Time'] = (df['Time']/100).astype(int)
    # Convert Time to a timedelta object
    # This allows us to do simple addition arithmetic on datetimes
    df['Time'] = df['Time'].astype('timedelta64[h]')
    # Add Time to Date to create a new Date_Time column 
    # This creates a datetime object that has both date and time in the same object
    df['Date_Time'] = df['Date'] + df['Time']
    # Drop the Date and Time columns since a new Date_Time column has been created
    df = df.drop(['Date', 'Time'], 1)
    return df

In [30]:
def cleaningDataFrame_solar(df):
    '''
    Clean the data types of the weather variables. 
    '''
    # Covert Temperature data type to integer
    df['Temperature_F'] = df['Temperature_F'].astype(int)
    # Covert Sunhour data type to float
    df['Sunhour'] = df['Sunhour'].astype(float)
    # Covert CloudCover data type to integer
    df['CloudCover_percent'] = df['CloudCover_percent'].astype(int)
    # Covert uvIndex data type to integer
    df['uvIndex'] = df['uvIndex'].astype(int)
    # Convert the Humidity data type to integer
    df['Humidity_percent'] = df['Humidity_percent'].astype(int)
    return df

## Clean the Original DataFrame

In [31]:
# Clean the date and time columns of the original DataFrame
hourlyWeatherDF_cleanedDateTime = cleaningDataFrame_datetime(hourlyWeatherDF)

# Display the DataFrame
hourlyWeatherDF_cleanedDateTime.head()

Unnamed: 0,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent,Date_Time
0,Clear,43,6.7,0,1,88,2019-01-01 00:00:00
1,Clear,43,6.7,0,1,88,2019-01-01 01:00:00
2,Clear,43,6.7,0,1,89,2019-01-01 02:00:00
3,Clear,43,6.7,0,1,90,2019-01-01 03:00:00
4,Clear,43,6.7,0,1,90,2019-01-01 04:00:00


In [32]:
# Clean the weather variables 
cleaned_hourlyWeatherDF = cleaningDataFrame_solar(hourlyWeatherDF_cleanedDateTime)

# Print the shape (rows, columns) of the DataFrame
print(cleaned_hourlyWeatherDF.shape)

# Display the DataFrame
cleaned_hourlyWeatherDF.head()

(13872, 7)


Unnamed: 0,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent,Date_Time
0,Clear,43,6.7,0,1,88,2019-01-01 00:00:00
1,Clear,43,6.7,0,1,88,2019-01-01 01:00:00
2,Clear,43,6.7,0,1,89,2019-01-01 02:00:00
3,Clear,43,6.7,0,1,90,2019-01-01 03:00:00
4,Clear,43,6.7,0,1,90,2019-01-01 04:00:00


In [33]:
# Check data types to make sure they are correct
cleaned_hourlyWeatherDF.dtypes

Weather_Description            object
Temperature_F                   int32
Sunhour                       float64
CloudCover_percent              int32
uvIndex                         int32
Humidity_percent                int32
Date_Time              datetime64[ns]
dtype: object

## Export the Cleaned Hourly Weather DataFrame to a CSV File

In [34]:
# # Uncomment to output the cleaned DataFrame as a csv file
# cleaned_hourlyWeatherDF.to_csv(r'Output/Cleaned_Hourly_Solar_Weather.csv', index = False)

# Extracting and Transforming the Webberville Solar Energy Output (MWH) Data

## Define the Functions

In [35]:
def cleanRenewableFarmData(df):
    '''
    This function cleans the raw farm data.
    '''
    # Drop the Unit column
    df = df.drop('Unit', 1)
    # Convert Date into a datetime object
    df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')
    # Convert Hour Ending data type into a string
    df['Hour Ending'] = df['Hour Ending'].astype(str)
    # Create a new column 'lastDigit' that takes the last digit from the Hour Ending column
    # lastDigit = 'D', indicates daylight saving 
    df['lastDigit'] = df['Hour Ending'].str.strip().str[-1]
    # Convert lastDigit data type into a string
    df['lastDigit'].astype(str)
    # If lastDigit = 'D', then drop the last 3 characters of the Hour Ending string; else, drop the last 2 characters of the Hour Ending string
    df['Hour Ending'] = df.apply(lambda x: x['Hour Ending'][:-3] if x['lastDigit'] == 'D' else x['Hour Ending'][:-2], axis=1)
    # Convert Hour Ending data type into an integer
    df['Hour Ending'].astype(int)
    # Convert Hour Ending into a timedelta object in order to add the hour to the date 
    df['Hour Ending'] = df['Hour Ending'].astype('timedelta64[h]')
    # Adjust for midnight = 24:00 (not 00:00)
    df['Hour Ending'] = df['Hour Ending'] - pd.to_timedelta(df['Hour Ending'].dt.days, unit='d')
    # Add the Hour Ending (aka time) to the date column and create a new Date_Time column
    df['Date_Time'] = df['Date'] + df['Hour Ending']
    # Drop the Date and Hour Ending columns since they have been combined into a single column, Date_Time
    # Drop the lastDigit column since it was only used to handle cleaning the daylight Saving in the Hour Ending column
    df = df.drop(['Date', 'Hour Ending', 'lastDigit'], 1)
    return df

## Extracting the Webberville Solar Farm Energy Output (MWH) Data

In [36]:
# Read in the data and store as Pandas DataFrame
data = "Resources/Webberville_Gen_Data.csv"
WebbervilleDF = pd.read_csv(data)

# Print the shape (rows, columns) of the DataFrame
print(WebbervilleDF.shape)

# Display the DataFrame
WebbervilleDF

(13871, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
0,WEBBER_S_WSP1,20190101,100,0.0
1,WEBBER_S_WSP1,20190101,200,0.0
2,WEBBER_S_WSP1,20190101,300,0.0
3,WEBBER_S_WSP1,20190101,400,0.0
4,WEBBER_S_WSP1,20190101,500,0.0
...,...,...,...,...
13866,WEBBER_S_WSP1,20200731,2000,0.0
13867,WEBBER_S_WSP1,20200731,2100,0.0
13868,WEBBER_S_WSP1,20200731,2200,0.0
13869,WEBBER_S_WSP1,20200731,2300,0.0


## Explore the Data

In [37]:
# Check the data types
WebbervilleDF.dtypes

Unit            object
Date             int64
Hour Ending     object
MWH            float64
dtype: object

In [38]:
# Check the Value Counts
# The Unit column can be dropped since the values are the same in each row.
WebbervilleDF['Unit'].value_counts()

WEBBER_S_WSP1    13871
Name: Unit, dtype: int64

In [39]:
# Check the Value Counts
# The value counts are unexpacted:
    # 2019-11-03 has 25 unique values
    # 2019-03-10 has 23 unique values
    # 2020-03-08 has 23 unique values
# Observation: 
# Daylight saving in 2019 occurred on 2019-03-10 and 2019-11-03. Daylight saving in 2020 occurred on 2020-03-08 and 2020-11-01. 
WebbervilleDF['Date'].value_counts()

20191103    25
20191229    24
20190512    24
20190704    24
20190624    24
            ..
20200426    24
20200418    24
20200103    24
20190310    23
20200308    23
Name: Date, Length: 578, dtype: int64

In [40]:
# Due to Daylight saving, two values for 02:00am were recorded on 2019-11-03 (200 and 0200D).
WebbervilleDF.loc[(WebbervilleDF['Date'] == 20191103)].head()

Unnamed: 0,Unit,Date,Hour Ending,MWH
7343,WEBBER_S_WSP1,20191103,100,0.0
7344,WEBBER_S_WSP1,20191103,200,0.0
7345,WEBBER_S_WSP1,20191103,0200D,0.0
7346,WEBBER_S_WSP1,20191103,300,0.0
7347,WEBBER_S_WSP1,20191103,400,0.0


In [41]:
# Due to Daylight saving, no value was recorded for 02:00am on 2019-03-10. 
WebbervilleDF.loc[(WebbervilleDF['Date'] == 20190310)].head()

Unnamed: 0,Unit,Date,Hour Ending,MWH
1632,WEBBER_S_WSP1,20190310,100,0.0
1633,WEBBER_S_WSP1,20190310,300,0.0
1634,WEBBER_S_WSP1,20190310,400,0.0
1635,WEBBER_S_WSP1,20190310,500,0.0
1636,WEBBER_S_WSP1,20190310,600,0.0


In [42]:
# Due to Daylight saving, no value was recorded for 02:00am on 2020-03-08. 
WebbervilleDF.loc[(WebbervilleDF['Date'] == 20200308)].head()

Unnamed: 0,Unit,Date,Hour Ending,MWH
10368,WEBBER_S_WSP1,20200308,100,0.0
10369,WEBBER_S_WSP1,20200308,300,0.0
10370,WEBBER_S_WSP1,20200308,400,0.0
10371,WEBBER_S_WSP1,20200308,500,0.0
10372,WEBBER_S_WSP1,20200308,600,0.0


## Transforming the Webberville Solar Farm Energy Output (MWH) Data

In [43]:
# Clean the WebbervilleDF
cleaned_WebbervilleDF = cleanRenewableFarmData(WebbervilleDF)

# Print the shape (rows, columns) of the DataFrame
print(cleaned_WebbervilleDF.shape)

# Display the DataFrame
cleaned_WebbervilleDF

(13871, 2)


Unnamed: 0,MWH,Date_Time
0,0.0,2019-01-01 01:00:00
1,0.0,2019-01-01 02:00:00
2,0.0,2019-01-01 03:00:00
3,0.0,2019-01-01 04:00:00
4,0.0,2019-01-01 05:00:00
...,...,...
13866,0.0,2020-07-31 20:00:00
13867,0.0,2020-07-31 21:00:00
13868,0.0,2020-07-31 22:00:00
13869,0.0,2020-07-31 23:00:00


In [44]:
# Check the function properly handled daylight savings
# Two different MWH values are displayed for 02:00am on 2019-11-03
cleaned_WebbervilleDF.loc[(cleaned_WebbervilleDF['Date_Time'] == '2019-11-03 02:00:00')]

Unnamed: 0,MWH,Date_Time
7344,0.0,2019-11-03 02:00:00
7345,0.0,2019-11-03 02:00:00


In [45]:
# Check data types to make sure they are correct
cleaned_WebbervilleDF.dtypes

MWH                 float64
Date_Time    datetime64[ns]
dtype: object

## Export the Cleaned Webberville Solar Farm Energy Output (MWH) DataFrame to a CSV File

In [46]:
# # Uncomment to output the cleaned DataFrame as a csv file
# cleaned_WebbervilleDF.to_csv(r'Output/Cleaned_Webberville_Generation.csv', index = False)

# Merging Webberville Energy Output Data with Solar Weather Data

## Define the Functions

In [47]:
def datetimeSplit(df):
    '''
    Split the datetime object into separate year, month, and day columns. 
    '''
    # Separate Year from datetime object and convert to int32
    df['Year'] = df['Date_Time'].dt.year
    df['Year'] = df['Year'].astype(int)
    # Separate Month from datetime object and convert to int32
    df['Month'] = df['Date_Time'].dt.month
    df['Month'] = df['Month'].astype(int)
    # Separate Day from datetime object and convert to int32
    df['Day'] = df['Date_Time'].dt.day
    df['Day'] = df['Day'].astype(int) 
    # Separate the Hour from datetime object and convert to int32
    df['Hour'] = df['Date_Time'].dt.hour
    df['Hour'] = df['Hour'].astype(int)
    return df

## Merge the weather and energy output DataFrames

In [48]:
# Merge the cleaned weather data with the cleaned solar farm data using an outer join
WebbervilleSolarMWH = pd.merge(cleaned_hourlyWeatherDF, cleaned_WebbervilleDF, on='Date_Time', how='outer')

# Display the DataFrame
WebbervilleSolarMWH

Unnamed: 0,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent,Date_Time,MWH
0,Clear,43,6.7,0,1,88,2019-01-01 00:00:00,0.0
1,Clear,43,6.7,0,1,88,2019-01-01 01:00:00,0.0
2,Clear,43,6.7,0,1,89,2019-01-01 02:00:00,0.0
3,Clear,43,6.7,0,1,90,2019-01-01 03:00:00,0.0
4,Clear,43,6.7,0,1,90,2019-01-01 04:00:00,0.0
...,...,...,...,...,...,...,...,...
13868,Partly cloudy,79,6.9,73,1,58,2020-07-31 19:00:00,0.0
13869,Partly cloudy,79,6.9,73,1,62,2020-07-31 20:00:00,0.0
13870,Partly cloudy,79,6.9,73,1,66,2020-07-31 21:00:00,0.0
13871,Partly cloudy,79,6.9,73,1,71,2020-07-31 22:00:00,0.0


## Clean the Marged DataFrame

In [49]:
# Count the Null Values
# Displays null values due to daylight saving (3/10/2019  2:00:00 AM and 3/8/2020  2:00:00 AM)
WebbervilleSolarMWH.isna().sum()

Weather_Description    0
Temperature_F          0
Sunhour                0
CloudCover_percent     0
uvIndex                0
Humidity_percent       0
Date_Time              0
MWH                    2
dtype: int64

In [50]:
# Drop the null values
WebbervilleSolarMWH.dropna(inplace=True)

# Print the shape (rows, columns) of the DataFrame
WebbervilleSolarMWH.shape

(13871, 8)

In [51]:
# The solar power is generated by three types of panels:
    # 270 Watts panel - 32,018; 275 Watts panel - 63,238; 280 Watts panel - 32,022.
# Adding the 270 Watt panel number to the 280 Watt panel number is about the same in total to the 275 Watt panels.
    # Hence we can estimate that all panels are 275 Watt type panels and there are 127,278 of them.
# The per panel output is the total MHW/127,278.
WebbervilleSolarMWH['MWH_perPanel'] = WebbervilleSolarMWH['MWH']/127278

In [52]:
# Split the Date_Time column into separate columns for year, month, and date
# Splitting the Date_Time column so exploratory analysis can be performed based on month, year, and time features
WebbervilleSolarMWH = datetimeSplit(WebbervilleSolarMWH)

In [53]:
# Re-Order the columns
WebbervilleSolarMWH = WebbervilleSolarMWH[["Date_Time", "Year", "Month", "Day", "Hour", "MWH", "MWH_perPanel","Temperature_F", "Humidity_percent", "Sunhour", "CloudCover_percent", "uvIndex", "Weather_Description"]]

In [54]:
# Print the shape (rows, columns) of the DataFrame
print(WebbervilleSolarMWH.shape)

# Display the DataFrame
WebbervilleSolarMWH.head()

(13871, 13)


Unnamed: 0,Date_Time,Year,Month,Day,Hour,MWH,MWH_perPanel,Temperature_F,Humidity_percent,Sunhour,CloudCover_percent,uvIndex,Weather_Description
0,2019-01-01 00:00:00,2019,1,1,0,0.0,0.0,43,88,6.7,0,1,Clear
1,2019-01-01 01:00:00,2019,1,1,1,0.0,0.0,43,88,6.7,0,1,Clear
2,2019-01-01 02:00:00,2019,1,1,2,0.0,0.0,43,89,6.7,0,1,Clear
3,2019-01-01 03:00:00,2019,1,1,3,0.0,0.0,43,90,6.7,0,1,Clear
4,2019-01-01 04:00:00,2019,1,1,4,0.0,0.0,43,90,6.7,0,1,Clear


In [55]:
# Check the final data types
WebbervilleSolarMWH.dtypes

Date_Time              datetime64[ns]
Year                            int32
Month                           int32
Day                             int32
Hour                            int32
MWH                           float64
MWH_perPanel                  float64
Temperature_F                   int32
Humidity_percent                int32
Sunhour                       float64
CloudCover_percent              int32
uvIndex                         int32
Weather_Description            object
dtype: object

## Export the Final DataFrame (Merged Solar Energy Output and Weather Data) to a CSV File

In [56]:
# # Uncomment to output the final merged DataFrame as a csv file
# WebbervilleSolarMWH.to_csv(r'Output/Webberville_Solar_MWH.csv', index = False)

# Load the Data into MongoDB 


## Initial Imports

In [None]:
# Import Dependencies
import config
import pymongo

In [None]:
# Set string variables
DEFAULT_DATABASE = 'wind_solar_data' 
USERNAME = config.USERNAME
PASSWORD = config.PASSWORD

## Connect to MongoDB 

In [None]:
#create connection to database
client = pymongo.MongoClient(f"mongodb+srv://{USERNAME}:{PASSWORD}@austin-green-energy.pwzpm.mongodb.net/{DEFAULT_DATABASE}?retryWrites=true&w=majority")
try:
    client.server_info()
    print("Mongodb connected")
except:
    print("The Mongodb failed to connect. Check username/password in connection string.")


## Uploading the Solar Data to the Database

In [None]:
# Select database
db = client.get_database('wind_solar_data')
# Select collection
collection = db.solar_data

# Pull the csv file from Output folder
solar_data = pd.read_csv('.\Output\Webberville_Solar_MWH.csv')  
# Turn the csv into a JSON object
solar_data_json = json.loads(solar_data.to_json(orient='records'))

# Remove what is in the collection currently
collection.remove()
# Insert the new JSON data into the database
collection.insert(solar_data_json)

## Pull the Solar Data from the Database and Upload into DataFrame

In [None]:
# Select database
db = client.get_database('wind_solar_data')
# sSlect collection
collection = db.solar_data

# Pull collection into dataframe
solar_df = pd.DataFrame(list(collection.find()))
solar_df