# Solar ETL: Extract, Transform, Load

## Initial Imports

In [1]:
# Import Dependencies
import requests
import pandas as pd
import json
import numpy as np
import datetime

In [2]:
# Import API Key
from config import key

# To create an API key, visit the site below:
#https://www.worldweatheronline.com/developer/my/analytics.aspx?key_id=222419

# Extracting the Solar Weather Data

## Define the Functions

In [3]:
def makeARequest(location, startDate, endDate, yourAPIKey):
    '''
    Make a request to the worldweatheronline local history weather API page.

    Parameters:
        location: value sent as the API's `q` parameter
        startDate: value sent as the API's `date` parameter
        endDate: value sent as the API's `enddate` parameter
        yourAPIKey: value sent as the API's `key` parameter

    Returns:
        The decoded JSON body when the status code is 200. For any other
        status code the code is printed and None is returned.
    '''
    # Documentation on how to make an API Request using World Weather Online:
    # https://www.worldweatheronline.com/developer/api/docs/local-city-town-weather-api.aspx

    baseURL = "http://api.worldweatheronline.com/premium/v1/past-weather.ashx"

    # Let requests build the query string so that every value is URL-encoded
    queryParameters = {
        "q": location,
        "date": startDate,
        "enddate": endDate,
        "tp": "1",         # Specifies the time interval in hours
        "format": "json",  # The output format to return
        "key": yourAPIKey,
    }

    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(baseURL, params=queryParameters, timeout=30)

    # Anything other than 200 is reported and signalled to the caller with None
    if response.status_code != 200:
        print(response.status_code)
        return None

    # Turn the response into a JSON object
    return response.json()

In [4]:
def monthlyHistoricalSolarDF(firstDayOfMonth, lastDayOfMonth, jsonResponse):
    '''
    Pull solar weather variables from the response JSON and turn into Pandas DataFrame.

    Parameters:
        firstDayOfMonth: first date of the range, as a 'YYYY-MM-DD' string
        lastDayOfMonth: last date of the range (inclusive), as a 'YYYY-MM-DD' string
        jsonResponse: decoded API response; jsonResponse["data"]["weather"] must hold
            one entry per day of the range, each with at least 24 "hourly" entries

    Returns:
        A DataFrame with one row per hour and the columns Date, Time,
        Weather_Description, Temperature_F, Sunhour, CloudCover_percent,
        uvIndex and Humidity_percent. Values are stored exactly as they
        appear in the response.

    Raises:
        IndexError if the response holds fewer days or hours than expected.
    '''
    # Create a datetime object from the firstDayOfMonth and lastDayOfMonth input strings
    first = datetime.datetime.strptime(firstDayOfMonth, '%Y-%m-%d')
    last = datetime.datetime.strptime(lastDayOfMonth, '%Y-%m-%d')
    # Count the days from the real date difference (inclusive of both ends).
    # Comparing only the day-of-month numbers breaks when the range crosses a month boundary.
    numberOfDays = (last - first).days + 1

    weatherByDay = jsonResponse["data"]["weather"]

    # Initiate the HourlyHistoricalWeather list
    HourlyHistoricalWeather = []

    # For each day of the range
    for dayIndex in range(numberOfDays):
        dailyWeather = weatherByDay[dayIndex]
        # and each hour of each day
        for hourIndex in range(24):
            hourlyWeather = dailyWeather["hourly"][hourIndex]
            # append the solar weather variables to the HourlyHistoricalWeather list
            HourlyHistoricalWeather.append({
                "Date" : dailyWeather["date"],
                "Time" : hourlyWeather["time"],
                "Weather_Description" : hourlyWeather["weatherDesc"][0]["value"],
                "Temperature_F" : hourlyWeather["tempF"],
                "Sunhour" : dailyWeather["sunHour"],  # daily value, repeated on every hourly row
                "CloudCover_percent" : hourlyWeather["cloudcover"],
                "uvIndex" : hourlyWeather["uvIndex"],
                "Humidity_percent" : hourlyWeather["humidity"]
            })

    # Store the solar weather variables in a Pandas DataFrame
    weatherDataFrame = pd.DataFrame(HourlyHistoricalWeather)
    return weatherDataFrame

## Define the Webberville Solar Farm Location

In [5]:
# Latitude and longitude of the Webberville Solar Farm as a single "lat,long" string.
# This is the location argument passed to makeARequest.
latLong = "30.238333,-97.508611"

## API Calls for January 2017 - July 2020

In [6]:
# Define the yearList and monthList variables used to build the API date ranges
# yearList covers 2017 - 2020; monthList holds the months 1 through 12
# (np.arange excludes the stop value 13)
yearList = [2017, 2018, 2019, 2020]
monthList = np.arange(1,13,1)

In [7]:
# Collect the last calendar day of every month, as a 'YYYY-MM-DD' string
lastDayOfMonth = []

for year in yearList:
    for month in monthList:
        if month == 12:
            # December always ends on the 31st
            monthEnd = datetime.date(year=year, month=12, day=31)
        else:
            # Step back one day from the 1st of the following month;
            # this yields the right answer for February in leap years too
            firstOfNextMonth = datetime.date(year=year, month=month + 1, day=1)
            monthEnd = firstOfNextMonth - datetime.timedelta(days=1)
        lastDayOfMonth.append(monthEnd.strftime('%Y-%m-%d'))

# Keep only the first 43 months (January 2017 - July 2020),
# since the 2020 MWH data only goes through July 31st, 2020
lastDayOfMonth = lastDayOfMonth[:43]

In [8]:
# The 1st of every month in every year, as a 'YYYY-MM-DD' string
allMonthStarts = [
    datetime.date(year, month, 1).strftime('%Y-%m-%d')
    for year in yearList
    for month in monthList
]

# Keep only the first 43 months (January 2017 - July 2020),
# since the 2020 MWH data only goes through July 31st, 2020
firstDayOfMonth = allMonthStarts[:43]

In [9]:
# Use the start and end dates of each month to make an API call
# and collect that month's hourly weather DataFrame
monthlyFrames = []
for monthStart, monthEnd in zip(firstDayOfMonth, lastDayOfMonth):
    responseJson = makeARequest(latLong, monthStart, monthEnd, key)
    monthlyFrames.append(monthlyHistoricalSolarDF(monthStart, monthEnd, responseJson))

# Combine all months in one step. DataFrame.append was removed in pandas 2.0,
# and a single concat avoids re-copying the growing DataFrame on every iteration.
hourlyWeatherDF = pd.concat(monthlyFrames)

# Display the DataFrame
hourlyWeatherDF.head()

Unnamed: 0,Date,Time,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent
0,2017-01-01,0,Mist,58,8.7,11,1,95
1,2017-01-01,100,Mist,57,8.7,11,1,95
2,2017-01-01,200,Mist,56,8.7,11,1,95
3,2017-01-01,300,Mist,56,8.7,12,1,95
4,2017-01-01,400,Mist,55,8.7,13,1,89


In [10]:
# Re-Index the DataFrame with consecutive row numbers starting at 0.
# reset_index(drop=True) adapts to however many rows were downloaded, so no
# hard-coded row count (24 hours * number of days) has to be kept in sync.
hourlyWeatherDF = hourlyWeatherDF.reset_index(drop=True)

# Display the DataFrame
hourlyWeatherDF

Unnamed: 0,Date,Time,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent
0,2017-01-01,0,Mist,58,8.7,11,1,95
1,2017-01-01,100,Mist,57,8.7,11,1,95
2,2017-01-01,200,Mist,56,8.7,11,1,95
3,2017-01-01,300,Mist,56,8.7,12,1,95
4,2017-01-01,400,Mist,55,8.7,13,1,89
...,...,...,...,...,...,...,...,...
31387,2020-07-31,1900,Light rain shower,88,9.8,74,1,58
31388,2020-07-31,2000,Patchy light rain with thunder,86,9.8,77,1,62
31389,2020-07-31,2100,Patchy light rain with thunder,85,9.8,80,1,66
31390,2020-07-31,2200,Patchy light rain with thunder,83,9.8,78,1,71


## Explore the Data

In [11]:
# Check the Data Types
# Every column is reported as object, so the columns need converting before analysis.
hourlyWeatherDF.dtypes

Date                   object
Time                   object
Weather_Description    object
Temperature_F          object
Sunhour                object
CloudCover_percent     object
uvIndex                object
Humidity_percent       object
dtype: object

In [12]:
# Check the Value Counts
# Every date appears 24 times. This is expected since weather data is being extracted for each hour of each day.
hourlyWeatherDF['Date'].value_counts()

2019-03-23    24
2019-07-04    24
2018-12-24    24
2017-05-26    24
2017-10-04    24
              ..
2017-01-14    24
2019-03-07    24
2019-05-01    24
2020-06-11    24
2018-06-08    24
Name: Date, Length: 1308, dtype: int64

In [13]:
# Check the Value Counts
# Every time value appears 1308 times. This is expected since data is being extracted for 1308 days (Jan 2017 - July 2020).
hourlyWeatherDF['Time'].value_counts()

2300    1308
500     1308
1800    1308
900     1308
600     1308
700     1308
200     1308
1700    1308
100     1308
1300    1308
400     1308
1400    1308
1000    1308
0       1308
300     1308
1200    1308
1900    1308
2200    1308
2100    1308
2000    1308
1500    1308
1600    1308
1100    1308
800     1308
Name: Time, dtype: int64

# Performing Transformation on Solar Weather Data

## Define the Functions

In [14]:
def cleaningDataFrame_datetime(df):
    '''
    Clean the date and time columns and combine into a single Date_Time column.

    Parameters:
        df: DataFrame with a 'Date' column that pd.to_datetime can parse and a
            'Time' column holding the hour as hour * 100 (0, 100, ..., 2300)

    Returns:
        A new DataFrame without the 'Date' and 'Time' columns and with a
        'Date_Time' datetime column appended. The input DataFrame is not modified.
    '''
    # Work on a copy so the caller's DataFrame keeps its original columns
    df = df.copy()
    # Convert the Dates to a datetime object
    df['Date'] = pd.to_datetime(df['Date'])
    # Convert the Time to an integer hour of the day (e.g. 1300 -> 13)
    hourOfDay = df['Time'].astype(int) // 100
    # Add the hour as a timedelta to the date to get one combined timestamp.
    # pd.to_timedelta is used because casting integers with astype('timedelta64[h]')
    # is not supported by current pandas versions.
    df['Date_Time'] = df['Date'] + pd.to_timedelta(hourOfDay, unit='h')
    # Drop the Date and Time columns since a new Date_Time column has been created
    return df.drop(columns=['Date', 'Time'])

In [15]:
def cleaningDataFrame_solar(df):
    '''
    Cast the weather measurement columns to numeric data types.

    The columns are converted in place on the DataFrame that is passed in,
    and that same DataFrame is returned.
    '''
    # Target data type for each weather column, in conversion order
    targetTypes = (
        ('Temperature_F', int),
        ('Sunhour', float),
        ('CloudCover_percent', int),
        ('uvIndex', int),
        ('Humidity_percent', int),
    )
    for columnName, columnType in targetTypes:
        df[columnName] = df[columnName].astype(columnType)
    return df

## Clean the Original Hourly Weather DataFrame

In [16]:
# Combine the Date and Time columns of the original weather DataFrame into a single Date_Time column
hourlyWeatherDF_cleanedDateTime = cleaningDataFrame_datetime(hourlyWeatherDF)

# Display the first rows of the result
hourlyWeatherDF_cleanedDateTime.head()

Unnamed: 0,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent,Date_Time
0,Mist,58,8.7,11,1,95,2017-01-01 00:00:00
1,Mist,57,8.7,11,1,95,2017-01-01 01:00:00
2,Mist,56,8.7,11,1,95,2017-01-01 02:00:00
3,Mist,56,8.7,12,1,95,2017-01-01 03:00:00
4,Mist,55,8.7,13,1,89,2017-01-01 04:00:00


In [17]:
# Convert the weather variables to numeric data types
cleaned_hourlyWeatherDF = cleaningDataFrame_solar(hourlyWeatherDF_cleanedDateTime)

# Print the shape (rows, columns) of the DataFrame
print(cleaned_hourlyWeatherDF.shape)

# Display the first rows of the DataFrame
cleaned_hourlyWeatherDF.head()

(31392, 7)


Unnamed: 0,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent,Date_Time
0,Mist,58,8.7,11,1,95,2017-01-01 00:00:00
1,Mist,57,8.7,11,1,95,2017-01-01 01:00:00
2,Mist,56,8.7,11,1,95,2017-01-01 02:00:00
3,Mist,56,8.7,12,1,95,2017-01-01 03:00:00
4,Mist,55,8.7,13,1,89,2017-01-01 04:00:00


In [18]:
# Check data types to make sure they are correct:
# numeric weather columns, a datetime Date_Time column and a text Weather_Description column
cleaned_hourlyWeatherDF.dtypes

Weather_Description            object
Temperature_F                   int32
Sunhour                       float64
CloudCover_percent              int32
uvIndex                         int32
Humidity_percent                int32
Date_Time              datetime64[ns]
dtype: object

## Export the Cleaned Hourly Weather DataFrame to a CSV File

In [19]:
# # Uncomment to output the cleaned DataFrame as a csv file
# cleaned_hourlyWeatherDF.to_csv(r'../Resources/Output/Cleaned_Hourly_Solar_Weather.csv', index = False)

# Extracting and Transforming the Webberville Solar Energy Output (MWH) Data

## Define the Functions

In [20]:
def cleanRenewableFarmData(df):
    '''
    This function cleans the raw farm data.

    Parameters:
        df: DataFrame with the columns 'Unit', 'Date' (in YYYYMMDD form),
            'Hour Ending' (hour * 100, e.g. 100 ... 2400, optionally written
            with a trailing 'D' such as '0200D') and the measurement column(s)

    Returns:
        A new DataFrame without 'Unit', 'Date' and 'Hour Ending' and with a
        'Date_Time' datetime column appended. The input DataFrame is not modified.
    '''
    # Work with Hour Ending as clean text so numeric and 'D'-suffixed values are handled alike
    hourText = df['Hour Ending'].astype(str).str.strip()
    # A trailing 'D' indicates daylight saving (e.g. '0200D')
    isDaylightSavingHour = hourText.str.endswith('D')
    # Drop the two minute digits (and the 'D' when present) to keep only the hour number
    hourEnding = hourText.str[:-3].where(isDaylightSavingHour, hourText.str[:-2]).astype(int)

    # Adjust for midnight = 24:00 (not 00:00): hour 24 wraps around to hour 0 of the SAME date.
    # NOTE(review): this places "hour ending 2400" at 00:00 of its own date rather than
    # at 00:00 of the following day; kept as in the original cleaning logic - confirm intent.
    hourOffset = pd.to_timedelta(hourEnding % 24, unit='h')

    # Drop the columns that are constant or that are replaced by Date_Time
    cleaned = df.drop(columns=['Unit', 'Date', 'Hour Ending'])
    # Add the Hour Ending (aka time) to the date and store it as a new Date_Time column
    cleaned['Date_Time'] = pd.to_datetime(df['Date'], format='%Y%m%d') + hourOffset
    return cleaned

## Extracting the Webberville Solar Farm Energy Output (MWH) Data

In [21]:
# Read the 'Webberville' sheet of the 2017 workbook and store it as a Pandas DataFrame
data_2017 = "../Resources/Raw Data/Hackberry_and_Webberville_2017.xlsx"
Webberville_2017_DF = pd.read_excel(data_2017, sheet_name='Webberville')

# Print the shape (rows, columns) of the DataFrame
print(Webberville_2017_DF.shape)

# Display the first rows of the DataFrame
Webberville_2017_DF.head()

(8760, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
0,WEBBER_S_WSP1,20170101,100,0.0
1,WEBBER_S_WSP1,20170101,200,0.0
2,WEBBER_S_WSP1,20170101,300,0.0
3,WEBBER_S_WSP1,20170101,400,0.0
4,WEBBER_S_WSP1,20170101,500,0.0


In [22]:
# Create a pandas DataFrame for all Webberville Output data
WebbervilleDF = pd.DataFrame()

# Add the 2017 Webberville Data to the WebbervilleDF
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
WebbervilleDF = pd.concat([WebbervilleDF, Webberville_2017_DF])

# Print the shape (rows, columns) of the DataFrame
print(WebbervilleDF.shape)

# Display the DataFrame
WebbervilleDF.head()

(8760, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
0,WEBBER_S_WSP1,20170101,100,0.0
1,WEBBER_S_WSP1,20170101,200,0.0
2,WEBBER_S_WSP1,20170101,300,0.0
3,WEBBER_S_WSP1,20170101,400,0.0
4,WEBBER_S_WSP1,20170101,500,0.0


In [23]:
# Read the 'Webberville' sheet of the 2018 workbook and store it as a Pandas DataFrame
data_2018 = "../Resources/Raw Data/Hackberry_and_Webberville_2018.xlsx"
Webberville_2018_DF = pd.read_excel(data_2018, sheet_name='Webberville')

# Print the shape (rows, columns) of the DataFrame
print(Webberville_2018_DF.shape)

# Display the first rows of the DataFrame
Webberville_2018_DF.head()

(8760, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
0,WEBBER_S_WSP1,20180101,100,0.0
1,WEBBER_S_WSP1,20180101,200,0.0
2,WEBBER_S_WSP1,20180101,300,0.0
3,WEBBER_S_WSP1,20180101,400,0.0
4,WEBBER_S_WSP1,20180101,500,0.0


In [24]:
# Add the 2018 Webberville Data to the WebbervilleDF
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
WebbervilleDF = pd.concat([WebbervilleDF, Webberville_2018_DF])

# Print the shape (rows, columns) of the DataFrame
print(WebbervilleDF.shape)

# Display the last rows of the DataFrame
WebbervilleDF.tail()

(17520, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
8755,WEBBER_S_WSP1,20181231,2000,0.0
8756,WEBBER_S_WSP1,20181231,2100,0.0
8757,WEBBER_S_WSP1,20181231,2200,0.0
8758,WEBBER_S_WSP1,20181231,2300,0.0
8759,WEBBER_S_WSP1,20181231,2400,0.0


In [25]:
# Read the combined 2019 - 2020 generation CSV and store it as a Pandas DataFrame
data_2019_2020 = "../Resources/Raw Data/Webberville_Generation_2019_2020.csv"
Webberville_2019_2020_DF = pd.read_csv(data_2019_2020)

# Print the shape (rows, columns) of the DataFrame
print(Webberville_2019_2020_DF.shape)

# Display the DataFrame
Webberville_2019_2020_DF

(13871, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
0,WEBBER_S_WSP1,20190101,100,0.0
1,WEBBER_S_WSP1,20190101,200,0.0
2,WEBBER_S_WSP1,20190101,300,0.0
3,WEBBER_S_WSP1,20190101,400,0.0
4,WEBBER_S_WSP1,20190101,500,0.0
...,...,...,...,...
13866,WEBBER_S_WSP1,20200731,2000,0.0
13867,WEBBER_S_WSP1,20200731,2100,0.0
13868,WEBBER_S_WSP1,20200731,2200,0.0
13869,WEBBER_S_WSP1,20200731,2300,0.0


In [26]:
# Add the 2019 and 2020 Webberville Data to the WebbervilleDF
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
WebbervilleDF = pd.concat([WebbervilleDF, Webberville_2019_2020_DF])

# Print the shape (rows, columns) of the DataFrame
print(WebbervilleDF.shape)

# Display the last rows of the DataFrame
WebbervilleDF.tail()

(31391, 4)


Unnamed: 0,Unit,Date,Hour Ending,MWH
13866,WEBBER_S_WSP1,20200731,2000,0.0
13867,WEBBER_S_WSP1,20200731,2100,0.0
13868,WEBBER_S_WSP1,20200731,2200,0.0
13869,WEBBER_S_WSP1,20200731,2300,0.0
13870,WEBBER_S_WSP1,20200731,2400,0.0


In [27]:
# Re-Index the DataFrame with consecutive row numbers starting at 0.
# The yearly files each start their own index at 0, so the combined frame has repeated labels.
# reset_index(drop=True) avoids hard-coding the total number of rows.
WebbervilleDF = WebbervilleDF.reset_index(drop=True)

# Display the DataFrame
WebbervilleDF

Unnamed: 0,Unit,Date,Hour Ending,MWH
0,WEBBER_S_WSP1,20170101,100,0.0
1,WEBBER_S_WSP1,20170101,200,0.0
2,WEBBER_S_WSP1,20170101,300,0.0
3,WEBBER_S_WSP1,20170101,400,0.0
4,WEBBER_S_WSP1,20170101,500,0.0
...,...,...,...,...
31386,WEBBER_S_WSP1,20200731,2000,0.0
31387,WEBBER_S_WSP1,20200731,2100,0.0
31388,WEBBER_S_WSP1,20200731,2200,0.0
31389,WEBBER_S_WSP1,20200731,2300,0.0


## Explore the Data

In [28]:
# Check the data types
# Date is stored as an integer and Hour Ending as object, so both need cleaning
WebbervilleDF.dtypes

Unit            object
Date             int64
Hour Ending     object
MWH            float64
dtype: object

In [29]:
# Check the Value Counts
# The Unit column can be dropped since it holds the same value in every row.
WebbervilleDF['Unit'].value_counts()

WEBBER_S_WSP1    31391
Name: Unit, dtype: int64

In [30]:
# Check the Value Counts
# The value counts are unexpected: most dates have 24 rows, but
    # the November dates 2017-11-05, 2018-11-04 and 2019-11-03 have 25 rows
    # the March dates 2017-03-12, 2018-03-11, 2019-03-10 and 2020-03-08 have 23 rows
# Observation:
# These are the dates on which daylight saving time began (March) or ended (November).
# Daylight saving in 2019 occurred on 2019-03-10 and 2019-11-03. Daylight saving in 2020 occurred on 2020-03-08 and 2020-11-01.
WebbervilleDF['Date'].value_counts()

20191103    25
20181104    25
20171105    25
20191229    24
20180104    24
            ..
20190824    24
20190310    23
20170312    23
20200308    23
20180311    23
Name: Date, Length: 1308, dtype: int64

## Daylight Savings
2018 
Daylight saving time 2018 in Texas began at 2:00 AM on
Sunday, March 11
and ended at 2:00 AM on
Sunday, November 4

2017
Daylight saving time 2017 in Texas began at 2:00 AM on
Sunday, March 12
and ended at 2:00 AM on
Sunday, November 5

In [31]:
# NOTE: only the last expression in a notebook cell is displayed, so the output
# below this cell belongs to the 2020-03-08 lookup.

# Due to Daylight saving, two values for 02:00am were recorded on 2019-11-03 (200 and 0200D).
WebbervilleDF.loc[(WebbervilleDF['Date'] == 20191103)].head()

# Due to Daylight saving, no value was recorded for 02:00am on 2019-03-10. 
WebbervilleDF.loc[(WebbervilleDF['Date'] == 20190310)].head()

# Due to Daylight saving, no value was recorded for 02:00am on 2020-03-08. 
WebbervilleDF.loc[(WebbervilleDF['Date'] == 20200308)].head()

Unnamed: 0,Unit,Date,Hour Ending,MWH
27888,WEBBER_S_WSP1,20200308,100,0.0
27889,WEBBER_S_WSP1,20200308,300,0.0
27890,WEBBER_S_WSP1,20200308,400,0.0
27891,WEBBER_S_WSP1,20200308,500,0.0
27892,WEBBER_S_WSP1,20200308,600,0.0


## Transforming the Webberville Solar Farm Energy Output (MWH) Data

In [32]:
# Clean the WebbervilleDF: drop the Unit column and combine Date and Hour Ending into Date_Time
cleaned_Webberville_DF = cleanRenewableFarmData(WebbervilleDF)

# Print the shape (rows, columns) of the DataFrame
print(cleaned_Webberville_DF.shape)

# Display the DataFrame
cleaned_Webberville_DF

(31391, 2)


Unnamed: 0,MWH,Date_Time
0,0.0,2017-01-01 01:00:00
1,0.0,2017-01-01 02:00:00
2,0.0,2017-01-01 03:00:00
3,0.0,2017-01-01 04:00:00
4,0.0,2017-01-01 05:00:00
...,...,...
31386,0.0,2020-07-31 20:00:00
31387,0.0,2020-07-31 21:00:00
31388,0.0,2020-07-31 22:00:00
31389,0.0,2020-07-31 23:00:00


In [33]:
# Check the function properly handled daylight savings
# Two rows (the '200' and '0200D' readings) are expected for 02:00am on 2019-11-03
cleaned_Webberville_DF.loc[(cleaned_Webberville_DF['Date_Time'] == '2019-11-03 02:00:00')]

Unnamed: 0,MWH,Date_Time
24864,0.0,2019-11-03 02:00:00
24865,0.0,2019-11-03 02:00:00


In [34]:
# Check data types to make sure they are correct: a float MWH column and a datetime Date_Time column
cleaned_Webberville_DF.dtypes

MWH                 float64
Date_Time    datetime64[ns]
dtype: object

## Export the Cleaned Webberville Solar Farm Energy Output (MWH) DataFrame to a CSV File

In [35]:
# # Uncomment to output the cleaned DataFrame as a csv file
# cleaned_Webberville_DF.to_csv(r'../Resources/Output/Cleaned_Webberville_Generation.csv', index = False)

# Merging Webberville Energy Output Data with Solar Weather Data

## Define the Functions

In [36]:
def datetimeSplit(df):
    '''
    Add integer Year, Month, Day and Hour columns derived from the Date_Time column.

    The columns are added in place to the DataFrame that is passed in,
    and that same DataFrame is returned.
    '''
    # New column name -> datetime component to read from Date_Time
    components = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'day'),
        ('Hour', 'hour'),
    )
    for columnName, component in components:
        # Pull the component out of the datetime object and store it as an integer
        df[columnName] = getattr(df['Date_Time'].dt, component)
        df[columnName] = df[columnName].astype(int)
    return df

## Merge the Weather and Energy Output DataFrames

In [37]:
# Merge the cleaned weather data with the cleaned solar farm data using an outer join on Date_Time
# An outer join keeps timestamps that appear in only one of the two DataFrames (missing values become NaN)
WebbervilleSolarMWH = pd.merge(cleaned_hourlyWeatherDF, cleaned_Webberville_DF, on='Date_Time', how='outer')

# Display the DataFrame
WebbervilleSolarMWH

Unnamed: 0,Weather_Description,Temperature_F,Sunhour,CloudCover_percent,uvIndex,Humidity_percent,Date_Time,MWH
0,Mist,58,8.7,11,1,95,2017-01-01 00:00:00,0.0
1,Mist,57,8.7,11,1,95,2017-01-01 01:00:00,0.0
2,Mist,56,8.7,11,1,95,2017-01-01 02:00:00,0.0
3,Mist,56,8.7,12,1,95,2017-01-01 03:00:00,0.0
4,Mist,55,8.7,13,1,89,2017-01-01 04:00:00,0.0
...,...,...,...,...,...,...,...,...
31390,Light rain shower,88,9.8,74,1,58,2020-07-31 19:00:00,0.0
31391,Patchy light rain with thunder,86,9.8,77,1,62,2020-07-31 20:00:00,0.0
31392,Patchy light rain with thunder,85,9.8,80,1,66,2020-07-31 21:00:00,0.0
31393,Patchy light rain with thunder,83,9.8,78,1,71,2020-07-31 22:00:00,0.0


## Clean the Merged DataFrame

In [38]:
# Count the Null Values
# Displays null MWH values due to daylight saving (e.g. 3/10/2019  2:00:00 AM and 3/8/2020  2:00:00 AM)
# NOTE(review): 4 nulls are reported - presumably one skipped 2:00 AM per year, 2017 through 2020; confirm
WebbervilleSolarMWH.isna().sum()

Weather_Description    0
Temperature_F          0
Sunhour                0
CloudCover_percent     0
uvIndex                0
Humidity_percent       0
Date_Time              0
MWH                    4
dtype: int64

In [39]:
# Drop the rows that contain null values (modifies the DataFrame in place)
WebbervilleSolarMWH.dropna(inplace=True)

# Show the shape (rows, columns) of the DataFrame
WebbervilleSolarMWH.shape

(31391, 8)

In [40]:
# Add separate Year, Month, Day and Hour columns derived from the Date_Time column
# so exploratory analysis can be performed based on month, year, and time features
WebbervilleSolarMWH = datetimeSplit(WebbervilleSolarMWH)

In [41]:
# Re-Order the columns: timestamp and its parts first, then the MWH output, then the weather variables
WebbervilleSolarMWH = WebbervilleSolarMWH[["Date_Time", "Year", "Month", "Day", "Hour", "MWH","Temperature_F", "Humidity_percent", "Sunhour", "CloudCover_percent", "uvIndex", "Weather_Description"]]

In [42]:
# Print the shape (rows, columns) of the final DataFrame
print(WebbervilleSolarMWH.shape)

# Display the first rows of the DataFrame
WebbervilleSolarMWH.head()

(31391, 12)


Unnamed: 0,Date_Time,Year,Month,Day,Hour,MWH,Temperature_F,Humidity_percent,Sunhour,CloudCover_percent,uvIndex,Weather_Description
0,2017-01-01 00:00:00,2017,1,1,0,0.0,58,95,8.7,11,1,Mist
1,2017-01-01 01:00:00,2017,1,1,1,0.0,57,95,8.7,11,1,Mist
2,2017-01-01 02:00:00,2017,1,1,2,0.0,56,95,8.7,11,1,Mist
3,2017-01-01 03:00:00,2017,1,1,3,0.0,56,95,8.7,12,1,Mist
4,2017-01-01 04:00:00,2017,1,1,4,0.0,55,89,8.7,13,1,Mist


In [43]:
# Check the final data types of the merged DataFrame
WebbervilleSolarMWH.dtypes

Date_Time              datetime64[ns]
Year                            int32
Month                           int32
Day                             int32
Hour                            int32
MWH                           float64
Temperature_F                   int32
Humidity_percent                int32
Sunhour                       float64
CloudCover_percent              int32
uvIndex                         int32
Weather_Description            object
dtype: object

## Export the Final DataFrame (Merged Solar Energy Output and Weather Data) to a CSV File

In [44]:
# Output the final merged DataFrame as a csv file (without the index column)
# The MongoDB upload cell reads this file back in
WebbervilleSolarMWH.to_csv(r'../Resources/Output/Webberville_Solar_2017-2020_MWH.csv', index = False)

# Load the Data into MongoDB 


## Initial Imports

In [45]:
# Import Dependencies
import config
import pymongo

In [46]:
# Set the string variables used to build the MongoDB connection string
# The username and password are read from the local config module
DEFAULT_DATABASE = 'wind_solar_data' 
USERNAME = config.USERNAME
PASSWORD = config.PASSWORD

## Connect to MongoDB 

In [47]:
# Create the connection to the database
# NOTE(review): PyMongo expects the username and password in a connection string to be
# percent-escaped; confirm the values stored in config contain no reserved characters.
client = pymongo.MongoClient(f"mongodb+srv://{USERNAME}:{PASSWORD}@austin-green-energy.pwzpm.mongodb.net/{DEFAULT_DATABASE}?retryWrites=true&w=majority")
try:
    # server_info() forces a round trip to the server, so it fails if the connection is not usable
    client.server_info()
except pymongo.errors.PyMongoError as connectionError:
    # Catch only driver errors so that e.g. a keyboard interrupt is not swallowed
    print("The Mongodb failed to connect. Check username/password in connection string.")
    print(connectionError)
else:
    print("Mongodb connected")

## Uploading the Solar Data to the Database

In [48]:
# Select database
db = client.get_database('wind_solar_data')
# Select collection
collection = db.solar_data

# Pull the csv file from Output folder
solar_data = pd.read_csv('../Resources/Output/Webberville_Solar_2017-2020_MWH.csv')  
# Turn the csv into a list of JSON-style records (one dictionary per row)
solar_data_json = json.loads(solar_data.to_json(orient='records'))

# Remove what is in the collection currently
# (Collection.remove() and Collection.insert() were removed in PyMongo 4;
#  delete_many / insert_many are their replacements)
collection.delete_many({})
# Insert the new JSON data into the database
# insert_many rejects an empty list, so skip the call when there is nothing to upload
if solar_data_json:
    collection.insert_many(solar_data_json)

## Pull the Solar Data from the Database and Upload into DataFrame

In [49]:
# Select database
db = client.get_database('wind_solar_data')
# Select collection
collection = db.solar_data

# Pull every document of the collection into a DataFrame
solar_df = pd.DataFrame(list(collection.find()))
solar_df