# Connect to MongoDB

In [1]:
# Import Dependencies
import pandas as pd
import json
import pymongo
# Import Username / Password
import config

In [2]:
# Connection settings: Atlas credentials come from the local config module,
# the database name is fixed here
USERNAME, PASSWORD = config.USERNAME, config.PASSWORD
DEFAULT_DATABASE = "wind_solar_data"

In [3]:
# Establish connection to database.
# Credentials are passed as keyword arguments instead of being interpolated into
# the URI: credentials embedded in a MongoDB URI must be percent-escaped, so a
# password containing characters such as "@", ":" or "/" would otherwise corrupt
# the connection string. The keyword values must be the raw (unescaped) strings.
MONGO_URI = (
    "mongodb+srv://austin-green-energy.pwzpm.mongodb.net/"
    f"{DEFAULT_DATABASE}?retryWrites=true&w=majority"
)
try:
    # Constructing the client is inside the try because it can itself raise
    # (for example on an invalid URI or configuration)
    client = pymongo.MongoClient(MONGO_URI, username=USERNAME, password=PASSWORD)
    # Ask the server for its info so that a connection/authentication problem
    # surfaces in this cell rather than in a later query
    client.server_info()
    print("Mongodb connected")
except pymongo.errors.PyMongoError as error:
    # Only driver errors are handled here; anything else (e.g. KeyboardInterrupt)
    # propagates untouched
    print("The Mongodb failed to connect. Check username/password in connection string.")
    print(f"Details: {error}")
    # Re-raise so that "Run All" stops here instead of failing later with a
    # less helpful error
    raise

Mongodb connected


In [4]:
# Select the database through the shared DEFAULT_DATABASE constant so this cell
# always reads from the database named in the connection URI
db = client.get_database(DEFAULT_DATABASE)
# Select the solar_data collection
collection = db.solar_data

# Pull every document of the collection into a DataFrame
solar_df = pd.DataFrame(list(collection.find()))

# Print the shape (rows, columns) of the DataFrame
print(solar_df.shape)

# Display the first rows of the DataFrame
solar_df.head()

(13871, 14)


Unnamed: 0,_id,Date_Time,Year,Month,Day,Hour,MWH,MWH_perPanel,Temperature_F,Humidity_percent,Sunhour,CloudCover_percent,uvIndex,Weather_Description
0,5f986632c1c5e33be42804c2,2019-01-01 00:00:00,2019,1,1,0,0.0,0.0,43,88,6.7,0,1,Clear
1,5f986632c1c5e33be42804c3,2019-01-01 01:00:00,2019,1,1,1,0.0,0.0,43,88,6.7,0,1,Clear
2,5f986632c1c5e33be42804c4,2019-01-01 02:00:00,2019,1,1,2,0.0,0.0,43,89,6.7,0,1,Clear
3,5f986632c1c5e33be42804c5,2019-01-01 03:00:00,2019,1,1,3,0.0,0.0,43,90,6.7,0,1,Clear
4,5f986632c1c5e33be42804c6,2019-01-01 04:00:00,2019,1,1,4,0.0,0.0,43,90,6.7,0,1,Clear


In [5]:
# Show the last five rows as a check that the load reached the end of the data
solar_df.tail(n=5)

Unnamed: 0,_id,Date_Time,Year,Month,Day,Hour,MWH,MWH_perPanel,Temperature_F,Humidity_percent,Sunhour,CloudCover_percent,uvIndex,Weather_Description
13866,5f986632c1c5e33be4283aec,2020-07-31 19:00:00,2020,7,31,19,0.0,0.0,79,58,6.9,73,1,Partly cloudy
13867,5f986632c1c5e33be4283aed,2020-07-31 20:00:00,2020,7,31,20,0.0,0.0,79,62,6.9,73,1,Partly cloudy
13868,5f986632c1c5e33be4283aee,2020-07-31 21:00:00,2020,7,31,21,0.0,0.0,79,66,6.9,73,1,Partly cloudy
13869,5f986632c1c5e33be4283aef,2020-07-31 22:00:00,2020,7,31,22,0.0,0.0,79,71,6.9,73,1,Partly cloudy
13870,5f986632c1c5e33be4283af0,2020-07-31 23:00:00,2020,7,31,23,0.0,0.0,79,76,6.9,73,1,Partly cloudy


# Clean the DataFrame

In [6]:
# Check the initial data types: list each column's dtype as loaded,
# before any conversion is applied
solar_df.dtypes

_id                     object
Date_Time               object
Year                     int64
Month                    int64
Day                      int64
Hour                     int64
MWH                    float64
MWH_perPanel           float64
Temperature_F            int64
Humidity_percent         int64
Sunhour                float64
CloudCover_percent       int64
uvIndex                  int64
Weather_Description     object
dtype: object

In [7]:
# Cast the Date_Time column to datetime64[ns]; every other column keeps its dtype
solar_df = solar_df.astype({"Date_Time": "datetime64[ns]"})

In [8]:
# Verify the data types: list the dtypes again to confirm that Date_Time
# is now datetime64[ns]
solar_df.dtypes

_id                            object
Date_Time              datetime64[ns]
Year                            int64
Month                           int64
Day                             int64
Hour                            int64
MWH                           float64
MWH_perPanel                  float64
Temperature_F                   int64
Humidity_percent                int64
Sunhour                       float64
CloudCover_percent              int64
uvIndex                         int64
Weather_Description            object
dtype: object

In [9]:
# Drop the _id column.
# The result is rebound instead of using inplace=True, and errors="ignore" is
# passed, so the cell is safe to re-run: a second execution no longer raises
# KeyError because _id is already gone. The redundant axis=1 is removed since
# columns= already names the axis.
solar_df = solar_df.drop(columns=["_id"], errors="ignore")

In [10]:
# Report the dimensions of the cleaned frame as (rows, columns)
n_rows, n_cols = solar_df.shape
print((n_rows, n_cols))

# Preview the first five rows of the cleaned frame
solar_df.head(n=5)

(13871, 13)


Unnamed: 0,Date_Time,Year,Month,Day,Hour,MWH,MWH_perPanel,Temperature_F,Humidity_percent,Sunhour,CloudCover_percent,uvIndex,Weather_Description
0,2019-01-01 00:00:00,2019,1,1,0,0.0,0.0,43,88,6.7,0,1,Clear
1,2019-01-01 01:00:00,2019,1,1,1,0.0,0.0,43,88,6.7,0,1,Clear
2,2019-01-01 02:00:00,2019,1,1,2,0.0,0.0,43,89,6.7,0,1,Clear
3,2019-01-01 03:00:00,2019,1,1,3,0.0,0.0,43,90,6.7,0,1,Clear
4,2019-01-01 04:00:00,2019,1,1,4,0.0,0.0,43,90,6.7,0,1,Clear


# Exploration to Uncover Data Trends

## Initial Imports

In [11]:
# Import Matplotlib
import matplotlib.pyplot as plt

## Characterize the Data

In [12]:
# Display the column names that remain in the DataFrame
solar_df.columns

Index(['Date_Time', 'Year', 'Month', 'Day', 'Hour', 'MWH', 'MWH_perPanel',
       'Temperature_F', 'Humidity_percent', 'Sunhour', 'CloudCover_percent',
       'uvIndex', 'Weather_Description'],
      dtype='object')

In [13]:
# Temperature_F: number of distinct values, then the rows per value
temperature_counts = solar_df["Temperature_F"].value_counts()
print(
    "Temperature Value Counts",
    "Length of Value Counts:",
    len(temperature_counts),
    "------",
    sep="\n",
)
temperature_counts

Temperature Value Counts
Length of Value Counts:
17
------


74    1488
79    1464
77     744
65     744
43     744
83     744
62     744
48     744
58     743
49     743
39     721
76     720
71     720
42     720
56     720
47     696
52     672
Name: Temperature_F, dtype: int64

In [14]:
# Value Counts: Humidity
# Count the rows per distinct humidity value once and reuse the result for both
# the printed summary and the displayed table
humidity_counts = solar_df["Humidity_percent"].value_counts()
# Title spelling corrected (was "Humidty")
print("Humidity Value Counts")
print("Length of Value Counts:")
print(len(humidity_counts))
print("------")
humidity_counts

Humidty Value Counts
Length of Value Counts:
79
------


92     355
91     347
93     335
90     334
89     324
      ... 
100      9
26       8
24       6
22       4
23       2
Name: Humidity_percent, Length: 79, dtype: int64

In [15]:
# Sunhour: number of distinct values, then the five most frequent ones
sunhour_counts = solar_df["Sunhour"].value_counts()
print(
    "Sunhour Value Counts",
    "Length of Value Counts:",
    len(sunhour_counts),
    "------",
    sep="\n",
)
sunhour_counts.head()

Sunhour Value Counts
Length of Value Counts:
46
------


11.6    3768
10.2     864
9.3      720
10.4     600
10.3     480
Name: Sunhour, dtype: int64

In [16]:
# CloudCover_percent: number of distinct values, then the rows per value
cloud_cover_counts = solar_df["CloudCover_percent"].value_counts()
print(
    "CloudCover Value Counts",
    "Length of Value Counts:",
    len(cloud_cover_counts),
    "------",
    sep="\n",
)
cloud_cover_counts

CloudCover Value Counts
Length of Value Counts:
15
------


0      2232
74     1487
85     1464
93      744
27      744
34      744
73      744
11      743
6       721
86      720
30      720
4       720
32      720
17      696
100     672
Name: CloudCover_percent, dtype: int64

In [17]:
# uvIndex: rows per distinct value
print("uvIndex Value Counts")
uv_index_counts = solar_df["uvIndex"].value_counts()
uv_index_counts

uvIndex Value Counts


1    13871
Name: uvIndex, dtype: int64

In [18]:
# Weather_Description: number of distinct labels, then the rows per label
weather_counts = solar_df["Weather_Description"].value_counts()
print(
    "Weather_Description Value Counts",
    "Length of Value Counts:",
    len(weather_counts),
    "------",
    sep="\n",
)
weather_counts

Weather_Description Value Counts
Length of Value Counts:
8
------


Partly cloudy                     5832
Clear                             2232
Patchy rain possible              2208
Thundery outbreaks possible        744
Cloudy                             743
Patchy light rain with thunder     720
Moderate rain at times             720
Fog                                672
Name: Weather_Description, dtype: int64