# Cleaning University of Massachussets Amherst Power Consumption Dataset

The solar data retrieved from [UMass Solar Energy](https://www.umass.edu/sustainability/climate-change-energy/solar/15000-solar-panels-5-buildings-2-parking-lots?_ga=2.23708602.1441216145.1605983844-2090397201.1605471456) regarding power consumption in UCB's Mechanical Engineering building is in the form of a `json` format. Here we'll be converting it into a `csv` file for later data analysis. 
The weather data retrieved from [NREL](https://maps.nrel.gov/nsrdb-viewer/)

In [32]:
import numpy as np
import pandas as pd
import seaborn as sns
import json
import pprint
from datetime import datetime

In [54]:
#Format into dates
def format_time(unix_timestamp):
    format = '%Y-%m-%d'
    ts = int(unix_timestamp)
    return datetime.utcfromtimestamp(ts).strftime(format)

#Create list of datetime objects from timestamps
def gen_datetimes(li):
    datetimes = []
    for time in li:
        datetimes.append(datetime.utcfromtimestamp(time))
    return datetimes

def get_weekday2(year, month, day):
    dates = pd.DataFrame()
    dates['y'] = year
    dates['m'] = month
    dates['d'] = day
    dates['dates'] = df['year'].astype('str') + 
    return [1 if (datetime.strptime(str(str(y) + '-' + str(m) + '-' + str(d)), "%Y-%m-%d").weekday() >= 5) else 0 for (y,m,d) in zip(year, month, day)]

#Get day of week based on date
def get_weekday(dates):
    return [1 if (datetime.strptime(d,"%Y-%m-%d").weekday() >= 5) else 0 for d in dates]

In [34]:
def clean_data(location, skiprows = 0):
    df_weather = pd.read_csv(location, skiprows=skiprows)
    df_weather = df_weather.drop(columns=['Hour', 'Minute'])
    df_weather = df_weather[df_weather.DNI != 0]

    #Take mean, max and min for each DNI in DataFrame and mean for everything else
    
    max_dni = df_weather.groupby(['Year', 'Month', 'Day']).max().reset_index()['DNI']
    min_dni = df_weather.groupby(['Year', 'Month', 'Day']).min().reset_index()['DNI']
    
    df_weather = pd.DataFrame(df_weather.groupby(['Year', 'Month', 'Day']).mean().reset_index())
    
    df_weather.insert(4, 'DNI Max', max_dni)
    df_weather.insert(5, 'DNI Min', min_dni)
    
    return df_weather

In [47]:
def training_prep(data, square_feet, building_type):
    data = data[['Year', 'Month', 'Day', 'DNI', 'DNI Max', 'DNI Min', 'Wind Speed', 'Precipitable Water', 'Wind Direction', 'Relative Humidity', 'Temperature', 'Pressure']]
    data['Square Feet'] = square_feet
    data['Type'] = building_type
    data['Weekday'] = get_weekday2(data['Year'], data['Month'], data['Year'])
    return data

In [36]:
df_weather = clean_data('Datasets/Amherst_Weather.csv')
df_weather.head()

Unnamed: 0,Year,Month,Day,DHI,DNI Max,DNI Min,DNI,GHI,Clearsky DHI,Clearsky DNI,Clearsky GHI,Wind Speed,Precipitable Water,Wind Direction,Relative Humidity,Temperature,Pressure
0,2017,1,1,82.625,838,310,663.5,286.25,73.75,729.0,304.625,0.2375,0.551875,283.9125,89.03125,0.5,1000.0
1,2017,1,2,109.625,649,150,330.625,210.25,68.25,716.875,297.125,0.1125,1.161,101.95,83.9825,0.5,1010.0
2,2017,1,4,125.5,384,31,194.75,175.25,74.75,627.0,285.0,0.225,1.46225,248.15,100.0,2.5,980.0
3,2017,1,5,74.5,769,493,652.75,279.625,74.625,736.625,313.375,0.3125,0.483875,264.0375,58.5325,-2.5,990.0
4,2017,1,6,77.25,845,521,664.125,287.5,73.25,747.75,317.5,0.25,0.429875,309.15,61.96375,-2.875,1000.0


In [37]:
df_weather.to_csv('Datasets/Amherst_Weather_Cleaned.csv')

In [49]:
df_weather = clean_data('Datasets/AnnArbor_Weather.csv')
df_weather.head()

Unnamed: 0,Year,Month,Day,DHI,DNI Max,DNI Min,DNI,GHI,Clearsky DHI,Clearsky DNI,Clearsky GHI,Wind Speed,Precipitable Water,Wind Direction,Relative Humidity,Temperature,Pressure
0,2019,1,1,75.222222,178,3,52.111111,90.444444,46.888889,612.666667,246.555556,0.377778,1.033333,295.222222,86.873333,4.133333,993.888889
1,2019,1,2,71.3,751,17,396.0,194.1,38.6,686.6,248.3,0.18,0.4,281.5,50.143,-1.5,1011.4
2,2019,1,3,75.7,752,7,299.7,170.9,48.9,608.9,240.6,0.24,0.74,290.4,83.652,1.97,997.4
3,2019,1,4,90.7,576,34,281.9,171.4,41.7,636.5,238.5,0.19,1.24,221.0,83.261,1.97,997.3
4,2019,1,5,53.666667,21,3,10.0,57.0,40.333333,527.333333,219.333333,0.2,2.266667,177.666667,94.24,2.0,986.0


In [56]:
training_prep(df_weather, 10000, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


ValueError: unconverted data remains: 19