# Data Acquisition
### NSRDB-The National Solar Radiation Database

- I'll start with a small set of data, using just one location.
- Stretch Goal: Create an application where you can load data for any location and get predictions from the nearest available location.

In [1]:
import os

import pandas as pd

In [2]:
# Request parameters for the NSRDB PSM3 download API, followed by a small
# request that reads only the metadata row of the response.
# String parameters are pasted into the query string verbatim, so spaces must
# be replaced with '+', i.e., change 'John Smith' to 'John+Smith'.

# Latitude and longitude of the location (floats, not strings).
lat, lon = 33.2164, -97.1292
# NSRDB API key. Set the NSRDB_API_KEY environment variable to use your own key.
# FIXME(security): the fallback literal is a credential committed with the
# notebook; rotate it and remove the literal once the environment variable is
# set everywhere this notebook runs.
api_key = os.environ.get('NSRDB_API_KEY', 'gs1TsTVlGbso40wwobyWP8OE5THNAzC8fVF8zohe')
# Attributes to extract (e.g., dhi, ghi, etc.), separated by commas.
attributes = 'ghi,dhi,dni,wind_speed,air_temperature,solar_zenith_angle'
# Year of data to download.
year = '2010'
# Leap-day handling: 'true' returns leap day data if present, 'false' does not.
leap_year = 'false'
# Time interval in minutes, i.e., '30' is half hour intervals. Valid intervals are 30 & 60.
interval = '30'
# Coordinated Universal Time (UTC): 'true' will use UTC, 'false' will use the
# local time zone of the data.
# NOTE: In order to use the NSRDB data in SAM, you must specify UTC as 'false'.
# SAM requires the data to be in the local time zone.
utc = 'true'
# Your full name, use '+' instead of spaces.
your_name = 'Dylla+Usher'
# Your reason for using the NSRDB.
reason_for_use = 'beta+testing'
# Your affiliation.
your_affiliation = 'no+institution'
# Your email address.
your_email = 'Dyllanjrusher00@gmail.com'
# Mailing-list opt-in flag sent with the request.
mailing_list = 'true'

# Build the request URL. The template is split across lines for readability;
# the resulting string is the same single-line URL as before.
url = (
    'https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv'
    '?wkt=POINT({lon}%20{lat})&names={year}&leap_day={leap}&interval={interval}'
    '&utc={utc}&full_name={name}&email={email}&affiliation={affiliation}'
    '&mailing_list={mailing_list}&reason={reason}&api_key={api}&attributes={attr}'
).format(
    year=year, lat=lat, lon=lon, leap=leap_year, interval=interval, utc=utc,
    name=your_name, email=your_email, mailing_list=mailing_list,
    affiliation=your_affiliation, reason=reason_for_use, api=api_key,
    attr=attributes,
)
# Read just the first 2 lines of the CSV (header + one row) to get the metadata.
info = pd.read_csv(url, nrows=1)
# Metadata for specified properties, e.g., timezone and elevation.
# Both are one-element Series because `info` holds a single row.
timezone, elevation = info['Local Time Zone'], info['Elevation']

In [3]:
# Preview the metadata frame read from the first lines of the CSV.
info.head()

Unnamed: 0,Source,Location ID,City,State,Country,Latitude,Longitude,Time Zone,Elevation,Local Time Zone,...,Cloud Type 11,Cloud Type 12,Fill Flag 0,Fill Flag 1,Fill Flag 2,Fill Flag 3,Fill Flag 4,Fill Flag 5,Surface Albedo Units,Version
0,NSRDB,693763,-,-,-,33.21,-97.14,0,203,-6,...,Dust,Smoke,,Missing Image,Low Irradiance,Exceeds Clearsky,Missing CLoud Properties,Rayleigh Violation,,3.0.6


In [4]:
# List every metadata field name; the wide frame above elides the middle columns.
info.columns

Index(['Source', 'Location ID', 'City', 'State', 'Country', 'Latitude',
       'Longitude', 'Time Zone', 'Elevation', 'Local Time Zone',
       'Clearsky DHI Units', 'Clearsky DNI Units', 'Clearsky GHI Units',
       'Dew Point Units', 'DHI Units', 'DNI Units', 'GHI Units',
       'Solar Zenith Angle Units', 'Temperature Units', 'Pressure Units',
       'Relative Humidity Units', 'Precipitable Water Units',
       'Wind Direction Units', 'Wind Speed', 'Cloud Type -15', 'Cloud Type 0',
       'Cloud Type 1', 'Cloud Type 2', 'Cloud Type 3', 'Cloud Type 4',
       'Cloud Type 5', 'Cloud Type 6', 'Cloud Type 7', 'Cloud Type 8',
       'Cloud Type 9', 'Cloud Type 10', 'Cloud Type 11', 'Cloud Type 12',
       'Fill Flag 0', 'Fill Flag 1', 'Fill Flag 2', 'Fill Flag 3',
       'Fill Flag 4', 'Fill Flag 5', 'Surface Albedo Units', 'Version'],
      dtype='object')

In [5]:
# Display the whole metadata frame (a single row, since it was read with nrows=1).
info

Unnamed: 0,Source,Location ID,City,State,Country,Latitude,Longitude,Time Zone,Elevation,Local Time Zone,...,Cloud Type 11,Cloud Type 12,Fill Flag 0,Fill Flag 1,Fill Flag 2,Fill Flag 3,Fill Flag 4,Fill Flag 5,Surface Albedo Units,Version
0,NSRDB,693763,-,-,-,33.21,-97.14,0,203,-6,...,Dust,Smoke,,Missing Image,Low Irradiance,Exceeds Clearsky,Missing CLoud Properties,Rayleigh Violation,,3.0.6


In [6]:
# Return all but first 2 lines of csv to get data (those 2 lines are the metadata):
df = pd.read_csv(url, skiprows=2)
# Set the time index in the pandas dataframe.
# 525600 is the number of minutes in a 365-day year (leap day is not requested).
# Floor division keeps `periods` an integer; a float `periods` is deprecated in
# recent pandas releases.
df = df.set_index(
    pd.date_range(
        '1/1/{yr}'.format(yr=year),
        freq=interval + 'min',
        periods=525600 // int(interval),
    )
)

In [7]:
# (rows, columns) of the downloaded frame; a 365-day year at 30-minute steps is 17520 rows.
df.shape

(17520, 11)

In [8]:
# Peek at the first rows to confirm the datetime index lines up with the Year/Month/Day/Hour/Minute columns.
df.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,GHI,DHI,DNI,Wind Speed,Temperature,Solar Zenith Angle
2010-01-01 00:00:00,2010,1,1,0,0,0,0,0,5.1,1,96.17
2010-01-01 00:30:00,2010,1,1,0,30,0,0,0,5.0,1,101.99
2010-01-01 01:00:00,2010,1,1,1,0,0,0,0,4.9,1,107.96
2010-01-01 01:30:00,2010,1,1,1,30,0,0,0,4.8,0,114.03
2010-01-01 02:00:00,2010,1,1,2,0,0,0,0,4.8,0,120.19


In [9]:
# Memory footprint in bytes of the index and of each column.
df.memory_usage()

Index                 140160
Year                  140160
Month                 140160
Day                   140160
Hour                  140160
Minute                140160
GHI                   140160
DHI                   140160
DNI                   140160
Wind Speed            140160
Temperature           140160
Solar Zenith Angle    140160
dtype: int64

In [10]:
# Raise the column display limit so none of the metadata columns are elided with '...'.
pd.set_option('display.max_columns', 100)

### Important information about dataset!
# The metadata row carries the units of each measurement and the legend for the
# 'Cloud Type' and 'Fill Flag' codes used in the data.
info.head()

Unnamed: 0,Source,Location ID,City,State,Country,Latitude,Longitude,Time Zone,Elevation,Local Time Zone,Clearsky DHI Units,Clearsky DNI Units,Clearsky GHI Units,Dew Point Units,DHI Units,DNI Units,GHI Units,Solar Zenith Angle Units,Temperature Units,Pressure Units,Relative Humidity Units,Precipitable Water Units,Wind Direction Units,Wind Speed,Cloud Type -15,Cloud Type 0,Cloud Type 1,Cloud Type 2,Cloud Type 3,Cloud Type 4,Cloud Type 5,Cloud Type 6,Cloud Type 7,Cloud Type 8,Cloud Type 9,Cloud Type 10,Cloud Type 11,Cloud Type 12,Fill Flag 0,Fill Flag 1,Fill Flag 2,Fill Flag 3,Fill Flag 4,Fill Flag 5,Surface Albedo Units,Version
0,NSRDB,693763,-,-,-,33.21,-97.14,0,203,-6,w/m2,w/m2,w/m2,c,w/m2,w/m2,w/m2,Degree,c,mbar,%,cm,Degrees,m/s,,Clear,Probably Clear,Fog,Water,Super-Cooled Water,Mixed,Opaque Ice,Cirrus,Overlapping,Overshooting,Unknown,Dust,Smoke,,Missing Image,Low Irradiance,Exceeds Clearsky,Missing CLoud Properties,Rayleigh Violation,,3.0.6


In [11]:
# Full list of metadata field names, for reference alongside the un-truncated row above.
info.columns

Index(['Source', 'Location ID', 'City', 'State', 'Country', 'Latitude',
       'Longitude', 'Time Zone', 'Elevation', 'Local Time Zone',
       'Clearsky DHI Units', 'Clearsky DNI Units', 'Clearsky GHI Units',
       'Dew Point Units', 'DHI Units', 'DNI Units', 'GHI Units',
       'Solar Zenith Angle Units', 'Temperature Units', 'Pressure Units',
       'Relative Humidity Units', 'Precipitable Water Units',
       'Wind Direction Units', 'Wind Speed', 'Cloud Type -15', 'Cloud Type 0',
       'Cloud Type 1', 'Cloud Type 2', 'Cloud Type 3', 'Cloud Type 4',
       'Cloud Type 5', 'Cloud Type 6', 'Cloud Type 7', 'Cloud Type 8',
       'Cloud Type 9', 'Cloud Type 10', 'Cloud Type 11', 'Cloud Type 12',
       'Fill Flag 0', 'Fill Flag 1', 'Fill Flag 2', 'Fill Flag 3',
       'Fill Flag 4', 'Fill Flag 5', 'Surface Albedo Units', 'Version'],
      dtype='object')

In [37]:

### Use this function in stretch goal:

def get_data(lat=33.2164, lon=-97.1292, year='2010', interval='30', api_key=None):
    """Download one year of NSRDB PSM3 data for a single point.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the point of interest.
    year : str or int
        Year to download; it is interpolated into the request and into the
        start date of the index.
    interval : str or int
        Sampling interval in minutes. Only 30 and 60 are accepted.
    api_key : str, optional
        NSRDB API key. When omitted, the ``NSRDB_API_KEY`` environment
        variable is used, falling back to the key bundled with this notebook.

    Returns
    -------
    pandas.DataFrame
        The requested attributes plus the Year/Month/Day/Hour/Minute columns
        returned by the service, indexed by a timezone-naive DatetimeIndex
        that starts on January 1 of ``year`` and advances by ``interval``
        minutes. The request asks for UTC timestamps (``utc=true``) and no
        leap day.

    Raises
    ------
    ValueError
        If ``interval`` is not 30 or 60, or if the number of rows returned
        does not match the length of the generated index (raised by
        ``DataFrame.set_index``).
    """
    minutes = int(interval)
    if minutes not in (30, 60):
        raise ValueError(
            'interval must be 30 or 60 minutes, got {!r}'.format(interval)
        )

    if api_key is None:
        # FIXME(security): the fallback literal is a credential committed with
        # the notebook; rotate it and drop the literal once NSRDB_API_KEY is
        # set wherever this runs.
        api_key = os.environ.get(
            'NSRDB_API_KEY', 'gs1TsTVlGbso40wwobyWP8OE5THNAzC8fVF8zohe'
        )

    # Each attribute is listed once, in the order the columns come back.
    attributes = ','.join([
        'cloud_type', 'dew_point', 'fill_flag', 'wind_speed', 'surface_albedo',
        'air_temperature', 'solar_zenith_angle', 'wind_direction', 'ghi',
    ])
    leap_year = 'false'
    utc = 'true'
    your_name = 'Dylla+Usher'
    reason_for_use = 'beta+testing'
    your_affiliation = 'no+institution'
    your_email = 'Dyllanjrusher00@gmail.com'
    mailing_list = 'true'

    url = (
        'https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv'
        '?wkt=POINT({lon}%20{lat})&names={year}&leap_day={leap}&interval={interval}'
        '&utc={utc}&full_name={name}&email={email}&affiliation={affiliation}'
        '&mailing_list={mailing_list}&reason={reason}&api_key={api}&attributes={attr}'
    ).format(
        year=year, lat=lat, lon=lon, leap=leap_year, interval=minutes, utc=utc,
        name=your_name, email=your_email, mailing_list=mailing_list,
        affiliation=your_affiliation, reason=reason_for_use, api=api_key,
        attr=attributes,
    )

    # The first two lines of the CSV are metadata; the data table starts after them.
    df = pd.read_csv(url, skiprows=2)

    # A 365-day year has 525600 minutes (leap day is not requested). Floor
    # division keeps `periods` an integer, which pandas requires.
    minutes_per_year = 365 * 24 * 60
    index = pd.date_range(
        '1/1/{yr}'.format(yr=year),
        freq='{}min'.format(minutes),
        periods=minutes_per_year // minutes,
    )
    return df.set_index(index)


In [22]:
# Fetch the default year for the Arkansas coordinates (the same point used as the first training site).
df = get_data(lat=35.234452, lon=-91.059679)
# Alternative: the function defaults, i.e. the location used at the top of the notebook.
# df = get_data()

In [23]:
# Count how often each fill-flag code occurs. Per the metadata row:
# 1 = Missing Image, 3 = Exceeds Clearsky, 4 = Missing Cloud Properties; flag 0 has no label there.
df['Fill Flag'].value_counts()

0    16287
4      774
1      442
3       17
Name: Fill Flag, dtype: int64

In [24]:
# Inspect the first rows of the extended attribute set returned by get_data.
df.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Cloud Type,Dew Point,Fill Flag,Wind Speed,Surface Albedo,Temperature,Solar Zenith Angle,Wind Direction,GHI
2010-01-01 00:00:00,2010,1,1,0,0,4,2,0,2.8,0.125,4,101.59,353.1,0
2010-01-01 00:30:00,2010,1,1,0,30,4,2,0,2.7,0.125,3,107.4,353.1,0
2010-01-01 01:00:00,2010,1,1,1,0,4,2,0,2.7,0.125,3,113.33,350.5,0
2010-01-01 01:30:00,2010,1,1,1,30,4,2,0,2.7,0.125,2,119.35,350.5,0
2010-01-01 02:00:00,2010,1,1,2,0,7,1,0,2.8,0.125,2,125.43,346.1,0


In [27]:
# Study sites, one (name, latitude, longitude) triple per site:
#   Arkansas - used to train
#   Georgia  - used to test in the same region
#   Arizona  - used to test in a different region
# The triples are transposed into three parallel lists.
names, lats, lons = (
    list(column)
    for column in zip(
        ('Arkansas', 35.234452, -91.059679),
        ('Georgia', 33.20215, -83.514241),
        ('Arizona', 33.32044685, -110.8204584),
    )
)
# Every year from 1998 through 2017 inclusive.
years = [*range(1998, 2018)]

In [40]:
import time
# Switch into the raw-data directory so the relative CSV names below are written there.
# NOTE(review): %cd changes the kernel's working directory for every later cell,
# and the absolute path is specific to one machine.
%cd /home/dyllanjr/Solar_Irradiance_Prediction/data/raw
#This will be 6% of my hourly api quota:
# One request per (site, year) pair: 3 sites x 20 years = 60 downloads.
# NOTE(review): the loop rebinds the notebook-level names lat, lon, year and df;
# after it finishes, year is the int 2017 rather than the string set earlier.
for lat, lon, name in zip(lats, lons, names):
    for year in years:
        df = get_data(lat=lat,lon=lon, year=year)
        # One file per site and year, named <site>_<year>.csv.
        df.to_csv(f"{name}_{year}.csv")
        #2 Second delay between requests required
        time.sleep(2)

/home/dyllanjr/Solar_Irradiance_Prediction/data/raw


In [42]:
### Loading raw data into HDFS

# Make sure your HDFS and YARN JVMs are started:
#cd path_to_hadoop/sbin
#./start-dfs.sh
#./start-yarn.sh
import os

# Create the target directory in HDFS.
# NOTE(review): this call's exit status is discarded, so a failure here is silent.
os.system('hadoop fs -mkdir /solar_data')
# Copy the local raw directory into /solar_data; the exit status of this last
# call is what the cell displays (0 in the saved output).
os.system('hadoop fs -copyFromLocal /home/dyllanjr/Solar_Irradiance_Prediction/data/raw /solar_data')

#checked http://localhost:50070/explorer.html#/solar_data/raw, all good!

0