# Alternative Energy Data Preparation

In this notebook we read in the ten renewable energy stocks, process them and combine them into a unifed h5 file to use for latter analysis

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from pathlib import Path
from collections import OrderedDict
import os

In [3]:
DATA_STORE = Path('Data/H5/assets.h5')

In [4]:
Stock_dict = {1: 'Enphase_Energy', 2: 'Solaredge_Technologies'}
Ticker_dict ={1: 'ENPH', 2: 'SEDG'}

In [5]:
def read_file(csv, stock):
    stock = pd.read_csv('Data/Raw/' + csv + '.csv', thousands=',', decimal='.')
    return stock

### Read in all the files and create a columns for their ticker

We read in all the files form the `Raw` folder in the `data` folder and create a ticker for each symbol

In [6]:
def read_file(csv):
    data = pd.read_csv('Data/Raw/' + csv + '.csv', thousands=',', decimal='.')
    return data

In [7]:
Enphase_Energy = read_file('ENPH')
Solaredge_Technologies = read_file('SEDG')
Sunrun = read_file('RUN')
Canadian_Solar = read_file('CSIQ')
First_Solar = read_file('FSLR')

Oersted = read_file('DOGEF')
Siemens = read_file('SIEGY')
Vestas_Wind = read_file('VWSYF')
Broadwind = read_file('BWEN')
NextEra_Energy = read_file('NEE')

Bloom = read_file('BE')

In [8]:
Enphase_Energy.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-01-07,5.12,5.39,4.9,5.26,5.26,1458700
1,2019-01-08,5.33,5.48,5.21,5.35,5.35,1222600
2,2019-01-09,5.41,5.77,5.41,5.63,5.63,1326300
3,2019-01-10,5.54,5.77,5.44,5.63,5.63,1056100
4,2019-01-11,5.6,5.7,5.57,5.68,5.68,812100


In [9]:
def designate_ticker(stock, ticker): 
    stock['Ticker'] = ticker
    return stock

In [10]:
Enphase_Energy = designate_ticker(Enphase_Energy, 'ENPH')
Solaredge_Technologies = designate_ticker(Solaredge_Technologies, 'SEDG')
Sunrun = designate_ticker(Sunrun, 'RUN')
Canadian_Solar = designate_ticker(Canadian_Solar, 'CSIQ')
First_Solar = designate_ticker(First_Solar, 'FSLR')

Oersted = designate_ticker(Oersted, 'DOGEF')
Siemens = designate_ticker(Siemens, 'SIEGY')
Vestas_Wind = designate_ticker(Vestas_Wind, 'VWSYF')
Broadwind = designate_ticker(Broadwind, 'BWEN')
NextEra_Energy = designate_ticker(NextEra_Energy, 'NEE')

Bloom = designate_ticker(Bloom, 'BE')

In [11]:
Enphase_Energy.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker
0,2019-01-07,5.12,5.39,4.9,5.26,5.26,1458700,ENPH
1,2019-01-08,5.33,5.48,5.21,5.35,5.35,1222600,ENPH
2,2019-01-09,5.41,5.77,5.41,5.63,5.63,1326300,ENPH
3,2019-01-10,5.54,5.77,5.44,5.63,5.63,1056100,ENPH
4,2019-01-11,5.6,5.7,5.57,5.68,5.68,812100,ENPH


### Scripts to format and concatenate files correctly. All the data was collected form Yahoo Finance.

In [12]:
def format_files(stock):
    
    del stock['Close']
    stock['Date'] = pd.to_datetime(stock['Date'])
    stock = stock.rename(columns = {" Adj Close": "Close"})
    
    return stock

In [13]:
Enphase_Energy = format_files(Enphase_Energy)
Solaredge_Technologies = format_files(Solaredge_Technologies)
Sunrun = format_files(Sunrun)
Canadian_Solar = format_files(Canadian_Solar)
First_Solar = format_files(First_Solar)

Oersted = format_files(Oersted)
Siemens = format_files(Siemens)
Vestas_Wind = format_files(Vestas_Wind)
Broadwind = format_files(Broadwind)
NextEra_Energy = format_files(NextEra_Energy)

Bloom = format_files(Bloom)

This script is meant for data that may come from another source rather than Yahoo. Some data I worked with used 'm' or 'b' as a short hand for million or billion - this simply converts them. Additoanlly, it removes any dollar signs that are sometimes presnt in stcok data.

In [14]:
def value_to_float(x):
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0
    if 'B' in x:
        return float(x.replace('B', '')) * 1000000000
    if '$' in x:
        return float(x.replace('$', ''))
    return 0.0

### Save the adjusted files to the `Adjusted` data folder

In [15]:
Enphase_Energy.to_csv('Data/Adjusted/ENPH.csv', index = False, header=True)
Solaredge_Technologies.to_csv('Data/Adjusted/SEDG.csv', index = False, header=True)
Sunrun.to_csv('Data/Adjusted/RUN.csv', index = False, header=True)
Canadian_Solar.to_csv('Data/Adjusted/CSIQ.csv', index = False, header=True)
First_Solar.to_csv('Data/Adjusted/FSLR.csv', index = False, header=True)

Oersted.to_csv('Data/Adjusted/DOGEF.csv', index = False, header=True)
Siemens.to_csv('Data/Adjusted/SIEGY.csv', index = False, header=True)
Vestas_Wind.to_csv('Data/Adjusted/VWSYF.csv', index = False, header=True)
Broadwind.to_csv('Data/Adjusted/BWEN.csv', index = False, header=True)
NextEra_Energy.to_csv('Data/Adjusted/NEE.csv', index = False, header=True)

Bloom.to_csv('Data/Adjusted/BE.csv', index = False, header=True)

### Concatenate

The enxt step is to Concatenate all the tickers into a unifed `h5` file for later use. This will allow for more simplicty when we use the data in futrue notebooks 

In [16]:
frames = [Enphase_Energy, Solaredge_Technologies, Sunrun, Canadian_Solar, First_Solar, Oersted, Siemens, Vestas_Wind,
          Broadwind, NextEra_Energy, Bloom]

Alt_Energy_prices = pd.concat(frames)
Alt_Energy_prices.head()

Unnamed: 0,Date,Open,High,Low,Adj Close,Volume,Ticker
0,2019-01-07,5.12,5.39,4.9,5.26,1458700,ENPH
1,2019-01-08,5.33,5.48,5.21,5.35,1222600,ENPH
2,2019-01-09,5.41,5.77,5.41,5.63,1326300,ENPH
3,2019-01-10,5.54,5.77,5.44,5.63,1056100,ENPH
4,2019-01-11,5.6,5.7,5.57,5.68,812100,ENPH


In [17]:
Alt_Energy_prices.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5544 entries, 0 to 503
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       5544 non-null   datetime64[ns]
 1   Open       5544 non-null   float64       
 2   High       5544 non-null   float64       
 3   Low        5544 non-null   float64       
 4   Adj Close  5544 non-null   float64       
 5   Volume     5544 non-null   int64         
 6   Ticker     5544 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 346.5+ KB


### Save dataset as `csv` and `h5`

In [18]:
Alt_Energy_prices.to_csv('Data/Adjusted/Alt_Energy_prices.csv', index = False, header=True)

In [19]:
df = (pd.read_csv('Data/Adjusted/Alt_Energy_prices.csv',
                 parse_dates=['Date'],
                 index_col=['Date', 'Ticker'],
                 infer_datetime_format=True)
     .sort_index())

In [20]:
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Adj Close,Volume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-07,BE,11.86,13.24,11.86,12.41,391600
2019-01-07,BWEN,1.45,1.63,1.45,1.61,27100
2019-01-07,CSIQ,16.059999,17.4,15.96,16.799999,855600
2019-01-07,DOGEF,67.949997,67.949997,67.949997,58.250946,200
2019-01-07,ENPH,5.12,5.39,4.9,5.26,1458700
2019-01-07,FSLR,46.459999,47.740002,45.66,45.790001,1784000
2019-01-07,NEE,42.912498,43.235001,42.625,40.522408,9478000
2019-01-07,RUN,10.37,10.8,10.15,10.56,1209500
2019-01-07,SEDG,37.200001,37.200001,34.919998,35.009998,1147600
2019-01-07,SIEGY,55.700001,56.5,55.369999,47.956219,303200


In [21]:
with pd.HDFStore(DATA_STORE) as store:
    store.put('Alt_Energy/prices', df)