# Load Forecast
## SVM Regression


In [1]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score

# Load the hourly load dataset from the CSV file in the working directory
DATA_FILE = "rsfenergymodel2011.csv"
load_data = pd.read_csv(DATA_FILE)
print("data loaded")

data loaded


### Implementation: Data Exploration



In [2]:
# Total number of rows in the dataset
n_total_data = load_data.shape[0]

# Number of model inputs (spreadsheet columns B, C, E, F, G, H).
# Hard-coded for now; to be revised once the feature list is built.
n_features = 6

# Row counts for January, February and March
month_column = load_data['Month']
n_Jan = int((month_column == 1).sum())
n_Feb = int((month_column == 2).sum())
n_Mar = int((month_column == 3).sum())

# Size of the winter quarter
n_Winter = n_Jan + n_Feb + n_Mar

# Report the counts, one per line
summary_lines = [
    "Total data: {}".format(n_total_data),
    "Number of features: {}".format(n_features),
    "n_Jan: {}".format(n_Jan),
    "n_Feb: {}".format(n_Feb),
    "n_Mar: {}".format(n_Mar),
    "n_Winter: {}".format(n_Winter),
]
print("\n".join(summary_lines))


Total data: 8769
Number of features: 6
n_Jan: 744
n_Feb: 672
n_Mar: 744
n_Winter: 2160


## Preparing the Data

### Identify feature and target columns


In [3]:
# Pick the model inputs by column position: columns 1-2 and 4-7 of the CSV
# (column 3 is skipped). Columns 8-16 hold the end-use energy targets.
column_names = load_data.columns
feature_cols_1 = column_names[1:3].tolist()
feature_cols_2 = column_names[4:8].tolist()
feature_cols = feature_cols_1 + feature_cols_2
target_cols = column_names[8:17].tolist()

# Show which columns were selected
print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column: {}".format(target_cols))

X_all = load_data[feature_cols]
Y_cols = load_data[target_cols]

# The forecast target is the row-wise total over all selected target columns
Y_all = Y_cols.sum(axis=1)

print("\nFeature values:")
print(X_all.head())
print("\ntarget values:")
print(Y_all.head())

Feature columns:
['Weekday', 'Month', 'Day', 'Hour', 'Outside Wet-Bulb Temp (F)', 'Outside Dry-Bulb Temp (F)']

Target column: ['Total Space Cooling - Kwh', 'Heating - kWh', 'Lighting End-Use Energy - Kwh', 'Task Lighting End-Use Energy - Kwh', 'Fans End-Use Energy - Kwh', 'Miscellaneous Equipment End-Use Energy - Kwh', 'Data Center IT End-Use Energy - Kwh', 'Data Center HVAC End-Use Energy - Kwh', 'Pumps End-Use Energy - Kwh']

Feature values:
   Weekday  Month  Day  Hour  Outside Wet-Bulb Temp (F)  \
0  Tuesday    1.0  1.0   1.0                       16.0   
1  Tuesday    1.0  1.0   2.0                       14.0   
2  Tuesday    1.0  1.0   3.0                       13.0   
3  Tuesday    1.0  1.0   4.0                       11.0   
4  Tuesday    1.0  1.0   5.0                       12.0   

   Outside Dry-Bulb Temp (F)  
0                       18.0  
1                       16.0  
2                       15.0  
3                       13.0  
4                       14.0  

target values:
0    278.564750
1    285.625589
2    294.650750
3    299.552589
4    307.048750
dtype: float64

In [4]:
# Select the winter quarter (January-March) by month value instead of taking
# the first 2160 rows, so the subset does not depend on row order or on a
# literal row count staying in sync with the data.
WINTER_MONTHS = [1, 2, 3]
winter_mask = load_data['Month'].isin(WINTER_MONTHS)

X_winter = X_all[winter_mask]
Y_winter = Y_all[winter_mask]

print("\nFeature values:")
print(X_winter.head())
print("\ntarget values:")
print(Y_winter.head())


Feature values:
   Weekday  Month  Day  Hour  Outside Wet-Bulb Temp (F)  \
0  Tuesday    1.0  1.0   1.0                       16.0   
1  Tuesday    1.0  1.0   2.0                       14.0   
2  Tuesday    1.0  1.0   3.0                       13.0   
3  Tuesday    1.0  1.0   4.0                       11.0   
4  Tuesday    1.0  1.0   5.0                       12.0   

   Outside Dry-Bulb Temp (F)  
0                       18.0  
1                       16.0  
2                       15.0  
3                       13.0  
4                       14.0  

target values:
0    278.564750
1    285.625589
2    294.650750
3    299.552589
4    307.048750
dtype: float64


### Preprocess Feature Columns


In [5]:
def preprocess_features(X):
    """Return a copy of ``X`` with weekday names replaced by numeric codes.

    In every column whose dtype is ``object``, the day names Monday..Sunday
    are replaced by the codes in ``weekday_codes``. Other values in those
    columns, and all non-object columns, pass through unchanged. The input
    frame is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and column order as ``X``.
    """
    # NOTE(review): Saturday and Sunday are coded 60 and 70 rather than 6 and 7.
    # The values are kept as found; confirm whether this weekend offset is
    # intentional or a typo.
    weekday_codes = {
        'Monday': 1,
        'Tuesday': 2,
        'Wednesday': 3,
        'Thursday': 4,
        'Friday': 5,
        'Saturday': 60,
        'Sunday': 70,
    }

    # Work on a copy so the caller's frame (possibly a slice) is left untouched
    output = X.copy()

    # Iterate over column labels: DataFrame.iteritems() was removed in
    # pandas 2.0, and plain label access works on every pandas version.
    for col in X.columns:
        col_data = X[col]
        if col_data.dtype == object:
            output[col] = col_data.replace(weekday_codes)

    return output

# Encode the weekday names in the winter feature subset
X_winter = preprocess_features(X_winter)

# Report the columns of the processed frame itself rather than those of X_all,
# so the message always describes what was actually produced.
print("Processed feature columns ({} total features):\n{}".format(
    len(X_winter.columns), list(X_winter.columns)))
print(X_winter.head())
print(Y_winter.head())

Processed feature columns (6 total features):
['Weekday', 'Month', 'Day', 'Hour', 'Outside Wet-Bulb Temp (F)', 'Outside Dry-Bulb Temp (F)']
   Weekday  Month  Day  Hour  Outside Wet-Bulb Temp (F)  \
0        2    1.0  1.0   1.0                       16.0   
1        2    1.0  1.0   2.0                       14.0   
2        2    1.0  1.0   3.0                       13.0   
3        2    1.0  1.0   4.0                       11.0   
4        2    1.0  1.0   5.0                       12.0   

   Outside Dry-Bulb Temp (F)  
0                       18.0  
1                       16.0  
2                       15.0  
3                       13.0  
4                       14.0  
0    278.564750
1    285.625589
2    294.650750
3    299.552589
4    307.048750
dtype: float64


### Implementation: Training and Testing Data Split

- Randomly shuffle and split the data (`X_winter`, `Y_winter`) into training and testing subsets.
  - 75% data for training
  - 25% data for testing

In [6]:
# Load the splitter. `sklearn.cross_validation` was removed in scikit-learn
# 0.20, so prefer `sklearn.model_selection` and fall back on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Use 75% of the winter samples for training (1620 of 2160 rows)
TRAIN_FRACTION = 0.75
num_train = int(TRAIN_FRACTION * X_winter.shape[0])

# Use the remaining 25% of the winter samples for testing.
# This is sized from X_winter, the frame actually being split, not X_all.
num_test = X_winter.shape[0] - num_train

# Shuffle and split the winter data into the training and testing sizes above;
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_winter, Y_winter,
    train_size=num_train, test_size=num_test, random_state=42)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
print(X_train.head())
print(y_train.head())


Training set has 1620 samples.
Testing set has 540 samples.
      Weekday  Month   Day  Hour  Outside Wet-Bulb Temp (F)  \
91          5    1.0   4.0  20.0                       34.0   
1677        1    3.0  11.0  22.0                       35.0   
1628       60    3.0   9.0  21.0                       32.0   
873         3    2.0   6.0  10.0                       33.0   
2146       70    3.0  31.0  11.0                       33.0   

      Outside Dry-Bulb Temp (F)  
91                         47.0  
1677                       40.0  
1628                       39.0  
873                        44.0  
2146                       45.0  
91      177.822789
1677    193.478089
1628    220.374750
873     495.787570
2146    199.352730
dtype: float64


## Training and Evaluating Models

- Support Vector Regression (SVR)


In [None]:
from sklearn.svm import SVR
import matplotlib.pyplot as plt

# Support vector regressor with an RBF kernel; C and gamma are fixed by hand
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)

# Fit on the training split, then predict the held-out test split
svr_rbf.fit(X_train, y_train)
y_rbf = svr_rbf.predict(X_test)

# Look at the results.
# The x positions are sized from the test set instead of a literal 540, so the
# plot keeps working if the split sizes change.
a = np.arange(len(y_test))
lw = 2
print(len(a))
print(len(y_rbf))

# Actual test targets as points, model predictions as a line, both plotted
# against the sample's position in the (shuffled) test set.
plt.scatter(a, y_test, color='darkorange', label='data')
plt.plot(a, y_rbf, color='c', lw=lw, label='RBF model')

plt.xlabel('data')
plt.ylabel('target')
plt.title('Support Vector Regression')
plt.legend()
plt.show()

540
540
