## Imports

In [2]:
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import StandardScaler

## Prep the Data

First, we read in the datasets.

In [3]:
wind_df = pd.read_csv("../data/wind.csv")
solar_df = pd.read_csv("../data/solar.csv")

print("Previews of the datasets:")
print(wind_df.head(10))
print("_________________________________")
print(solar_df.head(10))

Previews of the datasets:
   id        lat        long  wind_speed farm_type  capacity  capacity_factor  \
0   0  23.510410 -117.147260        6.07  offshore        16            0.169   
1   1  24.007446  -93.946777        7.43  offshore        16            0.302   
2   2  25.069138  -97.482483        8.19  offshore        16            0.375   
3   3  25.069443  -97.463135        8.19  offshore        16            0.375   
4   4  25.069763  -97.443756        8.19  offshore        16            0.376   
5   5  25.070091  -97.424377        8.19  offshore        16            0.375   
6   6  25.070404  -97.404999        8.19  offshore        16            0.375   
7   7  25.086678  -97.482849        8.18  offshore        16            0.375   
8   8  25.087006  -97.463470        8.19  offshore        16            0.376   
9   9  25.087318  -97.444092        8.19  offshore        16            0.376   

   power_generation  estimated_cost  
0          23687.04        20800000  
1     

Now, we must shuffle the datasets to reduce bias.

In [4]:
wind_df = wind_df.sample(frac=1)
solar_df = solar_df.sample(frac=1)

print("Previews of the shuffled datasets:")
print(wind_df.head(10))
print("_________________________________")
print(solar_df.head(10))

Previews of the shuffled datasets:
            id        lat        long  wind_speed farm_type  capacity  \
2538      2538  28.427433  -95.482086        7.43  offshore        16   
74685    74685  42.460751  -97.424561        8.70   onshore        16   
14832    14832  34.077995 -105.496582        8.78   onshore        16   
62697    62697  41.330143  -89.920837        7.48   onshore        16   
126102  126102  48.964993 -112.273132        8.37   onshore        16   
47952    47952  38.612041 -112.771744        6.21   onshore        10   
70149    70149  42.003757  -91.957764        7.48   onshore        14   
37703    37703  36.501045 -113.198120        6.94   onshore        16   
1669      1669  26.541775  -82.296326        5.95  offshore        16   
97832    97832  42.256721  -74.759338        7.23   onshore        10   

        capacity_factor  power_generation  estimated_cost  
2538              0.311          43589.76        20800000  
74685             0.508          71201.28

Looking at each dataset, we can identify which variables we want to use for our models. SVM can only handle 1D predictions, so we have to separate the two desired outputs into their own sets.

In [5]:
# Wind data: energy production
wind_X = wind_df.loc[:, [False, True, True, True, False, True, True, False, False]]
wind_energy_y = wind_df.loc[:, wind_df.columns[-2]]

# Wind data: cost
wind_cost_y = wind_df.loc[:, wind_df.columns[-1]]

# Solar data: energy production
solar_X = solar_df.loc[:, [False, True, True, True, False, True, True, False, False]]
solar_energy_y = solar_df.loc[:, solar_df.columns[-2]]

# Solar data: cost
solar_cost_y = solar_df.loc[:, solar_df.columns[-1]]

Now we split into training and testing sets, making sure each output get its own training and testing set, reserving about 80% for training and 20% for testing.

In [6]:
# Wind data: energy production
wind_X_train = wind_X[:100000]
wind_X_test = wind_X[100000:]
wind_energy_y_train = wind_energy_y[:100000]
wind_energy_y_test = wind_energy_y[100000:]

# Wind data: cost
wind_cost_y_train = wind_cost_y[:100000]
wind_cost_y_test = wind_cost_y[100000:]

# Solar data: energy production
solar_X_train = solar_X[:9500]
solar_X_test = solar_X[9500:]
solar_energy_y_train = solar_energy_y[:9500]
solar_energy_y_test = solar_energy_y[9500:]

# Solar data: cost
solar_cost_y_train = solar_cost_y[:9500]
solar_cost_y_test = solar_cost_y[9500:]

Some models perform better when inputs are within a certain range, like [-1, 1] for example. We scale the data points appropriately.

In [7]:
scaler = StandardScaler()

# Wind data
scaler.fit(wind_X_train)
wind_X_train = scaler.transform(wind_X_train)
wind_X_test = scaler.transform(wind_X_test)

# Solar data
scaler.fit(solar_X_train)
solar_X_train = scaler.transform(solar_X_train)
solar_X_test = scaler.transform(solar_X_test)

## Training the Models

Now that the data is pre-processed accordingly, the models can be trained and fit.

In [None]:
# Wind: energy production
wind_energy_reg = svm.SVR()
wind_energy_reg.fit(wind_X_train, wind_energy_y_train)

In [None]:
# Wind: cost
wind_cost_reg = svm.SVR()
wind_cost_reg.fit(wind_X_train, wind_cost_y_train)

In [None]:
# Solar: energy production
solar_energy_reg = svm.SVR()
solar_energy_reg.fit(solar_X_train, solar_energy_y_train)

In [None]:
#Solar: cost
solar_cost_reg = svm.SVR()
solar_cost_reg.fit(solar_X_train, solar_cost_y_train)

## Testing the Models

With trained models, we can now test them and make predictions.

In [12]:
# Wind: energy production
wind_energy_test = wind_energy_reg.predict(wind_X_test)
print("Predicted outputs for wind energy data:")
print(wind_energy_test)
print()

# Wind: cost
wind_cost_test = wind_cost_reg.predict(wind_X_test)
print("Predicted outputs for wind cost data:")
print(wind_cost_test)
print()

# Solar: energy production
solar_energy_test = solar_energy_reg.predict(wind_X_test)
print("Predicted outputs for solar energy data:")
print(solar_energy_test)
print()

# Solar: cost
solar_cost_test = solar_cost_reg.predict(wind_X_test)
print("Predicted outputs for solar cost data:")
print(solar_cost_test)

Predicted outputs for wind energy data:
[43740.47802981 48651.84476318 55644.30161531 ... 57777.52430673
 42938.48674883 61054.39411137]

Predicted outputs for wind cost data:
[20795471.847709   20797202.07397964 20799992.74356124 ...
 20799999.44791149 20799860.74038213 20799991.1027435 ]

Predicted outputs for solar energy data:
[4364.28499731 4821.89196899 4700.30967839 ... 4806.56397471 4758.2968372
 4974.2796332 ]

Predicted outputs for solar cost data:
[5319985.2501513  5320251.04082778 5320328.99546367 ... 5320366.13373715
 5320292.87901872 5320344.07807934]
