# Notebook summary

This notebook contains four sections:

1. **Preprocessing:** removing data-dependent attributes, as we already know that they are not strongly correlated with the response
2. **Tuning max_depth for regression trees:** tuning this hyperparameter using all of the principal components
3. **Tuning number of principal components:** trying to see if we can reduce the dimensionality of the problem
4. **Checking performance of final model:** comparing final model obtained against the one built before

In [2]:
from pyspark.ml.feature import PCA
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

In [3]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
import numpy as np
# Source table, looked up by name through the active Spark session.
wind = spark.table("wind")

# Year-based partition: train (<= 2006), validation (2007-2008), test (>= 2009).
wind_train = wind.filter(wind["year"] <= 2006)
wind_validation = wind.filter((wind["year"] == 2007) | (wind["year"] == 2008))
wind_test = wind.filter(wind["year"] >= 2009)
# Train and validation years together (everything before 2009).
wind_train_validation = wind.filter(wind["year"] < 2009)


#Function to preprocess the data
def preprocessing(data):
  """Assemble the selected predictor columns into a single ``features`` vector.

  A column of ``data`` is selected when its name contains one of the
  substrings in ``notignore``. Columns are collected pattern by pattern, in
  the order of ``notignore``, and within each pattern in the order of
  ``data.columns``.

  NOTE(review): matching is by substring, so a column whose name contains
  several patterns (for example "u10" is itself a substring of "u100") is
  appended once per matching pattern and therefore appears more than once in
  the assembled vector. This is kept as-is because the PCA size used later
  in the notebook (k=225) may depend on the resulting feature count -
  confirm before de-duplicating.

  Args:
    data: DataFrame exposing ``columns`` and containing an ``energy`` column.

  Returns:
    A DataFrame with exactly two columns: ``energy`` and ``features``.
  """
  notignore = ["u100", "v100", "u10", "v10", "ienss", "iews"]
  # Pattern-major order, duplicates included (see the note above).
  col_selected = [
    column
    for pattern in notignore
    for column in data.columns
    if pattern in column
  ]
  assembler = VectorAssembler(inputCols=col_selected, outputCol="features")
  return assembler.transform(data).select(['energy', 'features'])

# Run every partition through preprocessing(), rebinding the same four names.
wind_train, wind_validation, wind_test, wind_train_validation = (
  preprocessing(partition)
  for partition in (wind_train, wind_validation, wind_test, wind_train_validation)
)

# These two frames are used again when the final models are refit and tested.
wind_train_validation.cache()
wind_test.cache()

# Principal components: 225 of them, fitted on the training partition only.
pca = PCA(k=225, inputCol="features", outputCol="pcaFeatures")
model_pca = pca.fit(wind_train)

# Project train and validation with the PCA model fitted on train.
wind_train = model_pca.transform(wind_train)
wind_validation = model_pca.transform(wind_validation)

# The tuning loops below reuse these projections on every iteration.
wind_train.cache()
wind_validation.cache()

In [5]:
##Only the max depth of the decision tree regressor is tuned here, so every iteration uses the same
##principal components - that is why the PCA-transformed partitions were cached beforehand.

# The metric is identical on every iteration, so the evaluator is built once.
evaluator = RegressionEvaluator(labelCol="energy", predictionCol="prediction", metricName="mae")

# Maps each maxDepth tried to the MAE obtained on the validation partition.
d={}
for i in range(1,10):
  dt = DecisionTreeRegressor(featuresCol="pcaFeatures",labelCol="energy",maxDepth=i)
  pipeline_dt = Pipeline(stages=[dt])
  model_dt = pipeline_dt.fit(wind_train)

  predictions = model_dt.transform(wind_validation)

  # Predictions come from wind_validation, so this is a validation MAE.
  mae_validation = evaluator.evaluate(predictions)
  print("MAE with all principal components and maxdepth={}: {}".format(i,mae_validation))
  d[i]=mae_validation

# Turn the dict views into lists before plotting: not every matplotlib
# version accepts dict_keys/dict_values as plot data.
plt.plot(list(d.keys()),list(d.values()))
plt.xlabel("maxDepth")
plt.ylabel("Validation MAE")
plt.show()

In [6]:
#From this result we can see that the best max depth is equal to 5
#Let's check it with the test set

In [7]:
# 2 - Evaluating the model after setting best max_depth = 5
##Fitting PCA on train+validation

pca = PCA(k=225,inputCol="features", outputCol="pcaFeatures")
model_pca_train_validation = pca.fit(wind_train_validation)

#Transforming both partitions with the PCA model fitted on train+validation
wind_train_validation_pca = model_pca_train_validation.transform(wind_train_validation)
wind_test_pca = model_pca_train_validation.transform(wind_test)

#Fitting a decision tree on the PCA-transformed train+validation data
dt = DecisionTreeRegressor(featuresCol="pcaFeatures",labelCol="energy",maxDepth=5)
pipeline_dt = Pipeline(stages=[dt])
model_dt = pipeline_dt.fit(wind_train_validation_pca)

#Making predictions over the test partition
predictions = model_dt.transform(wind_test_pca)

#Evaluating the model with MAE; the predictions come from wind_test, so this is a test MAE
evaluator = RegressionEvaluator(labelCol="energy", predictionCol="prediction", metricName="mae")
mae_test = evaluator.evaluate(predictions)

#Report the depth configured on the estimator instead of repeating the literal
print("MAE with all principal components and maxdepth={}: {}".format(dt.getMaxDepth(),mae_test))

In [8]:
#Section 3: 1D search for the number of principal components, from 10 to 190 in steps of 10

# The metric is identical on every iteration, so the evaluator is built once.
evaluator = RegressionEvaluator(labelCol="energy", predictionCol="prediction", metricName="mae")

for i in range(10, 200, 10):
  # Keep the first i principal components, i.e. vector indices 0..i-1.
  # VectorSlicer indices are 0-based, so starting at 1 would drop the first
  # component and keep only i-1 of them, contradicting the printed count.
  pca_train = VectorSlicer(inputCol="pcaFeatures", outputCol="selectedpcafeatures",
                           indices=list(range(i)))

  dt_train = DecisionTreeRegressor(featuresCol=pca_train.getOutputCol(),
                                   labelCol="energy", maxDepth=5)

  pipeline_train = Pipeline(stages=[pca_train, dt_train])

  # Fit on train, score on validation.
  model_train = pipeline_train.fit(wind_train)
  predictions_validation = model_train.transform(wind_validation)

  mae_validation = evaluator.evaluate(predictions_validation)

  print("MAE with {} pca components: {}".format(i,mae_validation))
  

In [9]:
#From the results of the previous part we can assume that using more than 20 principal components does not improve the model's performance; in fact, it worsens the MAE.
#Let's check the range from 10 to 29 to see whether we can reduce the number of predictors or not

In [10]:
#Section 3: 1D search for the number of principal components, from 10 to 29 in steps of 1

# The metric is identical on every iteration, so the evaluator is built once.
evaluator = RegressionEvaluator(labelCol="energy", predictionCol="prediction", metricName="mae")

for i in range(10, 30):
  # Keep the first i principal components, i.e. vector indices 0..i-1.
  # VectorSlicer indices are 0-based, so starting at 1 would drop the first
  # component and keep only i-1 of them, contradicting the printed count.
  pca_train = VectorSlicer(inputCol="pcaFeatures", outputCol="selectedpcafeatures",
                           indices=list(range(i)))

  dt_train = DecisionTreeRegressor(featuresCol=pca_train.getOutputCol(),
                                   labelCol="energy", maxDepth=5)

  pipeline_train = Pipeline(stages=[pca_train, dt_train])

  # Fit on train, score on validation.
  model_train = pipeline_train.fit(wind_train)
  predictions_validation = model_train.transform(wind_validation)

  mae_validation = evaluator.evaluate(predictions_validation)

  print("MAE with {} pca components: {}".format(i,mae_validation))

In [11]:
#From the results above, we can see that the best number of principal components is equal to 19

In [12]:
# Section 4: final model with the selected number of principal components, evaluated on the test partition
pca_train_validation = PCA(k=19, inputCol="features")
# No outputCol is set on the PCA stage, so the tree reads whatever output column the stage reports.
dt_train_validation = DecisionTreeRegressor(featuresCol=pca_train_validation.getOutputCol(),
                                            labelCol="energy", maxDepth=5)

pipeline_train_validation = Pipeline(stages=[pca_train_validation, dt_train_validation])

# Fit PCA + tree on train+validation, then predict on the held-out test partition.
model_train_validation = pipeline_train_validation.fit(wind_train_validation)
predictions_train_validation = model_train_validation.transform(wind_test)

# The predictions come from wind_test, so this is a test MAE.
evaluator = RegressionEvaluator(labelCol="energy", predictionCol="prediction", metricName="mae")
mae_test = evaluator.evaluate(predictions_train_validation)

# This model uses k components, not all of them; report the configured k.
print("MAE with {} principal components: {}".format(pca_train_validation.getK(), mae_test))

In [13]:
#From this result, we can observe that a model using only the first 19 principal components has almost the same MAE as the one using all of them, so the dimensionality of the problem can be reduced considerably, improving the time performance of the model