In [154]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score

## Importing the data

In [155]:
dataset = pd.read_csv("ElectionData.csv")

dataset = dataset.drop("TimeElapsed", axis=1)
dataset = dataset.drop("time", axis=1)


X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [156]:
print(X)

[['Território Nacional' 0 226 ... 40.22 147993 94]
 ['Território Nacional' 0 226 ... 34.95 128624 81]
 ['Território Nacional' 0 226 ... 7.15 26307 16]
 ...
 ['Viseu' 8 0 ... 0.15 256 0]
 ['Viseu' 8 0 ... 0.14 239 0]
 ['Viseu' 8 0 ... 0.07 118 0]]


In [157]:
print(y)

[106  77  19 ...   0   0   0]


In [158]:
# We need to transform the y to be in a 2D array format. That is
# because the StandardScaler object that will be used to apply
# feature scaling expects a 2D array format in order to work

y = y.reshape(len(y), 1)
# first arg in the reshape is the number of rows that the new array
# will have, and the second is the number of columns

In [159]:
print(y)

[[106]
 [ 77]
 [ 19]
 ...
 [  0]
 [  0]
 [  0]]


## Encoding categorical data

In [160]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0,19])], remainder='passthrough')

# Fit and do the transformation on X. Since the fit_transform returns the transformed matrix, we need
# to save the output to X as a numpy array because this will be expected from the machine learning algo
X = np.array(ct.fit_transform(X))

In [161]:
print(X)

[[0.0 0.0 0.0 ... 40.22 147993 94]
 [0.0 0.0 0.0 ... 34.95 128624 81]
 [0.0 0.0 0.0 ... 7.15 26307 16]
 ...
 [0.0 0.0 0.0 ... 0.15 256 0]
 [0.0 0.0 0.0 ... 0.14 239 0]
 [0.0 0.0 0.0 ... 0.07 118 0]]


## Splitting the data (before feature scaling)

In [162]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Feature scaling

In [163]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train[:, 42:] = sc_X.fit_transform(X_train[:, 42:])
X_test[:, 42:] = sc_X.transform(X_test[:, 42:])

y_train = sc_y.fit_transform(y_train)
y_test = sc_y.transform(y_test)

In [164]:
print(X_train)

[[0.0 0.0 0.0 ... 0.9039825076225502 0.3262527007245665
  0.2904587312431689]
 [0.0 0.0 0.0 ... -0.4265578781237316 -0.1665358248725105
  -0.1799224939932431]
 [0.0 0.0 0.0 ... -0.45769234000630116 -0.16721789260791958
  -0.1799224939932431]
 ...
 [0.0 0.0 0.0 ... -0.17290358572750308 0.044190625952928504
  -0.023128752247772427]
 [0.0 1.0 0.0 ... -0.4668495346776452 -0.17139691079629912
  -0.1799224939932431]
 [0.0 1.0 0.0 ... -0.4210635613209252 -0.17162426670810216
  -0.1799224939932431]]


In [165]:
print(y_train)

[[ 0.27]
 [-0.17]
 [-0.17]
 ...
 [-0.02]
 [-0.17]
 [-0.17]]


## Train the SVR model

In [166]:
# The kernel can be chosen for linear or non linear data accordingly
regressor = SVR(kernel = 'rbf') # rfb = radial basis function (non linear function)
regressor.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

## Predicting test set results

In [169]:
y_pred = sc_y.inverse_transform(regressor.predict(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[-0.19 -0.17]
 [ 0.69 -0.17]
 [-0.19 -0.17]
 ...
 [-0.1  -0.17]
 [-0.68 -0.17]
 [-0.41 -0.17]]


## Evaluate the SVR performance

In [172]:
print(r2_score(sc_y.inverse_transform(y_test), y_pred))

0.9837981045702319
