In [38]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the election data and discard the `TimeElapsed` and `time` columns
# in a single step so they never reach the feature matrix.
dataset = pd.read_csv('ElectionData.csv').drop(columns=["TimeElapsed", "time"])

# X: feature matrix (independent variables) = every column except the last.
# y: target vector (dependent variable) = the last column.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [39]:
# Inspect the raw feature matrix; the first column still holds territory
# names as strings, so it cannot be fed to the model yet.
print(X)

[['Território Nacional' 0 226 ... 40.22 147993 94]
 ['Território Nacional' 0 226 ... 34.95 128624 81]
 ['Território Nacional' 0 226 ... 7.15 26307 16]
 ...
 ['Viseu' 8 0 ... 0.15 256 0]
 ['Viseu' 8 0 ... 0.14 239 0]
 ['Viseu' 8 0 ... 0.07 118 0]]


In [40]:
# Inspect the target vector (the last column of the dataset).
print(y)

[106  77  19 ...   0   0   0]


## Encode categorical data

In [41]:
# List every column's position, name and dtype so that the columns needing
# encoding (dtype `object`) can be referenced by index later on.
# enumerate() supplies the position, so no hand-maintained counter is needed.
for index, (col, dtype) in enumerate(dataset.dtypes.items()):
    print("Index {}- Name: {}- type: {}".format(index, col, dtype))

Index 0- Name: territoryName- type: object
Index 1- Name: totalMandates- type: int64
Index 2- Name: availableMandates- type: int64
Index 3- Name: numParishes- type: int64
Index 4- Name: numParishesApproved- type: int64
Index 5- Name: blankVotes- type: int64
Index 6- Name: blankVotesPercentage- type: float64
Index 7- Name: nullVotes- type: int64
Index 8- Name: nullVotesPercentage- type: float64
Index 9- Name: votersPercentage- type: float64
Index 10- Name: subscribedVoters- type: int64
Index 11- Name: totalVoters- type: int64
Index 12- Name: pre.blankVotes- type: int64
Index 13- Name: pre.blankVotesPercentage- type: float64
Index 14- Name: pre.nullVotes- type: int64
Index 15- Name: pre.nullVotesPercentage- type: float64
Index 16- Name: pre.votersPercentage- type: float64
Index 17- Name: pre.subscribedVoters- type: int64
Index 18- Name: pre.totalVoters- type: int64
Index 19- Name: Party- type: object
Index 20- Name: Mandates- type: int64
Index 21- Name: Percentage- type: float64
Index 22

In [42]:
# Columns with index 0 (territoryName) and 19 (Party) are of dtype object
# and need encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# sparse_threshold=0 forces a dense result. With the default (0.3) the
# transformer may hand back a SciPy sparse matrix when the stacked output is
# sparse enough, and np.array() would then wrap that matrix in a 0-d object
# array instead of yielding the 2-D feature matrix the later cells expect.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 19])],
    remainder='passthrough',
    sparse_threshold=0,
)

# fit_transform learns the categories and returns the transformed matrix
# (one-hot columns first, followed by the untouched passthrough columns).
# Store it back in X as a NumPy array, which is what the machine learning
# algorithm expects.
# NOTE: this overwrites X, so the cell must be run exactly once per kernel
# session after X has been built from the dataset.
X = np.array(ct.fit_transform(X))

In [43]:
# Inspect X after encoding: the leading columns are now one-hot indicators
# and the string columns are gone.
print(X)

[[0.0 0.0 0.0 ... 40.22 147993 94]
 [0.0 0.0 0.0 ... 34.95 128624 81]
 [0.0 0.0 0.0 ... 7.15 26307 16]
 ...
 [0.0 0.0 0.0 ... 0.15 256 0]
 [0.0 0.0 0.0 ... 0.14 239 0]
 [0.0 0.0 0.0 ... 0.07 118 0]]


## Splitting Data

In [44]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Random Forest Regression model on the Training set

In [45]:
from sklearn.ensemble import RandomForestRegressor
# Build a 10-tree random forest with a fixed seed and fit it on the training
# split; fit() returns the estimator itself, so the call can be chained.
regressor = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train, y_train)

# Display the fitted estimator and its hyper-parameters.
regressor

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

## Predicting the Test set results

In [46]:
# Predict the target for the held-out test features.
y_pred = regressor.predict(X_test)

## Evaluating the Model Performance

In [47]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the predictions on the test split.
# NOTE(review): a score this close to 1 may point to target leakage, e.g.
# features such as `Mandates` could be near-proxies for the target -- confirm
# before trusting this number.
test_r2 = r2_score(y_test, y_pred)
print(test_r2)

0.9999626864642273
