In [115]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

## Importing the data

In [116]:
# Load the raw election results and discard the two time-related columns
# in one chained step.
dataset = pd.read_csv("ElectionData.csv").drop(columns=["TimeElapsed", "time"])

# Every column except the last is a feature; the last column is the target.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [117]:
# Inspect the raw feature matrix (NumPy abbreviates large arrays with "...").
print(X)

[['Território Nacional' 0 226 ... 40.22 147993 94]
 ['Território Nacional' 0 226 ... 34.95 128624 81]
 ['Território Nacional' 0 226 ... 7.15 26307 16]
 ...
 ['Viseu' 8 0 ... 0.15 256 0]
 ['Viseu' 8 0 ... 0.14 239 0]
 ['Viseu' 8 0 ... 0.07 118 0]]


In [121]:
# Make the target a column vector of shape (n_samples, 1). The saved outputs
# of this notebook (2-D `y`, 2-D `coef_`, bracketed predictions) were produced
# with this reshape applied, so it has to run for a Restart & Run All to
# reproduce them. reshape(-1, 1) is idempotent, so re-running the cell is safe.
y = y.reshape(-1, 1)
print(y)

[[106]
 [ 77]
 [ 19]
 ...
 [  0]
 [  0]
 [  0]]


## Missing data 

In [122]:
# describe() summarises numeric columns only by default; a per-column `count`
# lower than the number of rows would indicate missing values in that column.
print(dataset.describe())

       totalMandates  availableMandates   numParishes  numParishesApproved  \
count   21643.000000       21643.000000  21643.000000         21643.000000   
mean       11.544795          11.499284    309.956013           261.090237   
std        31.314567          31.738783    659.055911           583.377428   
min         0.000000           0.000000     54.000000             3.000000   
25%         1.000000           0.000000     75.000000            67.000000   
50%         4.000000           3.000000    147.000000           120.000000   
75%         9.000000           9.000000    242.000000           208.000000   
max       226.000000         226.000000   3092.000000          3092.000000   

          blankVotes  blankVotesPercentage     nullVotes  nullVotesPercentage  \
count   21643.000000          21643.000000  21643.000000         21643.000000   
mean     8875.066673              2.621629   6148.068752             1.961471   
std     21484.874088              0.795289  14735.4692

In [123]:
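# The counts in the summary above all show the same value (21643), so those
# numeric columns have no missing values. Note that describe() skips
# non-numeric columns by default; check those separately, e.g. with
# dataset.isna().sum().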

## Encoding categorical data

In [124]:
# One-hot encode the two categorical columns (positions 0 and 19 of X) and
# pass every other column through unchanged. The encoded columns come first
# in the output, followed by the passthrough columns.
# NOTE(review): column 0 holds the territory name per the printed X; confirm
# that index 19 is the other string-valued column.
#
# sparse_threshold=0 forces a dense result. With the default (0.3) the
# transformer may return a SciPy sparse matrix when the stacked output is
# sparse enough, and np.array() would then wrap it in a 0-d object array
# instead of producing the 2-D matrix the estimator needs.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 19])],
    remainder='passthrough',
    sparse_threshold=0,
)

# fit_transform learns the categories and returns the transformed matrix;
# store it as a NumPy array for the model.
# This cell overwrites X, so it is not safe to re-run on its own: re-run from
# the data-loading cell to get the un-encoded X back first.
X = np.array(ct.fit_transform(X))

In [125]:
print(X)

[[0.0 0.0 0.0 ... 40.22 147993 94]
 [0.0 0.0 0.0 ... 34.95 128624 81]
 [0.0 0.0 0.0 ... 7.15 26307 16]
 ...
 [0.0 0.0 0.0 ... 0.15 256 0]
 [0.0 0.0 0.0 ... 0.14 239 0]
 [0.0 0.0 0.0 ... 0.07 118 0]]


## Splitting the dataset into the Training set and Test set

In [127]:
# Hold out 20% of the rows for testing. Fixing random_state seeds the shuffle,
# so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [128]:
# Training portion of the encoded feature matrix.
print(X_train)

[[0.0 0.0 0.0 ... 1.04 45 0]
 [0.0 0.0 0.0 ... 0.63 801 0]
 [0.0 0.0 0.0 ... 0.82 1605 0]
 ...
 [0.0 0.0 1.0 ... 0.17 94 0]
 [0.0 0.0 0.0 ... 0.5 363 0]
 [0.0 0.0 0.0 ... 11.29 352 0]]


In [129]:
# Held-out test portion of the encoded feature matrix.
print(X_test)

[[0.0 0.0 0.0 ... 0.41 814 0]
 [0.0 0.0 0.0 ... 0.29 64 0]
 [0.0 0.0 1.0 ... 0.28 20 0]
 ...
 [0.0 0.0 0.0 ... 0.88 5910 0]
 [0.0 0.0 0.0 ... 1.69 764 0]
 [0.0 0.0 0.0 ... 0.27 309 0]]


In [130]:
# Target values that go with X_train.
print(y_train)

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [131]:
# Target values that go with X_test; used later to score the predictions.
print(y_test)

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


## Training the multiple regression model

In [132]:
# Fit an ordinary least-squares linear model on the training split.
# LinearRegression is already imported in the notebook's first cell, so it is
# not re-imported here.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting test set results

In [133]:
# Predict the target for the held-out test rows.
y_pred = regressor.predict(X_test)

In [135]:
# Show the learned weights (one per column of the encoded X), the intercept,
# and the coefficient of determination (R^2) on the test split.
# NOTE(review): in the recorded output the leading coefficients (the one-hot
# columns) are on the order of 1e5 while the rest are small; this may point to
# collinearity among the one-hot columns, since no category is dropped. Confirm.
print(regressor.coef_)
print(regressor.intercept_)
print(r2_score(y_test, y_pred))

[[-2.17252244e+05 -1.44172471e+05 -1.73482001e+05 -1.32822906e+05
  -9.56452868e+04 -1.56507509e+05 -1.69575350e+05 -2.14937121e+05
  -8.73977067e+04 -1.98993607e+05 -4.23053208e+05 -2.02972523e+05
  -1.70352987e+05 -3.17091553e+05 -1.76791926e+05 -2.77119604e+05
  -5.77156720e+03 -1.23589535e+05 -1.23037849e+05 -1.00465502e+05
  -1.76574866e+05 -6.38167663e-02  1.12157084e-01 -1.54677659e-01
  -5.73151076e-02 -4.87151498e-02  5.04002589e-02 -1.88616886e-02
   5.39767325e-02 -3.03564775e-02 -2.89285624e-02 -1.80920181e-02
   6.09991937e-02 -6.42420810e-02 -4.56957594e-02 -4.13560410e-02
   2.18991107e-02 -4.09768779e-02  4.80872853e-01 -3.69782241e-02
  -3.64515969e-02 -9.38412287e-02  6.22185022e+03  6.22186702e+03
  -5.15474892e+02 -2.99545201e-04 -9.34721902e-06  5.65477435e-02
   2.61783716e-05  1.16659062e-02 -8.69625433e-04 -1.28441025e-06
   3.53573705e-06  1.76745234e-05 -2.68584094e-02 -5.94792655e-05
  -1.43769375e-02  9.96785674e-04  2.86429713e-06 -5.00047463e-06
   3.21073

In [137]:
# List prediction vs. ground truth for the test rows whose actual value is
# at least 2.
for predicted, actual in zip(y_pred, y_test):
    if actual >= 2:
        print("Predicted: {}/ Actual: {}".format(predicted, actual))

Predicted: [22.97916262]/ Actual: [19]
Predicted: [8.40934396]/ Actual: [8]
Predicted: [1.96667011]/ Actual: [2]
Predicted: [3.84086683]/ Actual: [4]
Predicted: [1.88366896]/ Actual: [2]
Predicted: [2.95458921]/ Actual: [2]
Predicted: [20.84397468]/ Actual: [20]
Predicted: [3.96005401]/ Actual: [4]
Predicted: [2.78584641]/ Actual: [3]
Predicted: [7.54881131]/ Actual: [8]
Predicted: [2.93150574]/ Actual: [3]
Predicted: [1.75588615]/ Actual: [2]
Predicted: [1.0003495]/ Actual: [2]
Predicted: [2.06183739]/ Actual: [2]
Predicted: [1.0003495]/ Actual: [2]
Predicted: [3.23404247]/ Actual: [3]
Predicted: [2.06460196]/ Actual: [2]
Predicted: [2.78809371]/ Actual: [3]
Predicted: [9.20568788]/ Actual: [8]
Predicted: [0.93595343]/ Actual: [2]
Predicted: [1.99610273]/ Actual: [2]
Predicted: [2.92486077]/ Actual: [3]
Predicted: [5.30054413]/ Actual: [4]
Predicted: [2.84682053]/ Actual: [3]
Predicted: [4.57297346]/ Actual: [4]
Predicted: [1.03658972]/ Actual: [2]
Predicted: [3.36069525]/ Actual: [3]