## Question 1

In [1]:
# Import the necessary library
import pandas as pd
import numpy as np

In [2]:
# Show the correlation matrix
corr_data = [[355, 1480], [45, 120]]
corr_matrix = pd.DataFrame(corr_data, index=['Predicted Spam', 'Predicted Not Spam'],columns=['Actual Spam', 'Actual Not Spam'])

corr_matrix

Unnamed: 0,Actual Spam,Actual Not Spam
Predicted Spam,355,1480
Predicted Not Spam,45,120


In [3]:
# F1 Score = 2 * ((Precision * Recall)/(Precision + Recall))
# where Precison = TP/(TP + FP) and Recall = TP/(TP + FN)
# From the correlation matrix:
TP = corr_matrix.iloc[0][0]
FP = corr_matrix.iloc[0][1]
TN = corr_matrix.iloc[1][1]
FN = corr_matrix.iloc[1][0]

# To obtain the precision and recall:
Precision = TP/(TP + FP)
Recall = TP/(TP + FN)

# To obtain the F1 Score:
print('F1 Score = {}'.format(np.round(2 * ((Precision * Recall)/(Precision + Recall)), 4)))

F1 Score = 0.3177


## Question 14

### Classification - Managing the Quality Metric of Global Ecological Footprint


Stability of the Grid System

Electrical grids require a balance between electricity supply and demand in order to be stable. Conventional systems achieve this balance through demand-driven electricity production. For future grids with a high share of inflexible (i.e., renewable) energy source, the concept of demand response is a promising solution. This implies changes in electricity consumption in relation to electricity price changes. In this work, we’ll build a binary classification model to predict if a grid is stable or unstable using the UCI Electrical Grid Stability Simulated dataset.

Dataset: https://archive.ics.uci.edu/ml/datasets/Electrical+Grid+Stability+Simulated+Data+

It has 12 primary predictive features and two dependent variables.

Predictive features:

    'tau1' to 'tau4': the reaction time of each network participant, a real value within the range 0.5 to 10 ('tau1' corresponds to the supplier node, 'tau2' to 'tau4' to the consumer nodes);
    'p1' to 'p4': nominal power produced (positive) or consumed (negative) by each network participant, a real value within the range -2.0 to -0.5 for consumers ('p2' to 'p4'). As the total power consumed equals the total power generated, p1 (supplier node) = - (p2 + p3 + p4);
    'g1' to 'g4': price elasticity coefficient for each network participant, a real value within the range 0.05 to 1.00 ('g1' corresponds to the supplier node, 'g2' to 'g4' to the consumer nodes; 'g' stands for 'gamma');

Dependent variables:

    'stab': the maximum real part of the characteristic differential equation root (if positive, the system is linearly unstable; if negative, linearly stable);
    'stabf': a categorical (binary) label ('stable' or 'unstable').


In [4]:
# Load the data
data = pd.read_csv('Data_for_UCI_named.csv')

# View the first five rows of the data
data.head(5)

# Check for missing values
print(data.isnull().sum())

# Check for dtypes
print(data.dtypes)

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64
tau1     float64
tau2     float64
tau3     float64
tau4     float64
p1       float64
p2       float64
p3       float64
p4       float64
g1       float64
g2       float64
g3       float64
g4       float64
stab     float64
stabf     object
dtype: object


In [5]:
# Delete the stab column
data = data.drop('stab', axis = 1)

# Importing an encoder
from sklearn.preprocessing import LabelEncoder

# Fit the encoder and transform stabf
label = LabelEncoder()
data['stabf'] = label.fit(data['stabf']).transform(data['stabf'])

# Importing a scaler for the data
from sklearn.preprocessing import StandardScaler

# Fit the scaler and transform the data
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

# Obtain x and y
x = scaled_df.drop('stabf', axis = 1)
y = data['stabf']

# Obtain the training (80%) and test data (20%)
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x , y, test_size = 0.2, random_state = 1)

# Importing Random Forest Classifier, setting the random state = 1
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state = 1)

# Fit the model
clf.fit(xtrain, ytrain)

# Predict on the test data
clf_pred = clf.predict(xtest)

# Obtain the accuracy
from sklearn.metrics import accuracy_score
clf_acc = accuracy_score(clf_pred, ytest)
print(clf_acc)

0.929


## Question 15

In [6]:
# Importing XGBoost Classifier, setting the random state = 1
import xgboost as xgb
model = xgb.XGBClassifier(random_state = 1)

# Fit the model
model.fit(xtrain, ytrain)

# Predict on the test data
model_pred = model.predict(xtest)

# Obtain the accuracy
model_acc = accuracy_score(model_pred, ytest)
print(model_acc)

0.9455


## Question 16

In [7]:
# Importing LGBM Classifier, setting random state = 1
import lightgbm as gbm
lgbm = gbm.LGBMClassifier(random_state = 1)

# Fit the model
lgbm.fit(xtrain, ytrain)

# Predict on the test data
lgbm_pred = lgbm.predict(xtest)

# Obtain the accuracy
lgbm_acc = accuracy_score(lgbm_pred, ytest)
print(lgbm_acc)

0.9475


## Question 17

In [8]:
# Importing Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier
extra = ExtraTreesClassifier()

# Hyper-parameters
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

# Parameter grid
param_grid = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split, 'max_features': max_features}

# Importing Random Search CV
from sklearn.model_selection import RandomizedSearchCV

# Random Search for hyper-parameter optimization
rscv = RandomizedSearchCV(extra, param_grid, cv = 5, n_iter = 10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)

# Fit the model
rscv.fit(xtrain, ytrain)

# Obtain the best estimators
print(rscv.best_estimator_)
print(rscv.best_estimator_.min_samples_split)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   45.2s finished


ExtraTreesClassifier(max_features=None, min_samples_leaf=8, n_estimators=1000)
2


## Question 18

In [18]:
# Taking the new optimal model
extclass = ExtraTreesClassifier(max_features = None, min_samples_leaf = 8, n_estimators = 1000, min_samples_split = 2, random_state = 1)

# Fit the model
extclass.fit(xtrain, ytrain)

# Predict on the test data
extclass_pred = extclass.predict(xtest)

# Obtain the accuracy
extclass_acc = accuracy_score(extclass_pred, ytest)
print(extclass_acc)

# Taking the initial Extra Trees Classifier with no hyper-parameter tuning
# Fit the model
extra.fit(xtrain, ytrain)

# Predict on the test data
extra_pred = extra.predict(xtest)

# Obtain the accuracy
extra_acc = accuracy_score(extra_pred, ytest)
print(extra_acc)

# extclass_pred < extra_pred
# The accuracy of the new optimal model is lower than the initial Extra Trees Classifier model with no hyper-parameter tuning

0.927
0.933


## Question 20

In [10]:
# Function to obtain the important features
def get_weights_df(model, feat, col_name):
  weights = pd.Series(model.feature_importances_, feat.columns).sort_values()
  weights_df = pd.DataFrame(weights).reset_index()
  weights_df.columns = ['Features', col_name]
  weights_df[col_name].round(3)
  return weights_df

extra.fit(xtrain, ytrain)

model_weights = get_weights_df(extra, xtrain, 'Feature_Importance')
model_weights.nsmallest

<bound method DataFrame.nsmallest of    Features  Feature_Importance
0        p1            0.038917
1        p2            0.040555
2        p4            0.040642
3        p3            0.040796
4        g1            0.089804
5        g4            0.093572
6        g2            0.094504
7        g3            0.096153
8      tau4            0.113514
9      tau3            0.115735
10     tau1            0.117553
11     tau2            0.118254>

In [19]:
# Most important is tau2. Least important is p1.