# Cork South Link Road - Rank the Algorithms
Only PM10 > 100 $\mu g/m^{3}$ removed algorithms were ranked for clarity

# Import all the various packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# imports necessary for dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn import svm
from sklearn.decomposition import FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FactorAnalysis
# regression algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.svm import SVR

# metrics for evaluating regression models
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, explained_variance_score, max_error

from time import process_time

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from yellowbrick.regressor import PredictionError

from sklearn.neural_network import MLPRegressor

# Import Results from previous notebooks

In [3]:
Cork100 = pd.read_csv('Cork PM10 100 Results.csv', thousands=',', low_memory=False)

In [4]:
Cork100.tail()

Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12
10,Linear Regression,"NO2, SO2, CO",8.665442,83.887206,6.722068,140.634758,,10,0.255447,1m 45s,0.3,2018-2019,
11,Linear Regression,NO WIND,8.344753,72.961198,6.576008,130.127647,,8,0.229238,1m 9s,0.3,2018-2019,
12,SVM,,8.549101,83.293246,5.844912,165.933118,,7,0.202028,21s,0.3,2018-2019,
13,SVM,"NO2, SO2, CO",7.910517,79.030066,5.476659,144.369334,,10,0.317284,30s,0.3,2018-2019,
14,SVM,NO WIND,7.556473,73.721926,5.170962,126.367275,,8,0.304182,15s,0.3,2018-2019,


# Select Results for a certain combination of input parameters

In [14]:
def SelectAlgorithms(Data, Condition):
    Selecteddf = Data
    Selecteddf = Selecteddf[(Data['Feature Added'] == Condition)]
    
    return Selecteddf
RankBase = SelectAlgorithms(Cork100, 'None')
RankPollutant = SelectAlgorithms(Cork100, 'NO2, SO2, CO')
RankNoWind = SelectAlgorithms(Cork100, 'NO WIND')



In [15]:
display(RankBase)
display(RankPollutant)
display(RankNoWind)

Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12
0,Random Forest,,7.624066,64.418,5.28,121.89467,12.0,7,0.383547,3m 26s,0.3,2018-2019,
3,Gradient Boosting,,7.661094,67.879161,5.272208,126.192397,12.0,7,0.323178,1m 1s,0.3,2018-2019,
6,ANN,,8.690357,73.676564,6.387667,152.263887,,7,0.285866,1m 18s,0.3,2018-2019,
9,Linear Regression,,9.154191,86.945833,7.410409,156.590396,,7,0.126413,1m 11s,0.3,2018-2019,
12,SVM,,8.549101,83.293246,5.844912,165.933118,,7,0.202028,21s,0.3,2018-2019,


Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12
1,Random Forest,"NO2, SO2, CO",6.620102,80.52,4.805,94.243096,12.0,10,0.493072,5m 49s,0.3,2018-2019,
4,Gradient Boosting,"NO2, SO2, CO",6.506614,82.598606,4.187607,101.802665,12.0,10,0.455842,1m 23s,0.3,2018-2019,
7,ANN,"NO2, SO2, CO",7.134201,86.05615,5.452623,99.788855,,10,0.445254,3m 11s,0.3,2018-2019,
10,Linear Regression,"NO2, SO2, CO",8.665442,83.887206,6.722068,140.634758,,10,0.255447,1m 45s,0.3,2018-2019,
13,SVM,"NO2, SO2, CO",7.910517,79.030066,5.476659,144.369334,,10,0.317284,30s,0.3,2018-2019,


Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12
2,Random Forest,NO WIND,6.816733,79.12,4.835,99.405063,12.0,8,0.457891,4m 14s,0.3,2018-2019,
5,Gradient Boosting,NO WIND,7.008613,67.670763,4.75128,107.519445,12.0,8,0.399231,58s,0.3,2018-2019,
8,ANN,NO WIND,7.481185,78.04504,5.596379,118.146988,,8,0.439887,1m 9s,0.3,2017-2019,
11,Linear Regression,NO WIND,8.344753,72.961198,6.576008,130.127647,,8,0.229238,1m 9s,0.3,2018-2019,
14,SVM,NO WIND,7.556473,73.721926,5.170962,126.367275,,8,0.304182,15s,0.3,2018-2019,


# Rank each individual Statistical Arbiter 
I assign rank 1 to the lowest value for MAE, ME, MEAE, MSE because the lower these statistics, the better the algorithm combination with rank 5 being assigned to the highest value. <br>
For R2 I assign rank 1 to the highest value because the higher the correlation coefficient, the better the algorithm combination. Likewise I assign rank 5 to the lowest R2 algorithm combination.

In [17]:
def Rankingdf(Data):
    DataR = Data
    DataR['MAE Rank'] = DataR['MAE'].rank(method='max')
    DataR['ME Rank'] = DataR['ME'].rank(method='max')
    DataR['MEAE Rank'] = DataR['MEAE'].rank(method='max')
    DataR['MSE Rank'] = DataR['MSE'].rank(method='max')
    DataR['R2 Rank'] = DataR['R2'].rank(ascending=False)


    return DataR
    
Rank_Base = Rankingdf(RankBase)
Rank_Pollutant = Rankingdf(RankPollutant)
Rank_NoWind = Rankingdf(RankNoWind)



display(Rank_Base)
display(Rank_Pollutant)
display(Rank_NoWind)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank
0,Random Forest,,7.624066,64.418,5.28,121.89467,12.0,7,0.383547,3m 26s,0.3,2018-2019,,1.0,1.0,2.0,1.0,1.0
3,Gradient Boosting,,7.661094,67.879161,5.272208,126.192397,12.0,7,0.323178,1m 1s,0.3,2018-2019,,2.0,2.0,1.0,2.0,2.0
6,ANN,,8.690357,73.676564,6.387667,152.263887,,7,0.285866,1m 18s,0.3,2018-2019,,4.0,3.0,4.0,3.0,3.0
9,Linear Regression,,9.154191,86.945833,7.410409,156.590396,,7,0.126413,1m 11s,0.3,2018-2019,,5.0,5.0,5.0,4.0,5.0
12,SVM,,8.549101,83.293246,5.844912,165.933118,,7,0.202028,21s,0.3,2018-2019,,3.0,4.0,3.0,5.0,4.0


Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank
1,Random Forest,"NO2, SO2, CO",6.620102,80.52,4.805,94.243096,12.0,10,0.493072,5m 49s,0.3,2018-2019,,2.0,2.0,2.0,1.0,1.0
4,Gradient Boosting,"NO2, SO2, CO",6.506614,82.598606,4.187607,101.802665,12.0,10,0.455842,1m 23s,0.3,2018-2019,,1.0,3.0,1.0,3.0,2.0
7,ANN,"NO2, SO2, CO",7.134201,86.05615,5.452623,99.788855,,10,0.445254,3m 11s,0.3,2018-2019,,3.0,5.0,3.0,2.0,3.0
10,Linear Regression,"NO2, SO2, CO",8.665442,83.887206,6.722068,140.634758,,10,0.255447,1m 45s,0.3,2018-2019,,5.0,4.0,5.0,4.0,5.0
13,SVM,"NO2, SO2, CO",7.910517,79.030066,5.476659,144.369334,,10,0.317284,30s,0.3,2018-2019,,4.0,1.0,4.0,5.0,4.0


Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank
2,Random Forest,NO WIND,6.816733,79.12,4.835,99.405063,12.0,8,0.457891,4m 14s,0.3,2018-2019,,1.0,5.0,2.0,1.0,1.0
5,Gradient Boosting,NO WIND,7.008613,67.670763,4.75128,107.519445,12.0,8,0.399231,58s,0.3,2018-2019,,2.0,1.0,1.0,2.0,3.0
8,ANN,NO WIND,7.481185,78.04504,5.596379,118.146988,,8,0.439887,1m 9s,0.3,2017-2019,,3.0,4.0,4.0,3.0,2.0
11,Linear Regression,NO WIND,8.344753,72.961198,6.576008,130.127647,,8,0.229238,1m 9s,0.3,2018-2019,,5.0,2.0,5.0,5.0,5.0
14,SVM,NO WIND,7.556473,73.721926,5.170962,126.367275,,8,0.304182,15s,0.3,2018-2019,,4.0,3.0,3.0,4.0,4.0


# Final Ranking
Having ranked each individual statistical arbiter, I now sum these ranks and assign a rank number of 1 to 5 based on the final tally. Rank 1 was awarded to the lowest total sum since this algorithm combination has the best combination of statistical arbiters.

In [21]:
def finalrank(Data):
    Rank = Data
    column_list = list(Rank)
    
    column_list.remove("Algorithm") 
    column_list.remove('MAE') 
    column_list.remove('ME') 
    column_list.remove('MEAE')
    column_list.remove('MSE') 
    column_list.remove('Max Depth') 
    column_list.remove('N Features') 
    column_list.remove('R2') 
    column_list.remove('Run Time') 
    column_list.remove('Test-Train Split') 
    column_list.remove('Year')
    #Rank.drop('Sum', axis=0, inplace=True)
    #Rank.drop('Final Rank', axis=0, inplace=True)
    Rank["Sum"] = Rank[column_list].sum(axis=1)
    Rank.drop('Sum', axis=1, inplace=True)
    #Rank.drop('Final Rank', axis=1, inplace=True)
    Rank["Sum"] = Rank[column_list].sum(axis=1)
    Rank['Final Rank'] = Rank['Sum'].rank(ascending=True)
    display(Rank)


In [22]:
finalrank(Rank_Base)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank,Sum,Final Rank
0,Random Forest,,7.624066,64.418,5.28,121.89467,12.0,7,0.383547,3m 26s,0.3,2018-2019,,1.0,1.0,2.0,1.0,1.0,6.0,1.0
3,Gradient Boosting,,7.661094,67.879161,5.272208,126.192397,12.0,7,0.323178,1m 1s,0.3,2018-2019,,2.0,2.0,1.0,2.0,2.0,9.0,2.0
6,ANN,,8.690357,73.676564,6.387667,152.263887,,7,0.285866,1m 18s,0.3,2018-2019,,4.0,3.0,4.0,3.0,3.0,17.0,3.0
9,Linear Regression,,9.154191,86.945833,7.410409,156.590396,,7,0.126413,1m 11s,0.3,2018-2019,,5.0,5.0,5.0,4.0,5.0,24.0,5.0
12,SVM,,8.549101,83.293246,5.844912,165.933118,,7,0.202028,21s,0.3,2018-2019,,3.0,4.0,3.0,5.0,4.0,19.0,4.0


In [23]:
finalrank(Rank_Pollutant)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank,Sum,Final Rank
1,Random Forest,"NO2, SO2, CO",6.620102,80.52,4.805,94.243096,12.0,10,0.493072,5m 49s,0.3,2018-2019,,2.0,2.0,2.0,1.0,1.0,8.0,1.0
4,Gradient Boosting,"NO2, SO2, CO",6.506614,82.598606,4.187607,101.802665,12.0,10,0.455842,1m 23s,0.3,2018-2019,,1.0,3.0,1.0,3.0,2.0,10.0,2.0
7,ANN,"NO2, SO2, CO",7.134201,86.05615,5.452623,99.788855,,10,0.445254,3m 11s,0.3,2018-2019,,3.0,5.0,3.0,2.0,3.0,16.0,3.0
10,Linear Regression,"NO2, SO2, CO",8.665442,83.887206,6.722068,140.634758,,10,0.255447,1m 45s,0.3,2018-2019,,5.0,4.0,5.0,4.0,5.0,23.0,5.0
13,SVM,"NO2, SO2, CO",7.910517,79.030066,5.476659,144.369334,,10,0.317284,30s,0.3,2018-2019,,4.0,1.0,4.0,5.0,4.0,18.0,4.0


In [24]:
finalrank(Rank_NoWind)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Algorithm,Feature Added,MAE,ME,MEAE,MSE,Max Depth,N Features,R2,Run Time,Test-Train Split,Year,Unnamed: 12,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank,Sum,Final Rank
2,Random Forest,NO WIND,6.816733,79.12,4.835,99.405063,12.0,8,0.457891,4m 14s,0.3,2018-2019,,1.0,5.0,2.0,1.0,1.0,10.0,2.0
5,Gradient Boosting,NO WIND,7.008613,67.670763,4.75128,107.519445,12.0,8,0.399231,58s,0.3,2018-2019,,2.0,1.0,1.0,2.0,3.0,9.0,1.0
8,ANN,NO WIND,7.481185,78.04504,5.596379,118.146988,,8,0.439887,1m 9s,0.3,2017-2019,,3.0,4.0,4.0,3.0,2.0,16.0,3.0
11,Linear Regression,NO WIND,8.344753,72.961198,6.576008,130.127647,,8,0.229238,1m 9s,0.3,2018-2019,,5.0,2.0,5.0,5.0,5.0,22.0,5.0
14,SVM,NO WIND,7.556473,73.721926,5.170962,126.367275,,8,0.304182,15s,0.3,2018-2019,,4.0,3.0,3.0,4.0,4.0,18.0,4.0
