# Rank the Algorithms

# Import all the various packages

In [33]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# imports necessary for dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn import svm
from sklearn.decomposition import FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FactorAnalysis
# regression algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.svm import SVR

# metrics for evaluating regression models
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, explained_variance_score, max_error

from time import process_time

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from yellowbrick.regressor import PredictionError

from sklearn.neural_network import MLPRegressor

# Import Results from previous notebooks

In [34]:
PM25incData = pd.read_csv('PM10_Prediction_Results_including_PM25', thousands=',', low_memory=False)

In [35]:
PM25excData = pd.read_csv('PM10_Prediction_Results_excluding_PM25', thousands=',', low_memory=False)

In [36]:
#check results
PM25excData.tail()

Unnamed: 0,Algorithm,MAE,ME,MEAE,MSE,Max Depth,N Features,PM10 > 70 Removed,R2,Run Time,Test-Train Split,Year
28,SVM,6.33491,139.342293,4.579193,88.142247,,6,NO,0.255411,1m 15s,0.7,2015-2019
29,SVM,6.343486,139.913383,4.50395,89.909349,,7,NO,0.249914,1m 11s,0.7,2015-2019
30,SVM,6.280991,137.137695,4.487347,87.120772,,7,NO,0.269951,3m 26s,0.5,2015-2019
31,SVM,6.328256,109.31547,4.495744,87.246364,,7,NO,0.266662,6m 57s,0.3,2015-2019
32,SVM,6.13943,55.746894,4.500375,75.632694,,7,YES,0.294114,7m 21s,0.3,2015-2019


# Select Results for a certain combination of input parameters

In [37]:
def SelectAlgorithms(Data, N_Features, Split, PM_70, Depth1, Depth2, Depth3, Depth4):
    Selecteddf = Data
    Selecteddf = Selecteddf[(Data['N Features'] == N_Features) & (Data['Test-Train Split'] == Split) & (Data['PM10 > 70 Removed'] == PM_70) & (Data['Max Depth'] != Depth1) & (Data['Max Depth'] != Depth2) & (Data['Max Depth'] != Depth3) &(Data['Max Depth'] != Depth4)]
    
    return Selecteddf
RankNo = SelectAlgorithms(PM25incData, 8, 0.3, 'NO', 14, 16, 18, 20)
RankYes_20 = SelectAlgorithms(PM25incData, 8, 0.3, 'YES', 12, 14, 16, 18)
RankYes_18 = SelectAlgorithms(PM25incData, 8, 0.3, 'YES', 12, 14, 16, 20)
RankYes_16 = SelectAlgorithms(PM25incData, 8, 0.3, 'YES', 12, 14, 18, 20)
RankYes_14 = SelectAlgorithms(PM25incData, 8, 0.3, 'YES', 12, 16, 18, 18)
RankYes_12 = SelectAlgorithms(PM25incData, 8, 0.3, 'YES', 14, 16, 18, 20)

RankNo_exc = SelectAlgorithms(PM25excData, 7, 0.3, 'NO', 14, 16, 18, 20)
RankYes_20_exc = SelectAlgorithms(PM25excData, 7, 0.3, 'YES', 12, 14, 16, 18)
RankYes_18_exc = SelectAlgorithms(PM25excData, 7, 0.3, 'YES', 12, 14, 16, 20)
RankYes_16_exc = SelectAlgorithms(PM25excData, 7, 0.3, 'YES', 12, 14, 18, 20)
RankYes_14_exc = SelectAlgorithms(PM25excData, 7, 0.3, 'YES', 12, 16, 18, 18)
RankYes_12_exc = SelectAlgorithms(PM25excData, 7, 0.3, 'YES', 14, 16, 18, 20)

In [38]:
#check selection
display(RankYes_12)

Unnamed: 0,Algorithm,MAE,ME,MEAE,MSE,Max Depth,N Features,PM10 > 70 Removed,R2,Run Time,Test-Train Split,Year
5,Random Forest,3.662137,40.3368,2.72805,25.538662,12.0,8,YES,0.765942,48m 1s,0.3,2015-2019
15,Gradient Boosting,3.601197,39.862221,2.642555,25.355749,12.0,8,YES,0.769093,17m 15s,0.3,2015-2019
25,ANN,3.802356,35.444155,2.870641,27.011881,,8,YES,0.751723,2m 55s,0.3,2015-2019
31,Linear Regression,5.090178,42.097914,4.093631,45.371381,,8,YES,0.573015,24m 34s,0.3,2015-2019
37,SVM,3.738233,40.636611,2.685891,27.820055,,8,YES,0.737571,7m 13s,0.3,2015-2019


# Rank each individual Statistical Arbiter 
I assign rank 1 to the lowest value for MAE, ME, MEAE, MSE because the lower these statistics, the better the algorithm combination with rank 5 being assigned to the highest value. <br>
For R2 I assign rank 1 to the highest value because the higher the correlation coefficient, the better the algorithm combination. Likewise I assign rank 5 to the lowest R2 algorithm combination.

In [39]:
def Rankingdf(Data):
    DataR = Data
    DataR['MAE Rank'] = DataR['MAE'].rank(method='max')
    DataR['ME Rank'] = DataR['ME'].rank(method='max')
    DataR['MEAE Rank'] = DataR['MEAE'].rank(method='max')
    DataR['MSE Rank'] = DataR['MSE'].rank(method='max')
    DataR['R2 Rank'] = DataR['R2'].rank(ascending=False)


    return DataR
    
Rank_No_12 = Rankingdf(RankNo)
Rank_Yes_12 = Rankingdf(RankYes_12)
Rank_Yes_14 = Rankingdf(RankYes_14)
Rank_Yes_16 = Rankingdf(RankYes_16)
Rank_Yes_18 = Rankingdf(RankYes_18)
Rank_Yes_20 = Rankingdf(RankYes_20)

Rank_Yes_12_exc = Rankingdf(RankYes_12_exc)
display(Rank_Yes_20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

Unnamed: 0,Algorithm,MAE,ME,MEAE,MSE,Max Depth,N Features,PM10 > 70 Removed,R2,Run Time,Test-Train Split,Year,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank
9,Random Forest,3.568663,39.434893,2.653853,24.170381,20.0,8,YES,0.768887,1h 38m 10s,0.3,2015-2019,1.0,3.0,1.0,1.0,1.0
19,Gradient Boosting,4.208891,37.325302,3.227378,31.879528,20.0,8,YES,0.686507,49m 1s,0.3,2015-2019,4.0,2.0,4.0,4.0,4.0
25,ANN,3.802356,35.444155,2.870641,27.011881,,8,YES,0.751723,2m 55s,0.3,2015-2019,3.0,1.0,3.0,2.0,2.0
31,Linear Regression,5.090178,42.097914,4.093631,45.371381,,8,YES,0.573015,24m 34s,0.3,2015-2019,5.0,5.0,5.0,5.0,5.0
37,SVM,3.738233,40.636611,2.685891,27.820055,,8,YES,0.737571,7m 13s,0.3,2015-2019,2.0,4.0,2.0,3.0,3.0


# Final Ranking
Having ranked each individual statistical arbiter, I now sum these ranks and assign a rank number of 1 to 5 based on the final tally. Rank 1 was awarded to the lowest total sum since this algorithm combination has the best combination of statistical arbiters.

In [40]:
def finalrank(Data):
    Rank = Data
    column_list = list(Rank)
    
    column_list.remove("Algorithm") 
    column_list.remove('MAE') 
    column_list.remove('ME') 
    column_list.remove('MEAE')
    column_list.remove('MSE') 
    column_list.remove('Max Depth') 
    column_list.remove('N Features') 
    column_list.remove('PM10 > 70 Removed') 
    column_list.remove('R2') 
    column_list.remove('Run Time') 
    column_list.remove('Test-Train Split') 
    column_list.remove('Year')
    #Rank.drop('Sum', axis=0, inplace=True)
    #Rank.drop('Final Rank', axis=0, inplace=True)
    Rank["Sum"] = Rank[column_list].sum(axis=1)
    Rank.drop('Sum', axis=1, inplace=True)
    #Rank.drop('Final Rank', axis=1, inplace=True)
    Rank["Sum"] = Rank[column_list].sum(axis=1)
    Rank['Final Rank'] = Rank['Sum'].rank(ascending=True)
    display(Rank)


In [41]:
finalrank(Rank_Yes_12_exc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Algorithm,MAE,ME,MEAE,MSE,Max Depth,N Features,PM10 > 70 Removed,R2,Run Time,Test-Train Split,Year,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank,Sum,Final Rank
4,Random Forest,5.575581,51.853,4.2515,58.419196,12.0,7,YES,0.462236,38m 49s,0.3,2015-2019,1.0,1.0,1.0,1.0,1.0,5.0,1.0
13,Gradient Boosting,5.741684,53.904401,4.330578,63.026852,12.0,7,YES,0.414243,15m 45s,0.3,2015-2019,2.0,3.0,2.0,2.0,2.0,11.0,2.0
22,ANN,6.172147,52.917141,4.795003,70.325368,,7,YES,0.350875,3m 21s,0.3,2015-2019,4.0,2.0,4.0,3.0,3.0,16.0,3.0
27,Linear Regression,6.958046,60.465936,5.75443,84.702902,,7,YES,0.1932,21m 33s,0.3,2015-2019,5.0,5.0,5.0,5.0,5.0,25.0,5.0
32,SVM,6.13943,55.746894,4.500375,75.632694,,7,YES,0.294114,7m 21s,0.3,2015-2019,3.0,4.0,3.0,4.0,4.0,18.0,4.0


In [9]:
finalrank(Rank_No_12)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Algorithm,MAE,ME,MEAE,MSE,Max Depth,N Features,PM10 > 70 Removed,R2,Run Time,Test-Train Split,Year,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank,Sum,Final Rank
4,Random Forest,3.699649,110.1381,2.7264,28.682918,12.0,8,NO,0.7648,48m 22s,0.3,2015-2019,2.0,3.0,3.0,2.0,2.0,12.0,2.0
14,Gradient Boosting,3.612434,50.204931,2.648087,25.986082,12.0,8,NO,0.782142,17m 11s,0.3,2015-2019,1.0,1.0,1.0,1.0,1.0,5.0,1.0
24,ANN,3.819558,83.061702,2.825361,29.117346,,8,NO,0.748378,2m 56s,0.3,2015-2019,3.0,2.0,4.0,3.0,3.0,15.0,3.0
30,Linear Regression,5.244982,115.782366,4.155983,55.10745,,8,NO,0.546049,24m 18s,0.3,2015-2019,5.0,5.0,5.0,5.0,5.0,25.0,5.0
36,SVM,3.911996,114.815451,2.711142,36.953898,,8,NO,0.704151,7m 15s,0.3,2015-2019,4.0,4.0,2.0,4.0,4.0,18.0,4.0


In [10]:
finalrank(Rank_Yes_14)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Algorithm,MAE,ME,MEAE,MSE,Max Depth,N Features,PM10 > 70 Removed,R2,Run Time,Test-Train Split,Year,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank,Sum,Final Rank
6,Random Forest,3.636881,37.2711,2.714,25.126118,14.0,8,YES,0.767862,48m 33s,0.3,2015-2019,3.0,3.0,4.0,3.0,3.0,16.0,3.0
9,Random Forest,3.568663,39.434893,2.653853,24.170381,20.0,8,YES,0.768887,1h 38m 10s,0.3,2015-2019,2.0,5.0,2.0,2.0,1.0,12.0,2.0
16,Gradient Boosting,3.55904,36.792639,2.647237,23.962651,14.0,8,YES,0.768494,21m 16s,0.3,2015-2019,1.0,2.0,1.0,1.0,2.0,7.0,1.0
19,Gradient Boosting,4.208891,37.325302,3.227378,31.879528,20.0,8,YES,0.686507,49m 1s,0.3,2015-2019,6.0,4.0,6.0,6.0,6.0,28.0,6.0
25,ANN,3.802356,35.444155,2.870641,27.011881,,8,YES,0.751723,2m 55s,0.3,2015-2019,5.0,1.0,5.0,4.0,4.0,19.0,4.0
31,Linear Regression,5.090178,42.097914,4.093631,45.371381,,8,YES,0.573015,24m 34s,0.3,2015-2019,7.0,7.0,7.0,7.0,7.0,35.0,7.0
37,SVM,3.738233,40.636611,2.685891,27.820055,,8,YES,0.737571,7m 13s,0.3,2015-2019,4.0,6.0,3.0,5.0,5.0,23.0,5.0


In [11]:
finalrank(Rank_Yes_16)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Algorithm,MAE,ME,MEAE,MSE,Max Depth,N Features,PM10 > 70 Removed,R2,Run Time,Test-Train Split,Year,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank,Sum,Final Rank
7,Random Forest,3.597277,35.82686,2.662925,24.739617,16.0,8,YES,0.76967,48m 50s,0.3,2015-2019,1.0,2.0,2.0,1.0,1.0,7.0,1.0
17,Gradient Boosting,3.662421,40.525278,2.658181,26.369878,16.0,8,YES,0.749356,28m 53s,0.3,2015-2019,2.0,3.0,1.0,2.0,3.0,11.0,2.0
25,ANN,3.802356,35.444155,2.870641,27.011881,,8,YES,0.751723,2m 55s,0.3,2015-2019,4.0,1.0,4.0,3.0,2.0,14.0,3.0
31,Linear Regression,5.090178,42.097914,4.093631,45.371381,,8,YES,0.573015,24m 34s,0.3,2015-2019,5.0,5.0,5.0,5.0,5.0,25.0,5.0
37,SVM,3.738233,40.636611,2.685891,27.820055,,8,YES,0.737571,7m 13s,0.3,2015-2019,3.0,4.0,3.0,4.0,4.0,18.0,4.0


In [12]:
finalrank(Rank_Yes_18)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Algorithm,MAE,ME,MEAE,MSE,Max Depth,N Features,PM10 > 70 Removed,R2,Run Time,Test-Train Split,Year,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank,Sum,Final Rank
8,Random Forest,3.540611,40.290784,2.618691,24.207407,18.0,8,YES,0.77189,1h 38m 13s,0.3,2015-2019,1.0,2.0,1.0,1.0,1.0,6.0,1.0
18,Gradient Boosting,3.944417,44.795137,2.734883,32.532326,18.0,8,YES,0.687437,41m 50s,0.3,2015-2019,4.0,5.0,3.0,4.0,4.0,20.0,4.0
25,ANN,3.802356,35.444155,2.870641,27.011881,,8,YES,0.751723,2m 55s,0.3,2015-2019,3.0,1.0,4.0,2.0,2.0,12.0,2.0
31,Linear Regression,5.090178,42.097914,4.093631,45.371381,,8,YES,0.573015,24m 34s,0.3,2015-2019,5.0,4.0,5.0,5.0,5.0,24.0,5.0
37,SVM,3.738233,40.636611,2.685891,27.820055,,8,YES,0.737571,7m 13s,0.3,2015-2019,2.0,3.0,2.0,3.0,3.0,13.0,3.0


In [13]:
finalrank(Rank_Yes_20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Algorithm,MAE,ME,MEAE,MSE,Max Depth,N Features,PM10 > 70 Removed,R2,Run Time,Test-Train Split,Year,MAE Rank,ME Rank,MEAE Rank,MSE Rank,R2 Rank,Sum,Final Rank
9,Random Forest,3.568663,39.434893,2.653853,24.170381,20.0,8,YES,0.768887,1h 38m 10s,0.3,2015-2019,1.0,3.0,1.0,1.0,1.0,7.0,1.0
19,Gradient Boosting,4.208891,37.325302,3.227378,31.879528,20.0,8,YES,0.686507,49m 1s,0.3,2015-2019,4.0,2.0,4.0,4.0,4.0,18.0,4.0
25,ANN,3.802356,35.444155,2.870641,27.011881,,8,YES,0.751723,2m 55s,0.3,2015-2019,3.0,1.0,3.0,2.0,2.0,11.0,2.0
31,Linear Regression,5.090178,42.097914,4.093631,45.371381,,8,YES,0.573015,24m 34s,0.3,2015-2019,5.0,5.0,5.0,5.0,5.0,25.0,5.0
37,SVM,3.738233,40.636611,2.685891,27.820055,,8,YES,0.737571,7m 13s,0.3,2015-2019,2.0,4.0,2.0,3.0,3.0,14.0,3.0
