# Model Climate Change Performance Index (CCPI) Scores

The goal is to use country emissions data to predict CCPI scores. The following code defines a class that models Climate Change Performance Index (CCPI) scores. The work is packaged as a single Python script plus two data files (the explanatory values and the CCPI scores we want to predict).

In [None]:
from lightgbm import LGBMClassifier

import numpy as np

import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.impute import KNNImputer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier

# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# EWMA smoothing factor (lambda in the recurrence below); can be tuned.
RATE = 0.5


def _ewma_fill_row(row, rate):
    '''
    Fill the NaN entries of one 1-D float array in place using an EWMA.

    EWMA(t) = rate * x(t) + (1 - rate) * EWMA(t-1), seeded with
    EWMA(0) = mean of the observed (non-NaN) values of the row.
    '''
    observed = row[~np.isnan(row)]
    # An entirely missing row has no mean: the seed is NaN and the row stays NaN.
    ewma = observed.mean() if observed.size else np.nan
    for k in range(row.size):
        value = row[k]
        if np.isnan(value):
            if k > 0:
                # Advance the average with the previous (possibly imputed) value.
                ewma = rate * row[k - 1] + (1 - rate) * ewma
            # For k == 0 there is no previous value, so the seed mean is used.
            row[k] = ewma
        else:
            ewma = rate * value + (1 - rate) * ewma


# The original imputation method function
def MissingImpute(dataframe: pd.DataFrame, method: str) -> pd.DataFrame:
    '''
    Return a copy of ``dataframe`` with its missing values imputed.

    The input frame is not modified. The string placeholders '0', '..' and
    '--' are first converted to NaN, then the selected method is applied.
    Column 0 is treated as a label column and is never imputed.

    Parameter:
        dataframe: a pandas dataframe that is to be imputed
        method: a string that represents the imputation method the user wants
            to use. 'knn' runs a KNNImputer (5 neighbours) on the transposed
            frame; 'ewma' fills each row left-to-right with an exponentially
            weighted moving average controlled by RATE. Any other value
            returns the frame with only the placeholders replaced.
    Returns:
        The imputed copy. With 'ewma', every column after the first is
        returned as float.
    '''
    temp = dataframe.copy()
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    temp = temp.replace(['0', '..', '--'], np.nan)

    if method == 'knn':
        # NOTE(review): NaNs are zero-filled before KNNImputer runs, so the
        # imputer (which targets NaN by default) has nothing left to fill.
        # Behaviour kept as-is; confirm whether the zero-fill is intended.
        temp = temp.replace(np.nan, 0).transpose()
        # After the transpose the former first column is the first row;
        # promote it to the header and drop it from the data.
        new_header = temp.iloc[0]
        temp = temp[1:]
        temp.columns = new_header

        imputer = KNNImputer(n_neighbors=5)
        temp = pd.DataFrame(imputer.fit_transform(temp), columns=temp.columns)
        temp = temp.transpose()
        temp.reset_index(inplace=True)
        temp.columns = dataframe.columns

    elif method == 'ewma':
        # Work on a plain float array: much cheaper than per-cell .iloc access.
        values = temp.iloc[:, 1:].astype(float).to_numpy(copy=True)
        for row in values:
            _ewma_fill_row(row, RATE)
        # Rebuild the numeric part as real float columns and re-attach the
        # untouched first column, preserving index and column order.
        filled = pd.DataFrame(values, index=temp.index, columns=temp.columns[1:])
        temp = pd.concat([temp.iloc[:, :1], filled], axis=1)

    return temp

In [None]:
class CCPI_Model:
    '''
    Cross-validate a fixed set of classifiers on a CCPI data sheet.

    Construction does all the work: the Excel sheet at ``data_path`` is read
    with 'country' as the index, the 'CCPI' column becomes the target, the
    remaining columns (minus 'year') become the features and are passed
    through MissingImpute. Ten StandardScaler + classifier pipelines are then
    scored with 10-fold cross-validated accuracy, and the mean score of each
    (as a percentage) is collected in ``summary``.
    '''

    def __init__(self, data_path='data.xlsx', imputation='ewma'):

        def scaled(step_name, estimator):
            # Every candidate model sits behind the same standardisation step.
            return Pipeline([("scaler", StandardScaler()), (step_name, estimator)])

        # --- data -----------------------------------------------------------
        self.df = pd.read_excel(data_path, index_col='country')
        self.y_data = self.df['CCPI']
        self.imputation = imputation
        features = self.df.drop(columns=['year', 'CCPI'])
        self.x_data = MissingImpute(features, self.imputation)

        # --- candidate pipelines ---------------------------------------------
        self.pipeline_GaussianNB = scaled("pipeline_GaussianNB", GaussianNB())
        self.pipeline_BernoulliNB = scaled("pipeline_BernoulliNB", BernoulliNB())
        self.pipeline_LogisticRegression = scaled("pipeline_LogisticRegression",
                                                  LogisticRegression())
        self.pipeline_RandomForest = scaled("pipeline_RandomForest",
                                            RandomForestClassifier())
        self.pipeline_SVM = scaled("pipeline_SVM", SVC())
        self.pipeline_DecisionTree = scaled("pipeline_DecisionTree",
                                            DecisionTreeClassifier())
        self.pipeline_KNN = scaled("pipeline_KNN", KNeighborsClassifier())
        self.pipeline_GBC = scaled("pipeline_GBC", GradientBoostingClassifier())
        self.pipeline_SGD = scaled("pipeline_SGD",
                                   SGDClassifier(max_iter=5000, random_state=0))
        # The LightGBM step keeps its historical step name "pipeline_NN".
        self.pipeline_LGBM = scaled("pipeline_NN", LGBMClassifier())

        # Display names, in the same order as self.pipelines below.
        self.modelNames = [
            "GaussianNB",
            "BernoulliNB",
            "LogisticRegression",
            "RandomForestClassifier",
            "SupportVectorMachine",
            "DecisionTreeClassifier",
            "KNeighborsClassifier",
            "GradientBoostingClassifier",
            "Stochastic Gradient Descent",
            "LGBM",
        ]
        # Position -> display name lookup.
        self.pipe_dict = dict(enumerate(self.modelNames))

        self.pipelines = [
            self.pipeline_GaussianNB,
            self.pipeline_BernoulliNB,
            self.pipeline_LogisticRegression,
            self.pipeline_RandomForest,
            self.pipeline_SVM,
            self.pipeline_DecisionTree,
            self.pipeline_KNN,
            self.pipeline_GBC,
            self.pipeline_SGD,
            self.pipeline_LGBM,
        ]

        # --- evaluation --------------------------------------------------------
        self.cv_results_acc = []
        for candidate in self.pipelines:
            # self.cv_score is overwritten each pass, so after construction it
            # holds the fold scores of the last pipeline only.
            self.cv_score = cross_val_score(
                candidate,
                self.x_data,
                self.y_data,
                scoring="accuracy",
                cv=10,
                error_score='raise',
            )
            mean_accuracy_pct = self.cv_score.mean() * 100
            self.cv_results_acc.append(mean_accuracy_pct)

        self.summary = pd.DataFrame({
            'Model': self.modelNames,
            'CV Score': self.cv_results_acc,
        })

In [None]:
# Build the model on the 'data 10.xlsx' workbook; construction itself runs the
# imputation and the 10-fold cross-validation of every pipeline.
m1 = CCPI_Model(data_path='data 10.xlsx')
# Trailing bare expression: as the last line of a notebook cell it displays
# the per-model summary table (mean CV accuracy in percent).
m1.summary

Unnamed: 0,Model,CV Score
0,GaussianNB,37.795164
1,BernoulliNB,45.483642
2,LogisticRegression,61.244666
3,RandomForestClassifier,68.933144
4,SupportVectorMachine,58.847795
5,DecisionTreeClassifier,63.819346
6,KNeighborsClassifier,60.953058
7,GradientBoostingClassifier,66.25889
8,Stochastic Gradient Descent,54.587482
9,LGBM,67.603129
