**Imported Packages and Datasets**

In [None]:
import pandas as pd
from zipfile import ZipFile
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython import display
from time import sleep
import scipy.stats  as stats
from statistics import mean,mode,stdev
from scipy.stats import pearsonr
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
import math
import sklearn
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from itertools import permutations
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn import mixture
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import scipy.stats as ss
from scipy.spatial import distance

In [None]:
# Load the literacy / numeracy dataset and derive the lookups that the rest of
# the notebook reads as globals.
File = pd.read_csv('LiteracyNumeracyDataset2.csv')

# Name of the column identifying a country (used as the id column when melting).
country = 'Country'

# Year columns of the dataset, oldest first.
Year = [
    '2013', '2014', '2015',
    '2016', '2017', '2018',
]

# Distinct countries and indicators, in order of first appearance in the file.
countries = File['Country'].unique()
Indicator = File['Indicator'].unique()

In [None]:
# Load the political gender balance dataset and list the indicators it contains.
FileGen = pd.read_csv('PoliticalGenderBalanceDataset2.csv')

# Same year columns as the literacy dataset (re-declared so this cell runs on its own).
Year = [
    '2013', '2014', '2015',
    '2016', '2017', '2018',
]

# Distinct indicators, in order of first appearance in the file.
IndicatorGen = FileGen['Indicator'].unique()

In [None]:
# Load Unicef dataset 1 (financial inclusion, family demand, labour participation rates).
# The file is not UTF-8, hence the explicit ISO-8859-1 encoding.
FileUnicef1 = pd.read_csv('UnicefDatasets1.csv', encoding="ISO-8859-1")

# Every column from position 5 onwards is treated as an indicator
# (presumably the first five columns hold country metadata - verify against the CSV).
IndicatorUnicef1 = FileUnicef1.columns[5:].tolist()

**Functions**

In [None]:
# function to rearrange dataframe
def rearrange_dataframe(df, indicator_name):
    """Reshape one indicator from wide (one column per year) to long format.

    Rows of ``df`` whose 'Indicator' equals ``indicator_name`` are selected and
    melted (see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html)
    so that every (year, country) pair becomes one row.

    Relies on the notebook-level globals ``country`` (id column name) and
    ``Year`` (list of year column names).

    Returns:
        DataFrame indexed by ('year', country) with a single column named
        ``indicator_name``.
    """
    id_column = country
    wide = df.loc[df['Indicator'] == indicator_name, [id_column] + Year]
    long_form = (
        wide
        .melt(id_vars=id_column, var_name='year')
        .rename(columns={'value': indicator_name})
    )
    return long_form.set_index(['year', id_column])

# function to get most recent value of attribute of each country and create dataset
def get_recent_value(df, indicator_name, country_names=None, years=None):
    """Return the most recent non-missing value of one indicator per country.

    Args:
        df: wide DataFrame with 'Country' and 'Indicator' columns plus one
            column per year.
        indicator_name: value of the 'Indicator' column to extract.
        country_names: countries to report, in output order. Defaults to the
            notebook-level global ``countries``.
        years: year column names, oldest first. Defaults to the notebook-level
            global ``Year``.

    Returns:
        DataFrame with columns ['Country', indicator_name] and one row per
        requested country. The value is NaN when every year is missing or when
        the country has no row for this indicator.

    Raises:
        ValueError: if a country has more than one row for the indicator
            (the value would be ambiguous).
    """
    if country_names is None:
        country_names = countries
    if years is None:
        years = Year
    year_columns = list(years)

    # Filter on the indicator once instead of once per country and year.
    subset = df.loc[df['Indicator'] == indicator_name]

    records = []
    for name in country_names:
        rows = subset.loc[subset['Country'] == name, year_columns]
        if len(rows) > 1:
            raise ValueError(
                "Expected at most one row for country %r and indicator %r, found %d"
                % (name, indicator_name, len(rows))
            )
        recent = np.nan
        if len(rows) == 1:
            # Walk from the latest year backwards and stop at the first real value.
            for value in reversed(rows.iloc[0].tolist()):
                if not pd.isna(value):
                    recent = float(value)
                    break
        records.append({'Country': name, indicator_name: recent})

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(records, columns=['Country', indicator_name])

# function to find correlations and significance between features of dataset
def Correlation_features(X,IndicatorList):
    """Pairwise Pearson correlation and p-value between the columns of ``X``.

    Args:
        X: 2-D array whose column ``k`` holds the values of ``IndicatorList[k]``.
        IndicatorList: sequence of feature names; only its length is used.

    Returns:
        (CorrMatrix, SigMatrix): two square arrays of size len(IndicatorList).
        CorrMatrix holds the correlation coefficients with 1 on the diagonal;
        SigMatrix holds the matching p-values with the diagonal left at 0.
    """
    size = len(IndicatorList)
    CorrMatrix = np.zeros((size, size))
    SigMatrix = np.zeros((size, size))
    for row, col in itertools.product(range(size), repeat=2):
        if row == col:
            # A feature is perfectly correlated with itself; no test is run.
            CorrMatrix[row, col] = 1
            continue
        result = pearsonr(X[:, row], X[:, col])
        CorrMatrix[row, col] = result[0]   # correlation coefficient
        SigMatrix[row, col] = result[1]    # two-sided p-value
    return CorrMatrix, SigMatrix

# function that prints heatmap of correlations
def correlation_heatmap(CorrMatrix):
    %matplotlib qt
    ax = sns.heatmap(CorrMatrix, vmin=-1, vmax=1,cmap='cool')
    return

# function that prints moderate correlations that are significant
def significant_correlations(CorrMatrix,SigMatrix,IndicatorList):
    """Print every ordered feature pair with |r| > 0.5 and p-value < 0.05.

    Args:
        CorrMatrix: square array of correlation coefficients.
        SigMatrix: square array of the matching p-values.
        IndicatorList: feature names, in matrix order.

    Each qualifying pair is printed once per direction (i vs j and j vs i);
    "None" is printed when no pair qualifies. Returns None.
    """
    strength = abs(CorrMatrix)
    print("\033[1m" + "Indicators with a moderate correlation (>0.5) and are significant at a 5% level of significance:"
          + "\033[0;0m")
    size = len(IndicatorList)
    reported = 0
    for first, second in itertools.product(range(size), repeat=2):
        if first == second:
            continue
        if not (strength[first, second] > 0.5):
            continue
        if not (SigMatrix[first, second] < 0.05):
            continue
        reported += 1
        print(str(IndicatorList[first]) + " V.S. " + str(IndicatorList[second])
              + " ---> " + str(CorrMatrix[first, second]) + "\n")
    if reported == 0:
        print("None")
    return

# function that plots indicator over time
def plot_indicator(df, indicator_name):
    """Plot one line per country showing ``indicator_name`` over the years.

    Args:
        df: long-format DataFrame with a two-level index whose level 0 is the
            year and level 1 the country (the layout returned by
            ``rearrange_dataframe``), containing a column ``indicator_name``.
        indicator_name: column to plot; also used as the y-axis label.

    Returns None; the figure is drawn on a new 15x10 matplotlib figure.
    """
    fig, ax = plt.subplots(figsize=[15 ,10])
    # One line per country (index level 1).
    for label, dfi in df.groupby(level=1):
        dfi[indicator_name].plot(ax=ax, label=label)
    ax.legend()
    ax.set_ylabel(indicator_name)
    # Label the x axis with the years found in the frame itself. Tick positions
    # are fixed first so the labels cannot drift away from the data points
    # (assumes each country has one row per year, plotted at positions 0..n-1).
    years = df.index.levels[0].values
    ax.set_xticks(range(len(years)))
    ax.set_xticklabels(years)
    ax.set_xlabel('year')
    return

# function that assigns a group colour to each datapoint based on label (GNI or Region)
def get_regions_and_gni(File,FileUnicef1,r,g,h,d,cl,attribute,country_names=None):
    """Collect the class labels of every country and a colour per label.

    Args:
        File: DataFrame with 'Country', 'Region' and 'GNI' columns.
        FileUnicef1: DataFrame with 'Country', 'HPI_Class' and 'Development' columns.
        r: distinct region names; ``cl[i]`` is the colour of ``r[i]``.
        g: distinct GNI groups; ``cl[i]`` is the colour of ``g[i]``.
        h, d: accepted for signature compatibility; not used here.
        cl: list of colour names.
        attribute: 'Region' or 'GNI' selects which label drives ``colourList``;
            any other value leaves ``colourList`` empty.
        country_names: countries to process, in output order. Defaults to the
            notebook-level global ``countries``.

    Returns:
        (RegionsList, colourList, gniList, happyList, devList). Each list holds
        the distinct label values found for each country, concatenated in
        country order.
    """
    if country_names is None:
        country_names = countries

    def _colours_for(values, labels, palette):
        # Map every label value to the colour at the same position as its name.
        # Labels are compared as strings, one scalar at a time.
        matched = []
        for value in values:
            for label, colour in zip(labels, palette):
                if str(label) == str(value):
                    matched.append(colour)
                    break
        return matched

    RegionsList = []
    gniList = []
    colourList = []
    happyList = []
    devList = []
    for name in country_names:
        Data = File.loc[File['Country'] == name]
        RegionName = Data['Region'].unique()
        gniName = Data['GNI'].unique()
        RegionsList.extend(RegionName)
        gniList.extend(gniName)

        Data2 = FileUnicef1.loc[FileUnicef1['Country'] == name]
        happyList.extend(Data2['HPI_Class'].unique())
        devList.extend(Data2['Development'].unique())

        # The previous code compared str(r[i]) with str(RegionName), i.e. with the
        # repr of a whole numpy array ("['Asia']"), so no colour was ever matched.
        if attribute == 'Region':
            colourList.extend(_colours_for(RegionName, r, cl))
        elif attribute == 'GNI':
            colourList.extend(_colours_for(gniName, g, cl))
    return RegionsList,colourList,gniList,happyList,devList

# function that plots a 3d scatterplot of chosen features
def plot_scatter(xvalues,yvalues,zvalues,REGIONS,GNI,HAPPY,DEV,attribute,Index1,Index2,Index3):
    %matplotlib qt
    fig = plt.figure(num=1)
    ax = fig.add_subplot(111, projection='3d')
    if (attribute == 'Region'):
        for j in range(len(r)):
            index = []
            index = [i for i, x in enumerate(REGIONS) if x == r[j]]
            xx = [xvalues[x] for x in index]
            yy = [yvalues[x] for x in index]
            zz = [zvalues[x] for x in index]
            ax.scatter(xx,yy,zz,label=r[j],color=cl[j])
            ax.legend(loc='best')
            ax.set_xlabel(IndicatorList[Index1])
            ax.set_ylabel(IndicatorList[Index2])
            ax.set_zlabel(IndicatorList[Index3])
    if (attribute == 'GNI'):
        for j in range(len(g)):
            index = []
            index = [i for i, x in enumerate(GNI) if x == g[j]]
            xx = [xvalues[x] for x in index]
            yy = [yvalues[x] for x in index]
            zz = [zvalues[x] for x in index]
            ax.scatter(xx,yy,zz,label=g[j],color=cl[j])
            ax.legend(loc='upper left')
            #ax.legend(loc='best')
            ax.set_xlabel(IndicatorList[Index1])
            ax.set_ylabel(IndicatorList[Index2])
            ax.set_zlabel(IndicatorList[Index3])
    if (attribute == 'Happy'):
        for j in range(len(h)):
            index = []
            index = [i for i, x in enumerate(HAPPY) if x == h[j]]
            xx = [xvalues[x] for x in index]
            yy = [yvalues[x] for x in index]
            zz = [zvalues[x] for x in index]
            ax.scatter(xx,yy,zz,label=h[j],color=clh[j])
            ax.legend(loc='best')
            ax.set_xlabel(IndicatorList[Index1])
            ax.set_ylabel(IndicatorList[Index2])
            ax.set_zlabel(IndicatorList[Index3])
    if (attribute == 'Dev'):
        for j in range(len(d)):
            index = []
            index = [i for i, x in enumerate(DEV) if x == d[j]]
            xx = [xvalues[x] for x in index]
            yy = [yvalues[x] for x in index]
            zz = [zvalues[x] for x in index]
            ax.scatter(xx,yy,zz,label=d[j],color=cl[j])
            ax.legend(loc='best')
            ax.set_xlabel(IndicatorList[Index1])
            ax.set_ylabel(IndicatorList[Index2]) 
            ax.set_zlabel(IndicatorList[Index3])
 
# function that finds the most likely mapping of K-means labels to GNI or Region labels in dataset (thus to test accuracy
# of K-means in classifying "unseen" test datapoints)

def permutation_labels(q,attribute,Klabels,y_train):
    """Find the cluster-id -> class-name assignment that best matches ``y_train``.

    Every permutation ``p`` of ``range(len(q))`` is tried; under ``p`` cluster id
    ``p[c]`` is renamed to ``q[c]``. The permutation with the highest share of
    training points whose renamed cluster equals the true label wins (the first
    one on ties).

    Args:
        q: class names.
        attribute: unused; kept for signature compatibility.
        Klabels: cluster id of every training point.
        y_train: true class label of every training point.

    Returns:
        The winning permutation as a tuple of ints.
    """
    candidates = list(permutations(range(len(q))))
    scores = np.zeros((len(candidates)))
    for idx, candidate in enumerate(candidates):
        mapping = dict(zip(candidate, q))
        relabelled = [mapping.get(cluster, cluster) for cluster in Klabels]
        # Stacking converts both columns to a common dtype before comparing.
        paired = np.column_stack([relabelled, y_train])
        hits = sum(1 for guess, truth in paired if guess == truth)
        scores[idx] = (float(hits) / float(len(y_train))) * 100
    return candidates[np.argmax(scores)]

# function that finds accuracy of k-means in classifying test datapoints
def K_means_test_accuracy(num,t_size,q,attribute,OUTPUT,xvalues,yvalues,zvalues,TestAccuracyMean,b,c):
    """Evaluate mini-batch K-means as a classifier over several train/test splits.

    For each of ``num`` splits (split ``i`` uses ``random_state=i``) the model is
    fitted on the training part, its clusters are mapped to class names with
    ``permutation_labels`` and the held-out part is classified.

    Args:
        num: number of train/test splits.
        t_size: share of the data held out for testing.
        q: class names; also the number of clusters.
        attribute: forwarded to ``permutation_labels``.
        OUTPUT: true class label of every data point.
        xvalues, yvalues, zvalues: the three feature columns.
        TestAccuracyMean: (1, n) array; entry [0, b] is overwritten with the
            mean test accuracy (in percent) over the splits.
        b: column of ``TestAccuracyMean`` to write.
        c: random seed of the K-means initialisation.

    Returns:
        (TestAccuracyMean, km_results) where km_results is
        [precision, recall, f1], each of shape (1, len(q)), averaged over the
        splits in which the class had non-zero precision.

    Note:
        ``classification_report`` expects (y_true, y_pred). The arguments used
        to be passed the other way round, which exchanged precision and recall;
        figures produced before this fix have those two columns swapped.
    """
    n_classes = len(q)
    testaccuracy = np.zeros((1,num))
    km_precision = np.zeros((1,n_classes))
    km_recall = np.zeros((1,n_classes))
    km_f1 = np.zeros((1,n_classes))
    km_num = np.zeros((1,n_classes))

    # The feature matrix does not depend on the split, so build it once.
    Xinput = np.column_stack([np.array(xvalues),np.array(yvalues),np.array(zvalues)])

    for split in range(num):
        X_train, X_test, y_train, y_test = train_test_split(Xinput, OUTPUT, test_size=t_size, random_state=split)
        kmeans = MiniBatchKMeans(n_clusters=int(n_classes),random_state=c).fit(X_train)

        # Translate cluster ids into class names using the best-matching permutation.
        per = permutation_labels(q,attribute,kmeans.labels_,y_train)
        labeldict = {per[idx]:q[idx] for idx in range(n_classes)}
        y_pred = [labeldict.get(item,item) for item in kmeans.predict(X_test)]

        # Stacking converts both columns to a common dtype before comparing.
        BothLabelstest = np.column_stack([y_pred,y_test])
        hits = int(np.count_nonzero(BothLabelstest[:,0] == BothLabelstest[:,1]))
        testaccuracy[0,split] = (float(hits)/float(len(y_test)))*100

        report = classification_report(y_test,y_pred,target_names=q,labels=q,output_dict=True)
        for v in range(n_classes):
            class_report = report[q[v]]
            km_precision[0,v] += float(class_report['precision'])
            km_recall[0,v] += float(class_report['recall'])
            km_f1[0,v] += float(class_report['f1-score'])
            # Only splits in which the class was actually predicted correctly at
            # least once count towards its average.
            if float(class_report['precision'])>0:
                km_num[0,v] += 1

    # Avoid 0/0 for classes that never scored. The previous guard looped over
    # len(km_num), which is 1, and overwrote the whole row with ones.
    km_num[km_num == 0] = 1

    k_p = np.divide(km_precision, km_num)
    k_r = np.divide(km_recall, km_num)
    k_f = np.divide(km_f1, km_num)
    km_results = [k_p,k_r,k_f]
    TestAccuracyMean[0,b] = np.mean(testaccuracy)
    return TestAccuracyMean,km_results

**Rearranged and merged datasets (using extra trees regression to predict missing values)**

In [None]:
# REARRANGE DATASETS IN ORDER TO USE EXTRA TREES REGRESSION TO PREDICT MISSING VALUES OF DATASET

# Most recent value of every literacy indicator: one column per indicator, one
# row per entry of `countries`. The columns are concatenated directly; merging
# two-column frames one by one produced colliding 'Country_x' suffixes, which
# pandas >= 2.0 rejects with a MergeError.
LitData = pd.concat(
    [get_recent_value(File, name)[name] for name in Indicator],
    axis=1,
)

# Same for the political gender balance indicators.
PolData = pd.concat(
    [get_recent_value(FileGen, name)[name] for name in IndicatorGen],
    axis=1,
)

# Unicef indicators (columns 5 onwards) for the countries present in the literacy data.
# NOTE(review): rows keep the order of the Unicef file and are aligned with the
# other two tables by position only - confirm both files list countries in the same order.
Uni1Data = FileUnicef1.loc[FileUnicef1['Country'].isin(countries)]
Uni1Data = Uni1Data.reset_index(drop=True).iloc[:, 5:]

# Combine all data side by side, keeping only row positions present in all three tables.
AllData = pd.concat([LitData, PolData, Uni1Data], axis=1, join='inner')
assert len(AllData) == len(countries), (
    "Expected one row per country: got %d rows for %d countries"
    % (len(AllData), len(countries))
)

# Use iterative imputation with extra-trees regression to predict all missing values in AllData.
# NOTE(review): IterativeImputer drops columns that are entirely missing, which
# would shift the column positions of X relative to the indicator lists.
imp = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=100, random_state=0),missing_values=np.nan,max_iter = 30, random_state=0)
imp.fit(AllData)
A = imp.transform(AllData)

# X is the complete (imputed) feature matrix used by all later cells.
X = A.copy()

**Correlations between features**

In [None]:
# Feature names of all three datasets in the column order of X, followed by the
# pairwise correlation analysis and its heatmap.
IndicatorList = np.concatenate(
    (np.array(Indicator), IndicatorGen, IndicatorUnicef1),
    axis=None,
)

# CORR is the (correlation matrix, p-value matrix) pair.
CORR = Correlation_features(X, IndicatorList)
CORRELATION, SIGNIFICANCE = CORR[0], CORR[1]

significant_correlations(CORRELATION, SIGNIFICANCE, IndicatorList)
correlation_heatmap(CORRELATION)

**3D Scatterplots between features of the 3 datasets (Literacy, Political and Unicef data)** 

**INDEX LIST OF FEATURES IN X:**

0 --> 'Adult illiterate population, 15+ years, % female'

1 --> 'Adult literacy rate, population 15+ years, gender parity index (GPI)'

2 --> 'Learning poverty: gender difference (%)'

3 --> 'Learning poverty: Share of Female Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%)'

4 --> 'Learning poverty: Share of Male Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%)'

5 --> 'Literacy rate, adult female (% of females ages 15 and above)'

6 --> 'Literacy rate, adult gender difference (%)'

7 --> 'Literacy rate, adult male (% of males ages 15 and above)'

8 --> 'Literacy rate, youth (ages 15-24), gender parity index (GPI)'

9 --> 'Literacy rate, youth female (% of females ages 15-24)'

10 --> 'Literacy rate, youth gender difference (%)'

11 --> 'Literacy rate, youth male (% of males ages 15-24)'

12 --> 'Mean performance on the mathematics scale. Female'

13 --> 'Mean performance on the mathematics scale. Gender Difference'

14 --> 'Mean performance on the mathematics scale. Male'

15 --> 'Mean performance on the reading scale. Female'

16 --> 'Mean performance on the reading scale. Gender Difference'

17 --> 'Mean performance on the reading scale. Male'

18 --> 'Mean performance on the science scale. Female'

19 --> 'Mean performance on the science scale. Gender Difference'

20 --> 'Mean performance on the science scale. Male'

21 --> 'Youth illiterate population, 15-24 years, % female'

22 --> 'Youth illiterate population, 15-24 years, % male'

23 --> 'Youth illiterate population, 15-24 years, gender difference (%)'

24 --> 'Proportion of seats held by women in national parliaments (%)'

25 --> 'Proportion of women in ministerial level positions (%)'

26 --> 'Political Empowerment Score'

27 --> 'Proportion of women in managerial positions'

28 --> 'Proportion of women in senior and middle management positions'

29 --> 'Share of female judges'

30 --> 'Share of female police officers'

31 --> 'Financial inclusion (%) - male'

32 --> 'Financial inclusion (%) - female '

33 --> 'Urban labour force participation rate (%) - male'

34 --> 'Urban labour force participation rate (%) - female'

35 --> 'Rural labour force participation rate (%) - male'

36 --> 'Rural labour force participation rate (%) - female'

37 --> 'Total labour force participation rate (%) - male'

38 --> 'Total labour force participation rate (%) - female'

39 --> 'Demand for family planning satisfied with modern methods (%) - Women aged 15-49'

40 --> 'Demand for family planning satisfied with modern methods (%) - Women aged 15-19'

41 --> 'Government Expenditure on Health as % of GDP'

42 --> 'Government Expenditure on Education as % of GDP'

43 --> 'Births by Age 18'

44 --> 'Adolescent birth rate (number of live births to adolescent women per 1,000 adolescent women)'

45 --> 'Percentage of adolescents (aged 10-14 years) engaged in household chores - male'

46 --> 'Percentage of adolescents (aged 10-14 years) engaged in household chores - female'

47 --> 'Percentage of boys (aged 15-17 years) who have experienced sexual violence'

48 --> 'Percentage of girls (aged 15-17 years) who have experienced sexual violence'

49 --> 'Percentage of women (aged 20-24 years) married or in union before age 15'

50 --> 'Percentage of women (aged 20-24 years) married or in union before age 18'

51 --> 'Percentage of women (aged 15-49 years) who consider a husband to be justified in hitting or beating his wife for at least one of the specified reasons'

52 --> 'Happy Planet Index'

**Feature Selection for K-Means and K-NN --> selecting subset of feature combinations that best separate class labels**

In [None]:
# Enumerate every candidate feature triple: one literacy feature combined with
# every ordered pair of features taken from the political + Unicef features.
alllist = [Indicator,IndicatorGen,IndicatorUnicef1]
first_count = int(len(Indicator))
other_count = int(len(IndicatorGen)+len(IndicatorUnicef1))
n = first_count*other_count*other_count

# Result holders filled by the scoring loop that follows.
TestAccuracyMean = np.zeros((1,n))
CentroidSeparabilityList = []
SilList = []

# Column `a` of TestFeatures holds the three column indices (into X) of combination `a`.
TestFeatures = np.zeros((3,n))
triples = itertools.product(range(first_count), range(other_count), range(other_count))
for a, (i, j, k) in enumerate(triples):
    TestFeatures[:,a] = (i, first_count + j, first_count + k)
a = n

# Calculation of Centroid separation and Silhouette score of each feature combination
attribute = 'Dev' #set to 'Region' or 'GNI' or 'Happy' or 'Dev'

# Class names and colours. They are defined here because this cell runs before
# the scatterplot cell that also defines them; without this the cell fails on a
# fresh kernel (Restart & Run All).
r = File.Region.unique()
g = File.GNI.unique()
h = FileUnicef1.HPI_Class.unique()
h = np.delete(h,3)  # drops the 4th distinct HPI class (presumably the missing-value entry - verify)
d = FileUnicef1.Development.unique()
cl = ['red','blue','green','orange','purple','cyan']

# The labels of each country do not depend on the feature combination, so look
# them up once instead of once per combination.
RCG = get_regions_and_gni(File,FileUnicef1,r,g,h,d,cl,attribute)
REGIONS = RCG[0]
COLOUR = RCG[1]
GNI = RCG[2]
HAPPY = RCG[3]
DEV = RCG[4]

# Per-country labels and class names for the selected attribute. The silhouette
# score previously always used DEV, whatever `attribute` was set to.
labels_by_attribute = {
    'Region': (REGIONS, r),
    'GNI': (GNI, g),
    'Happy': (HAPPY, h),
    'Dev': (DEV, d),
}
class_labels, class_names = labels_by_attribute[attribute]

scaler = StandardScaler()
for i in range(n):
    print(i)
    Index1 = int(TestFeatures[0,i]) # First feature: a literacy indicator
    Index2 = int(TestFeatures[1,i]) # Second feature: a political or Unicef indicator
    Index3 = int(TestFeatures[2,i]) # Third feature: a political or Unicef indicator
    xvalues = scaler.fit_transform(X[:,Index1].reshape(-1, 1))
    yvalues = scaler.fit_transform(X[:,Index2].reshape(-1, 1))
    zvalues = scaler.fit_transform(X[:,Index3].reshape(-1, 1))
    XXXX = np.column_stack([xvalues,yvalues,zvalues])

    # How well the true classes separate in this 3-D feature space.
    SilList.append(metrics.silhouette_score(XXXX, class_labels, metric='sqeuclidean'))

    # Centroid of every class in the standardised feature space.
    Centroid = []
    for class_name in class_names:
        members = [pos for pos, label in enumerate(class_labels) if label == class_name]
        xx = [xvalues[pos] for pos in members]
        yy = [yvalues[pos] for pos in members]
        zz = [zvalues[pos] for pos in members]
        Centroid.append((np.mean(xx),np.mean(yy),np.mean(zz)))

    # Sum of the distances between all ordered pairs of centroids (each
    # unordered pair is counted twice, which does not affect the ranking).
    dist = 0
    for k in range(len(Centroid)):
        for l in range(len(Centroid)):
            dist = dist + float(distance.euclidean(Centroid[k],Centroid[l]))
    CentroidSeparabilityList.append(dist)

def _top_ranked_columns(scores, features, top_k=200):
    """Return the columns of ``features`` with the ``top_k`` highest scores.

    Args:
        scores: one score per column of ``features``; NaN counts as the lowest score.
        features: 2-D array with one column per feature combination.
        top_k: number of combinations to keep; capped at the number available.

    Returns:
        Array of shape (features.shape[0], min(top_k, len(scores))) whose
        columns keep their original relative order.
    """
    scores = np.asarray(scores, dtype=float)
    scores = np.where(np.isnan(scores), -np.inf, scores)
    keep = min(int(top_k), scores.size)
    # A stable sort gives every combination a distinct position even on ties,
    # so exactly `keep` columns are selected. Thresholding averaged tie ranks
    # could select more than 200 (IndexError) or fewer (leftover all-zero
    # columns, which silently stood for the feature triple 0, 0, 0).
    order = np.argsort(scores, kind='stable')
    chosen = np.sort(order[scores.size - keep:])
    return np.asarray(features)[:, chosen].astype(float)

#Get rank of centroids
GNIFEATURESCENTROID = CentroidSeparabilityList
GNIFEATURESRANK = list(ss.rankdata(GNIFEATURESCENTROID))

# top 200 combination of features by centroid separation
TestFeaturesDev = _top_ranked_columns(GNIFEATURESCENTROID, TestFeatures, top_k=200)
print(TestFeaturesDev)

# Get rank of Silhouette scores
GNIFEATURESCENTROID = SilList
GNIFEATURESRANK = list(ss.rankdata(GNIFEATURESCENTROID))

# top 200 combination of features by silhouette score
TestFeaturesDevSil = _top_ranked_columns(GNIFEATURESCENTROID, TestFeatures, top_k=200)
print(TestFeaturesDevSil)

# Centroid: TestFeaturesHappy, TestFeaturesGNI, TestFeaturesRegion,TestFeaturesDev
# Silhouette: TestFeaturesHappySil, TestFeaturesGNISil, TestFeaturesRegionSil,TestFeaturesDevSil

In [None]:
# SCATTERPLOTS 
# define features to plot and label attribute
Index1 = 23 # First feature 0 to 23    
Index2 = 44 # Second feature 24 to 30  
Index3 = 52 # Third feature 31 to 52   
attribute = 'Happy' #set to 'Region' or 'GNI' or 'Happy' or 'Dev'

# define regions, gni and colours in plot
r = File.Region.unique()
g = File.GNI.unique()
h = FileUnicef1.HPI_Class.unique()
h = np.delete(h,3)
d = FileUnicef1.Development.unique()
cl = ['red','blue','green','orange','purple','cyan']
clh = ['darkred','yellow','orange','green','red']
# scaled x and y values to use in scatterplot (Using datset in X)
scaler = StandardScaler()
xvalues = scaler.fit_transform(X[:,Index1].reshape(-1, 1))
yvalues = scaler.fit_transform(X[:,Index2].reshape(-1, 1))
zvalues = scaler.fit_transform(X[:,Index3].reshape(-1, 1))
RCG = get_regions_and_gni(File,FileUnicef1,r,g,h,d,cl,attribute)
REGIONS = RCG[0]
COLOUR = RCG[1]
GNI = RCG[2]
HAPPY = RCG[3]
DEV = RCG[4]
# plot scatter plot
%matplotlib qt
plot_scatter(xvalues,yvalues,zvalues,REGIONS,GNI,HAPPY,DEV,attribute,Index1,Index2,Index3)

**Centroid K-Means (3D - one feature from each dataset) - takes hours to run this code**

In [None]:
# CENTROID K-MEANS OF ALL COMBINATIONS OF FEATURES FROM THE HDX, Happy Planet Index and UNICEF DATASETS

alllist = [Indicator,IndicatorGen,IndicatorUnicef1]

# Feature combinations to evaluate: one column of three indices into X per combination.
TestFeatures = TestFeaturesDevSil # happy or gni or region or Dev
n = int(len(TestFeatures[0]))
TestAccuracyMean = np.zeros((1,n))

# Experiment settings.
attribute = 'Dev' #set to 'Region' or 'GNI' or 'Happy' or 'Dev'
N = 1000 # number of times each combination of features is run
t_size = 0.2 # proportion of points in data split into test and training sets
num = 5 # number of different splits of the data into test and training sets

# Labels of every country. `attribute` is set before this call; it used to be
# assigned afterwards, so the lookup ran with the value left by an earlier cell.
RCG = get_regions_and_gni(File,FileUnicef1,r,g,h,d,cl,attribute) # get labels for each country
REGIONS = RCG[0]
COLOUR = RCG[1]
GNI = RCG[2]
HAPPY = RCG[3]
DEV = RCG[4]

# Class names (q) and per-country labels (OUTPUT) follow from `attribute`, so
# the three settings cannot get out of sync.
q, OUTPUT = {
    'Region': (r, REGIONS),
    'GNI': (g, GNI),
    'Happy': (h, HAPPY),
    'Dev': (d, DEV),
}[attribute]

scaler = StandardScaler()
TestAccuracyMeanList = np.zeros((N,n))
k_precision = np.zeros((n,len(q)))
k_recall = np.zeros((n,len(q)))
k_f1 = np.zeros((n,len(q)))
for c in range(0,N):
    print(c)
    for b in range(0,n):
        # Columns of X used by combination b, standardised together.
        Index1 = int(TestFeatures[0,b])
        Index2 = int(TestFeatures[1,b])
        Index3 = int(TestFeatures[2,b])
        InputMatrix = np.column_stack([X[:,Index1],X[:,Index2],X[:,Index3]])
        InputMatrix = scaler.fit_transform(InputMatrix)
        xvalues = InputMatrix[:,0]
        yvalues = InputMatrix[:,1]
        zvalues = InputMatrix[:,2]
        # Run c uses c as the K-means seed; the call fills TestAccuracyMean[0,b] in place.
        TestMean,km_results = K_means_test_accuracy(num,t_size,q,attribute,OUTPUT,xvalues,yvalues,zvalues,TestAccuracyMean,b,c)
        for v in range(len(q)):
            k_precision[b][v] = k_precision[b][v] + km_results[0][0][v]
            k_recall[b][v] = k_recall[b][v] + km_results[1][0][v]
            k_f1[b][v] = k_f1[b][v] + km_results[2][0][v]
    TestAccuracyMeanList[c,:] = TestAccuracyMean[0]

print('Mean Statistics:')
print(np.divide(k_precision, float(N)))
print(np.divide(k_recall, float(N)))
print(np.divide(k_f1, float(N)))

# mean and standard deviation test accuracy for each combination
FinalAccuracyMean= np.mean(TestAccuracyMeanList, axis=0) # mean accuracy of each combination of features
FinalAccuracyStd = np.std(TestAccuracyMeanList, axis=0) # standard dev of test accuracy of each combination of features
print(FinalAccuracyMean)
print(FinalAccuracyStd)

# combination that produces best results:
MAX = np.max(FinalAccuracyMean)
Maxindex = np.argmax(FinalAccuracyMean)
STD = FinalAccuracyStd[int(Maxindex)]
print("\033[1m" + "Features that produce best results using K-Means clustering:"+ "\033[0;0m" + "\n")
print(IndicatorList[int(TestFeatures[0,Maxindex])] + "\n")
print(IndicatorList[int(TestFeatures[1,Maxindex])] + "\n")
print(IndicatorList[int(TestFeatures[2,Maxindex])] + "\n")
print("Mean Test Accuracy --> " + str(MAX) + "\n")
print("std Test Accuracy --> " + str(STD))

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**DOES GENDER DATA EXHIBIT CLUSTERING RELATIONSHIP WITH OTHER DATA? CAN COUNTRIES WITH SIMILAR DEVELOPMENT/HAPPINESS BE GROUPED AND THUS DATA USED TO PREDICT WHAT DEVELOPMENT/HAPPINESS STAGE A COUNTRY IS AT?**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Top 3 combinations of features that produce the best results using Centroid K-Means clustering (GNI):**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Best:** Literacy rate, adult gender difference (%) (6),
Financial inclusion (%) - male (31),
Percentage of women (aged 15-49 years) who consider a husband to be justified in hitting or beating his wife for at least one of the specified reasons (51)

**(Mean Test Accuracy --> 60.81944444444371%,                                 Standard Deviation Test Accuracy --> 1.69960635, Max test accuracy --> 63.88888889%)**

**precision --> [L:0.62246889 UM:0.54792857 LM:0.33843611 H:0.72846946]**
 
**recall --> [L:0.68813899 UM:0.51928673 LM:0.40885298 H:0.68754194]**

**f1 --> [L:0.65365868 UM:0.5332233 LM:0.37032686  H:0.70741422]**
 
**Second:** Mean performance on the science scale. Female (18),
Financial inclusion (%) - female (32),
Percentage of women (aged 15-49 years) who consider a husband to be justified in hitting or beating his wife for at least one of the specified reasons (51)

**(Mean Test Accuracy --> 60.13277778%,                                 Standard Deviation Test Accuracy --> 2.28237199, Max test accuracy --> 66.11111111%)**

**precision --> [L:0.7444     UM:0.52901714 LM:0.48178333 H:0.70979245]**

**recall --> [L:0.61710646 UM:0.55331693 LM:0.394376   H:0.8180461 ]**

**f1 --> [L:0.61710646 UM:0.54089425 LM:0.43371970 H:0.76008416]**

**Third:** Mean performance on the science scale. Male (20),
Financial inclusion (%) - female (32),
Percentage of women (aged 15-49 years) who consider a husband to be justified in hitting or beating his wife for at least one of the specified reasons (51)

**(Mean Test Accuracy --> 59.67%,                                 Standard Deviation Test Accuracy --> 2.07000656, Max test accuracy --> 64.44444444%)**

**precision --> [L:0.747375   UM:0.53228857 LM:0.48140833 H:0.69003071]**

**recall --> [L:0.61772514 UM:0.54093311 LM:0.38289172 H:0.83437285]**

**f1 --> [L:0.67639334 UM:0.53657602 LM:0.42653535 H:0.75536807]**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Top 3 combinations of features that produce the best results using Centroid K-Means clustering (Development):**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Best:** Mean performance on the mathematics scale. Male (14),
Financial inclusion (%) - male (31),
Financial inclusion (%) - female (32)

**(Mean Test Accuracy --> 67.61888889%,                                 Standard Deviation Test Accuracy --> 2.44319438, Max test accuracy --> 71.11111111%)**

**precision --> [DEVELOPING:0.67481491 TRANSITION:0.50947143 DEVELOPED:0.79826097]**

**recall --> [DEVELOPING:0.85678743 TRANSITION:0.32540213 DEVELOPED:0.68999161]**

**f1 --> [DEVELOPING:0.75499092 TRANSITION:0.39714537 DEVELOPED:0.74018802]**
 
**Second:** Mean performance on the reading scale. Male (17),
Financial inclusion (%) - male (31),
Financial inclusion (%) - female (32)

**(Mean Test Accuracy --> 66.205%,                                 Standard Deviation Test Accuracy --> 2.94509631, Max test accuracy --> 70.55555556%)**

**precision --> [DEVELOPING:0.6523092  TRANSITION:0.4524381  DEVELOPED:0.82264695]**

**recall --> [DEVELOPING:0.86588576 TRANSITION:0.30246393 DEVELOPED:0.65607428]**

**f1 --> [DEVELOPING:0.74407472  TRANSITION:0.36255355 DEVELOPED:0.72997870]**

**Third:** Mean performance on the reading scale. Female (15),
Financial inclusion (%) - male (31),
Financial inclusion (%) - female (32)

**(Mean Test Accuracy --> 64.67388889%,                                 Standard Deviation Test Accuracy --> 3.20668818, Max test accuracy --> 71.66666667%)**

**precision --> [DEVELOPING:0.60243168 TRANSITION:0.48552476 DEVELOPED:0.85675688]**

**recall --> [DEVELOPING:0.90177997 TRANSITION:0.30133487 DEVELOPED:0.63753933]**

**f1 --> [DEVELOPING:0.72231965 TRANSITION:0.37187202 DEVELOPED:0.73106818]**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Top 3 combinations of features that produce the best results using Centroid K-Means clustering (Development without Transition):**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Best:** Mean performance on the mathematics scale. Female (12),
Share of female judges (29),
Government Expenditure on Health as % of GDP (41)

**(Mean Test Accuracy --> 86.218%,                                 Standard Deviation Test Accuracy --> 0.375, Max test accuracy --> 87.222%)**

**precision --> [DEVELOPED:0.92817668 DEVELOPING:0.84491398]**

**recall --> [DEVELOPED:0.64592345 DEVELOPING:0.9670228 ]**

**f1 --> [DEVELOPED:0.76874453 DEVELOPING:0.90185385]**
 
**Second:** Mean performance on the mathematics scale. Male (14),
Share of female police officers (30),
Government Expenditure on Health as % of GDP (41) 

**(Mean Test Accuracy --> 85.624%,                                 Standard Deviation Test Accuracy --> 0.270, Max test accuracy --> 86.527%)**

**precision --> [DEVELOPED:0.9061241  DEVELOPING:0.83881731]**

**recall --> [DEVELOPED:0.63229579 DEVELOPING:0.96622098]**

**f1 --> [DEVELOPED:0.74484015 DEVELOPING:0.89802292]**

**Third:** Mean performance on the mathematics scale. Male (14),
Share of female judges (29),
Government Expenditure on Health as % of GDP (41)

**(Mean Test Accuracy --> 85.557%,                                 Standard Deviation Test Accuracy --> 0.287, Max test accuracy --> 86.250%)**

**precision --> [DEVELOPED:0.92777328 DEVELOPING:0.83645219]**

**recall --> [DEVELOPED:0.63493852 DEVELOPING:0.96650218]**

**f1 --> [DEVELOPED:0.75391891 DEVELOPING:0.89678682]**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Top 3 combinations of features that produce the best results using Centroid K-Means clustering (Happiness):**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Best:** Youth illiterate population, 15-24 years, % male (22),
Government Expenditure on Education as % of GDP (42),
Adolescent birth rate (number of live births to adolescent women per 1,000 adolescent women) (44)

**(Mean Test Accuracy --> 45.389%,                                 Standard Deviation Test Accuracy --> 3.248, Max test accuracy --> 61.11111111%)**

**precision --> [R:0.53233103 G:0.39365865 O:0.54271172 DO:0.54594553 Y:0.54771333]**

**recall --> [R:0.71379628 G:0.40768622 O:0.22188818 DO:0.27159761 Y:0.61009079]**

**f1 --> [R:0.60985086 G:0.40054965 O:0.31499171 DO:0.36273927 Y:0.57722174]**

**Second:** Adult illiterate population, 15+ years, % female (0),
Total labour force participation rate (%) - male (37),
Demand for family planning satisfied with modern methods (%) - Women aged 15-49 (39)

**(Mean Test Accuracy --> 45.200%,                                 Standard Deviation Test Accuracy --> 2.645, Max test accuracy --> 60.97222222%)**

**precision --> [R:0.4610612  G:0.45272023 O:0.5084938  DO:0.53889022 Y:0.53336723]**

**recall --> [R:0.57822793 G:0.26682704 O:0.30698016 DO:0.4448239  Y:0.64341243]**

**f1 --> [R:0.51304003 G:0.33576112 O:0.38283873 DO:0.48735957 Y:0.58324445]**

**Third:** Youth illiterate population, 15-24 years, gender difference (%) (23),
Government Expenditure on Education as % of GDP (42),
Adolescent birth rate (number of live births to adolescent women per 1,000 adolescent women) (44)

**(Mean Test Accuracy --> 45.344%,                                 Standard Deviation Test Accuracy --> 3.381, Max test accuracy --> 60.27777778%)**

**precision --> [R:0.53048149 G:0.39070005 O:0.53000936 DO:0.55200481 Y:0.54489156]**

**recall --> [R:0.71161373 G:0.40683947 O:0.2205389  DO:0.2721589  Y:0.6097826 ]**

**f1 --> [R:0.60784053 G:0.39860645 O:0.31147279  DO:0.36457082 Y:0.57551368]**

**K-Nearest Neighbours**

In [None]:
# K-nearest-neighbours sweep: for every 3-feature combination listed in
# TestFeatures, cross-validate a KNN classifier for k = 1..49 and record the k
# that gives the highest mean test accuracy.

# Per-combination results (one entry per column of TestFeatures).
K_num_list = []   # number of neighbours giving the highest mean test accuracy
K_mean_list = []  # mean test accuracy at that number of neighbours
K_std_list = []   # standard deviation of the test accuracy at that number of neighbours
q = d # set q as either g (for GNI) or r (for Region) or h (for Happy) or d (for dev)
OUTPUT = DEV # set as either GNI or REGIONS or HAPPY or DEV
OUTPUT_LABEL = "DEV" # name printed in the summary; keep in sync with OUTPUT
num = 5 # number of different splits of the data into test and training sets

# Rows 0..2 of TestFeatures hold the column indices (into X) of the three
# features of each combination; each column w is one combination.
TestFeatures = TestFeaturesDevSil
n = int(len(TestFeatures[0]))
k = list(range(1,50))  # candidate numbers of neighbours
kf = StratifiedKFold(n_splits=num, shuffle=False)

for w in range(0,n):
    print(w)
    # build the 3-column input matrix for this feature combination
    xval = X[:,int(TestFeatures[0,w])]
    yval = X[:,int(TestFeatures[1,w])]
    zval = X[:,int(TestFeatures[2,w])]
    X_in = np.column_stack([xval,yval,zval])
    # shuffle=False makes the folds deterministic, so compute them once and
    # reuse them for every k instead of re-splitting inside the inner loops
    folds = list(kf.split(X_in,OUTPUT))
    k_score_mean = []
    k_score_std = []
    F1_mean = []
    # compute mean test accuracy for number of neighbours, k, ranging from 1 to 49
    for i in k:
        k_score = []
        # per-class sums over the folds; km_num counts the folds in which the
        # class obtained a non-zero precision and is used as the divisor below
        km_precision = np.zeros((1,len(q)))
        km_recall = np.zeros((1,len(q)))
        km_f1 = np.zeros((1,len(q)))
        km_num = np.zeros((1,len(q)))
        for train_index, test_index in folds:
            # slice the selected features (X_in), not the full matrix X,
            # otherwise every feature combination gives the same model
            X_train, X_test = X_in[train_index], X_in[test_index]
            Y_train = [OUTPUT[x] for x in train_index]
            Y_test = [OUTPUT[x] for x in test_index]
            neigh = KNeighborsClassifier(n_neighbors=i)
            neigh.fit(X_train, Y_train)
            k_score.append(neigh.score(X_test,Y_test))
            # classification_report expects (y_true, y_pred); passing them the
            # other way round swaps precision and recall
            report = classification_report(Y_test,neigh.predict(X_test),target_names=q,labels=q,output_dict=True)
            for v in range(len(q)):
                km_precision[0,v] = km_precision[0,v] + float(report[q[v]]['precision'])
                km_recall[0,v] = km_recall[0,v] + float(report[q[v]]['recall'])
                km_f1[0,v] = km_f1[0,v] + float(report[q[v]]['f1-score'])
                if float(report[q[v]]['precision'])>0:
                    km_num[0,v] = km_num[0,v] + 1
        # avoid division by zero for classes that never scored in any fold
        # (element-wise, so the counts of the other classes are left intact)
        km_num[km_num == 0] = 1
        # mean over classes of the per-class average F1 score
        F1_mean.append(np.mean(np.divide(km_f1, km_num)))
        k_score_mean.append(mean(k_score))
        k_score_std.append(stdev(k_score))
    best_idx = int(np.argmax(k_score_mean))
    print(len(F1_mean))
    print(np.max(k_score_mean))
    print(np.nanmax(F1_mean))
    K_mean_list.append(max(k_score_mean))
    K_std_list.append(k_score_std[best_idx])
    K_num_list.append(k[best_idx])


print(K_mean_list)
print(K_std_list)
# plot number of neighbours vs mean test accuracy and print the results;
# this summary describes the LAST feature combination evaluated in the loop
if n > 0:
    best_idx = int(np.argmax(k_score_mean))
    best_k = k[best_idx]
    plt.plot(k,k_score_mean)
    print("\033[1m" + "Number of neighbours that produces highest test accuracy (" + OUTPUT_LABEL + "):" + "\033[0;0m")
    print("k = " + str(best_k))
    print("\033[1m" + "Mean test accuracy using k = " + str(best_k) + " (" + OUTPUT_LABEL + "):" + "\033[0;0m")
    print(str(max(k_score_mean)))
    print("\033[1m" + "Standard Deviation test accuracy using k = " + str(best_k) + " (" + OUTPUT_LABEL + "):" + "\033[0;0m")
    print(k_score_std[best_idx])

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**DOES GENDER DATA EXHIBIT CLUSTERING RELATIONSHIP WITH OTHER DATA? CAN COUNTRIES WITH SIMILAR DEVELOPMENT/HAPPINESS BE GROUPED AND THUS DATA USED TO PREDICT WHAT DEVELOPMENT/HAPPINESS STAGE A COUNTRY IS AT?**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Top 3 combinations of features that produce the best results using k-nearest neighbours (GNI):**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Best:** Mean performance on the science scale. Male (20),
Financial inclusion (%) - male (31),
Percentage of women (aged 15-49 years) who consider a husband to be justified in hitting or beating his wife for at least one of the specified reasons (51)

**(Mean Test Accuracy --> 61.056%,                                 Standard Deviation Test Accuracy --> 7.307, k = 27, Max test accuracy --> 86.111%)**

**precision --> [L:0.7385934821241721, UM:0.5928655483405482, LM:0.3659460844710846, H:0.6752121468701577]**

**recall --> [L:0.8385137748689231, UM:0.6314246074187244, LM:0.4357547494172491, H:0.533160606619043]**

**f1 --> [L:0.785388319080233, UM:0.6115378684570987, LM:0.3978110976023557, H:0.5958368670302147]**

**Second:** Mean performance on the mathematics scale. Male (14),
Financial inclusion (%) - male (31),
Percentage of women (aged 15-49 years) who consider a husband to be justified in hitting or beating his wife for at least one of the specified reasons (51)

**(Mean Test Accuracy --> 61.0%,                                 Standard Deviation Test Accuracy --> 7.004, k = 21, Max test accuracy --> 86.111%)**

**precision --> [L:0.733544379266245, UM:0.575518470418471, LM:0.39562351259851236, H:0.6633517406698914]**

**recall --> [L:0.8206679653937408, UM:0.6241186147186146, LM:0.44807382062382034, H:0.539669005470122]**

**f1 --> [L:0.7746642539892349, UM:0.5988340889970175, LM:0.4202183219937176, H:0.5951524532105283]**

**Third:** Mean performance on the mathematics scale. Male (14),
Percentage of women (aged 20-24 years) married or in union before age 15 (49),
Percentage of women (aged 15-49 years) who consider a husband to be justified in hitting or beating his wife for at least one of the specified reasons (51)

**(Mean Test Accuracy --> 60.722%,                                  Standard Deviation Test Accuracy --> 7.153, k = 16, Max test accuracy --> 86.111%)**

**precision --> [L:0.7377874750593408, UM:0.6398099206349204, LM:0.4246830197580197, H:0.5940103056558159]**

**recall --> [L:0.8250444013502857, UM:0.6388209568209557, LM:0.42744247704909477, H:0.5385179359039654]**

**f1 --> [L:0.7789800488104802, UM:0.6393150562682921, LM:0.4260582803970431, H:0.5649045948150304]**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Top 3 combinations of features that produce the best results using k-nearest neighbours (Development):**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Best:** Mean performance on the mathematics scale. Female (12),
Share of female judges (%) (29),
Financial inclusion (%) - female (32)

**(Mean Test Accuracy --> 83.858%,                                 Standard Deviation Test Accuracy --> 5.6543, k = 2, Max test accuracy --> 100%)**

**precision --> [DEVELOPING:0.9059963954123553, TRANSITION:0.1695273809523809, DEVELOPED:0.9227963508713525]**

**recall  --> [DEVELOPING:0.8939461668180482, TRANSITION:0.48723333333333335, DEVELOPED:0.7047506713874373]**

**f1  --> [DEVELOPING:0.899930944269954, TRANSITION:0.2515357240955942, DEVELOPED:0.7991675066049854]**

**Second:** Learning poverty: Share of Male Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%) (4),
Share of female judges (%) (29),
Percentage of women (aged 15-49 years) who consider a husband to be justified in hitting or beating his wife for at least one of the specified reasons (51)

**(Mean Test Accuracy --> 83.489%,                                 Standard Deviation Test Accuracy --> 5.5784, k = 21, Max test accuracy --> 100%)**

**precision --> [DEVELOPING:0.9093857917040175, TRANSITION:0.0, DEVELOPED:0.9631083638583653]**

**recall --> [DEVELOPING:0.889100223576146, TRANSITION:0.0, DEVELOPED:0.7055588641750412]**

**f1  --> [DEVELOPING:0.8991286046725929, TRANSITION:0.0, DEVELOPED:0.8144579480742178]**

**Third:** Mean performance on the mathematics scale. Female (12),
Proportion of women in ministerial level positions (%) (25),
Share of female judges (%) (29)

**(Mean Test Accuracy --> 83.056%,                                 Standard Deviation Test Accuracy --> 5.8246, k = 4, Max test accuracy --> 100%)**

**precision  --> [DEVELOPING:0.9093093392516309, TRANSITION:0.0833130952380953, DEVELOPED:0.8997294566544585]**

**recall  --> [DEVELOPING:0.8696603748436059, TRANSITION:0.2121666666666667, DEVELOPED:0.7356410336051991]**

**f1  --> [DEVELOPING:0.8890430169291013, TRANSITION:0.11964448321199454, DEVELOPED:0.8094531623268311]**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Top 3 combinations of features that produce the best results using k-nearest neighbours (Development without transition):**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Best:** Learning poverty: Share of Male Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%) (4),
Proportion of women in ministerial level positions (%) (25),
Government Expenditure on Health as % of GDP (41)

**(Mean Test Accuracy --> 91.363%,                                 Standard Deviation Test Accuracy --> 4.1692, k = 9, Max test accuracy --> 100%)**

**precision --> [DEVELOPED:0.82124178 DEVELOPING:0.94032703]**

**recall  --> [DEVELOPED:0.79540259 DEVELOPING:0.94941078]**

**f1  --> [DEVELOPED:0.8081156882118858  DEVELOPING:0.9448470727347974]**

**Second:** Mean performance on the mathematics scale. Female (12),
Proportion of women in ministerial level positions (%) (25),
Financial inclusion (%) - female (32)

**(Mean Test Accuracy --> 90.755%,                                 Standard Deviation Test Accuracy --> 4.3742, k = 3, Max test accuracy --> 100%)**

**precision --> [DEVELOPED:0.8203595  DEVELOPING:0.93337754]**

**recall --> [DEVELOPED:0.77687984 DEVELOPING:0.94828162]**

**f1  --> [DEVELOPED:0.7980278736466384 DEVELOPING:0.9407705545384902]**

**Third:** Mean performance on the mathematics scale. Female (12),
Proportion of women in ministerial level positions (%) (25),
Government Expenditure on Health as % of GDP (41)

**(Mean Test Accuracy --> 90.531%,                                 Standard Deviation Test Accuracy --> 4.5679, k = 4, Max test accuracy --> 100%)**

**precision  --> [DEVELOPED:0.89919376 DEVELOPING:0.90836998]**

**recall  --> [DEVELOPED:0.73320163 DEVELOPING:0.96919523]**

**f1  --> [DEVELOPED:0.8077581382018345 DEVELOPING:0.937797363310961]**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Top 3 combinations of features that produce the best results using k-nearest neighbours (Happiness):**

**-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

**Best:** Youth illiterate population, 15-24 years, gender difference (%) (23),
Adolescent birth rate (number of live births to adolescent women per 1,000 adolescent women) (44),
Happy Planet Index (52)

**(Mean Test Accuracy --> 58.850%,                                 Standard Deviation Test Accuracy --> 7.3446, k = 9, Max test accuracy --> 86.111%)**

**precision --> [R:0.70657897 G:0.30307999 O:0.38106061 DO:0.48368878 Y:0.85958478]**

**recall --> [R:0.68806248 G:0.83905386 O:0.58939394 DO:0.70522114 Y:0.58629164]**

**f1 --> [R:0.6971978043733688 G:0.4453075889454838 O:0.46286518890906003 DO:0.5738156391811572 Y:0.6971098821643957]**

**Second:** Adult illiterate population, 15+ years, % female (0),
Adolescent birth rate (number of live births to adolescent women per 1,000 adolescent women) (44),
Happy Planet Index (52)

**(Mean Test Accuracy --> 58.811%,                                 Standard Deviation Test Accuracy --> 7.3453, k = 7, Max test accuracy --> 83.333%)**

**precision --> [R:0.66774055 G:0.31498042 O:0.43751468 DO:0.4707125  Y:0.88066393]**

**recall --> [R:0.66915117 G:0.57572559 O:0.75467385 DO:0.60798379 Y:0.60181621]**

**f1 --> [R:0.6684451157928385 G:0.40718775018245984 O:0.5539071710195333  DO:0.5306138018707287 Y:0.7150150809255432]**

**Third:** Adult illiterate population, 15+ years, % female (0),
Adolescent birth rate (number of live births to adolescent women per 1,000 adolescent women) (44),
Percentage of women (aged 15-49 years) who consider a husband to be justified in hitting or beating his wife for at least one of the specified reasons (51)

**(Mean Test Accuracy --> 58.655%,                                 Standard Deviation Test Accuracy --> 7.4080, k = 6, Max test accuracy --> 83.333%)**

**precision --> [R:0.69728656 G:0.29970238 O:0.42245442 DO:0.5553407  Y:0.84261763]**

**recall --> [R:0.68271083 G:0.62473958 O:0.82927619 DO:0.54864068 Y:0.59916277]**

**f1 --> [R:0.6899217195265055 G:0.4050788413070311  O:0.559755252555915 DO:0.5519703589197782 Y:0.7003356589417294]**