In [33]:
import pandas as pd
import numpy as np
import time as t
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
df = pd.read_csv('final_training_data.csv')

Before we get started with classification, we need to classify HDI. Our current HDI is in continuous form (i.e. HDI for Albania is 0.791). We will create 10 classes for HDI, that will cover 0-.1, .1-0.2, 0.2-0.3 and so on.

In [8]:
hdi_classified = []

for hdi in df['hdi']:
    v = -1
    if hdi < 0.1:
        v = 0
    elif hdi < 0.2:
        v = 1
    elif hdi < 0.3:
        v = 2
    elif hdi < 0.4:
        v = 3
    elif hdi < 0.5:
        v = 4
    elif hdi < 0.6:
        v = 5
    elif hdi < 0.7:
        v = 6
    elif hdi < 0.8:
        v = 7
    elif hdi < 0.9:
        v = 8
    elif hdi <= 1:
        v = 9
    hdi_classified.append(v)

df['HDI Classified'] = hdi_classified

In [35]:
classifiers = [DecisionTreeClassifier(), GradientBoostingClassifier(), RandomForestClassifier()]

# Classification (Supervised Learning)

## Decision Tree

In [36]:
X = df.iloc[:, 1:-2] #independent columns
y = df[['HDI Classified']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=1110)
clf = classifiers[0]
start = t.time()
clf.fit(X_train, y_train.values.ravel())
stop = t.time()
y_pred = clf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average="macro"))
print("Recall: ", recall_score(y_test, y_pred, average="macro"))
print("Time to train: ", stop-start)

Accuracy:  0.7049180327868853
Precision:  0.5812925170068027
Recall:  0.6031865377730039
Time to train:  0.00995182991027832


  _warn_prf(average, modifier, msg_start, len(result))


## Gradient Boosting

In [38]:
clf = classifiers[1]
start = t.time()
clf.fit(X_train, y_train.values.ravel())
stop = t.time()
y_pred = clf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average="macro"))
print("Recall: ", recall_score(y_test, y_pred, average="macro"))
print("Time to train: ", stop-start)

Accuracy:  0.7049180327868853
Precision:  0.5863095238095237
Recall:  0.6031865377730039
Time to train:  0.7360053062438965


  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest

In [39]:
clf = classifiers[2]
start = t.time()
clf.fit(X_train, y_train.values.ravel())
stop = t.time()
y_pred = clf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average="macro"))
print("Recall: ", recall_score(y_test, y_pred, average="macro"))
print("Time to train: ", stop-start)

Accuracy:  0.7377049180327869
Precision:  0.6467120181405897
Recall:  0.5999283924095954
Time to train:  0.17653203010559082


  _warn_prf(average, modifier, msg_start, len(result))
