# Utilizing RandomForestClassifier to classify countries with different ESG Risks

In [145]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Read in the source data, focus on a timeframe with fewest null values

In [146]:
frame = pd.read_csv('ESG_Data.csv')
frame = frame.loc[frame.Time == '2010']
frame = frame.drop(['Time'], axis=1).drop(['Literacy rate, adult total (% of people ages 15 and above)'], axis=1)
print(frame.isnull().sum())

Country Name                                                                          0
Adjusted savings: net forest depletion (% of GNI)                                    17
CO2 emissions (metric tons per capita)                                                3
Renewable electricity output (% of total electricity output)                          0
Unemployment, total (% of total labor force) (modeled ILO estimate)                  15
Access to electricity (% of population)                                               2
Ratio of female to male labor force participation rate (%) (modeled ILO estimate)    15
Individuals using the Internet (% of population)                                      5
Government Effectiveness: Estimate                                                    3
dtype: int64


## Read in the countries and their target value (classification

In [147]:
f = open('Countries.list.txt', 'r')
countries_list = [string.rstrip().split() for string in f.readlines()]

## Ensure that no countries are in the data frame that we don't have a target val for, and vice versa

In [149]:
for name in frame['Country Name'].values:
    if not name in [x[0] for x in sorted(countries_list)]:
        frame.drop(frame[frame['Country Name'] == name].index, inplace=True)

In [150]:
countries_list = [pair for pair in sorted(countries_list) if pair[0] in frame['Country Name'].values]

## Add target column to data frame

In [153]:
target = [pair[1] for pair in countries_list]
frame['Target'] = target
frame.dropna(inplace=True)

In [158]:
frame.head()

Unnamed: 0,Country Name,Adjusted savings: net forest depletion (% of GNI),CO2 emissions (metric tons per capita),Renewable electricity output (% of total electricity output),"Unemployment, total (% of total labor force) (modeled ILO estimate)",Access to electricity (% of population),Ratio of female to male labor force participation rate (%) (modeled ILO estimate),Individuals using the Internet (% of population),Government Effectiveness: Estimate,Target
0,Afghanistan,0.206708,0.289988,85.986547,11.52,42.7,19.215139,4.0,-1.454683,VHigh
2,Algeria,0.107828,3.312561,0.380461,9.96,98.877625,20.528571,12.5,-0.479542,Medium
4,Angola,0.0,1.244092,67.957423,9.43,33.393711,96.069474,2.8,-1.117896,High
6,Argentina,0.0,4.606804,28.585456,7.71,98.82,64.832344,45.0,-0.16279,Low
7,Armenia,0.122945,1.463069,39.485441,19.01,99.8,69.412809,25.0,-0.183843,Low


In [164]:
X = frame.drop(['Target', 'Country Name'], axis=1).values
y = frame.Target.values

## Split and standardize

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Train the RFC, and score it

In [172]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier()

RFC.fit(X_train, y_train)

RandomForestClassifier()

In [173]:
RFC.score(X_test, y_test)

0.6923076923076923

## GridSearch to Optimize

In [195]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators' : [10, 50, 100, 300], 
              'max_depth' : [10, 20, 50, 100], 
              'class_weight' : ['balanced', 'balanced_subsample'],
              'max_leaf_nodes' : [50, 60, 70, 80, 100]}

rfc = RandomForestClassifier(n_jobs=-1)

search = GridSearchCV(estimator=rfc, param_grid=param_grid)

In [196]:
search.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1),
             param_grid={'class_weight': ['balanced', 'balanced_subsample'],
                         'max_depth': [10, 20, 50, 100],
                         'max_leaf_nodes': [50, 60, 70, 80, 100],
                         'n_estimators': [10, 50, 100, 300]})

In [197]:
best = search.best_estimator_
print(best.score(X_test, y_test))
print(search.best_params_)

0.717948717948718
{'class_weight': 'balanced', 'max_depth': 100, 'max_leaf_nodes': 60, 'n_estimators': 100}


In [198]:
best.score(X_train, y_train)

1.0

^^^ The high training accuracy but low test accuracy indicates extreme overfitting: I believe it's mostly owing to the small size of the dataset