# <b>Managing the Quality Metric of Global Ecological Footprint<B>

In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("Data_for_UCI_named.csv")
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
# Drop stab

df = df.drop('stab', axis=1)

In [5]:
# Missing values.

df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stabf    0
dtype: int64

In [6]:
# Distribution of stabf.

df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [7]:
# Convert stabf to binary.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['stabf'] = encoder.fit_transform(df['stabf'])

In [8]:
df['stabf'].value_counts()

1    6380
0    3620
Name: stabf, dtype: int64

In [9]:
# Split into dependent and independent variables.

X = df.iloc[:, :-1]
y = df.iloc[:, 12]

X.shape, y.shape

((10000, 12), (10000,))

In [10]:
X.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923


In [11]:
# Using StandardScaler to normalise the dataset,
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
# Table of the scaled data.

scaled_data = pd.DataFrame(data=X_scaled, columns=X.columns)
scaled_data['stabf'] = df['stabf']

scaled_data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,-0.835374,-0.791317,1.141704,1.652103,0.017397,1.079405,-0.017078,-1.092545,0.457467,1.220013,1.321628,1.579026,1
1,1.478297,-0.126705,-0.803111,-1.415043,1.752124,-1.593619,-1.438158,-0.011575,-0.406791,1.230354,0.135424,0.936256,0
2,1.357093,1.31214,-0.803499,-1.471504,-0.458492,0.098253,-0.06284,0.760963,-1.319852,0.881299,1.146596,-1.513802,1
3,-1.653138,0.882289,-0.278354,-1.060901,0.28425,0.513904,-1.591046,0.583414,-0.287304,1.64725,1.474543,-0.59175,1
4,-0.771543,0.860108,-0.11167,1.680114,-0.298075,0.28745,-1.376343,1.606636,0.992226,-0.25361,0.481133,1.079063,1


In [13]:
# Split into X and Y.

X = scaled_data.iloc[:, :-1]
y = scaled_data.iloc[:, 12]

X.shape, y.shape

((10000, 12), (10000,))

In [14]:
# Split into train and test sets.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [15]:
# Shape of train set.

x_train.shape, y_train.shape

((8000, 12), (8000,))

In [16]:
# Shape of test set.

x_test.shape, y_test.shape

((2000, 12), (2000,))

# Modeling

In [17]:
# Random Forest.
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

forest = RandomForestClassifier(random_state=1)
forest.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [18]:
# Prediction.

pred = forest.predict(x_test)

In [20]:
# Evaluation.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Defining a function for evaluation.
def eval(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_pred)
    
    print('Accuracy: {}'.format(round(acc * 100), 2))
    print('Precision: {}'.format(round(pre * 100), 2))
    print('Recall: {}'.format(round(rec * 100), 2))
    print('F1_score: {}'.format(round(f1 * 100), 2))
    print('ROC: {}'.format(round(roc * 100), 2))

In [21]:
# Evaluating the RF model.

eval(y_test, pred)

Accuracy: 89.0
Precision: 93.0
Recall: 90.0
F1_score: 91.0
ROC: 89.0


In [22]:
# Extra Trees.

tree = ExtraTreesClassifier(random_state=1)
tree.fit(x_train, y_train)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [23]:
# Prediction.

pred2 = tree.predict(x_test)

In [24]:
# Evaluating the model.

eval(y_test, pred2)

Accuracy: 89.0
Precision: 92.0
Recall: 91.0
F1_score: 92.0
ROC: 88.0


In [25]:
# Xtreme Gradient Boost
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=1)
xgb.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=1,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [26]:
# Prediction.

pred3 = xgb.predict(x_test)

In [27]:
# Evaluating the model.

eval(y_test, pred3)

Accuracy: 92.0
Precision: 92.0
Recall: 96.0
F1_score: 94.0
ROC: 90.0


In [28]:
# Installing LGBM

!pip install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/1f/cb/a8ec24334c35a7d0c87b4e4e056bd2137573c7c1bd81c760b79a2f370254/lightgbm-2.3.1-py2.py3-none-win_amd64.whl (544kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1


In [29]:
# Light Gradient Boosted Machine
from lightgbm import LGBMClassifier

lit = LGBMClassifier(random_state=1)
lit.fit(x_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [30]:
# Prediction.

pred4 = lit.predict(x_test)

In [31]:
# Evaluating the model.

eval(y_test, pred4)

Accuracy: 95.0
Precision: 95.0
Recall: 97.0
F1_score: 96.0
ROC: 94.0


In [32]:
# Hyperparameter Tuning.
from sklearn.model_selection import RandomizedSearchCV

n_est = [50, 100, 300, 500, 1000]
min_samsplit = [2, 3, 5, 7, 9]
min_samleaf = [1, 2, 4, 6, 8]
max_ft = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {'n_estimators':n_est, 'min_samples_leaf':min_samleaf, 'min_samples_split':min_samsplit,
                       'max_features':max_ft}

In [33]:
rsc = RandomizedSearchCV(estimator=tree, param_distributions=hyperparameter_grid, 
                         cv=5, n_iter=10, scoring='accuracy', n_jobs=-1, verbose=1, random_state=1)
clf = rsc.fit(X, y)
clf

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   50.6s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [50, 100, 300, 500, 1000], 'min_samples_leaf': [1, 2, 4, 6, 8], 'min_samples_split': [2, 3, 5, 7, 9], 'max_features': ['auto', 'sqrt', 'log2', None]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=1)

In [34]:
# Best Score
best_score = clf.best_score_
print('Best score: {}'.format(best_score))

# Best parameters.
best_params = clf.best_params_
print('Best parameters: {}'.format(best_params))

Best score: 0.9275
Best parameters: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}


In [35]:
# Training a new Extra Tree Classifier.

tree = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2, min_samples_leaf=8, 
                            max_features=None, random_state=1)
tree.fit(x_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [36]:
# New Prediction

tree2 = tree.predict(x_test) 

In [37]:
# Feature Importance.

importance = tree.feature_importances_

In [38]:
table = pd.DataFrame(data=importance)
table

Unnamed: 0,0
0,0.13724
1,0.140508
2,0.13468
3,0.135417
4,0.003683
5,0.005337
6,0.005429
7,0.004962
8,0.102562
9,0.107578


In [39]:
table.max()

0    0.140508
dtype: float64

In [40]:
table.min()

0    0.003683
dtype: float64