In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('NFA 2019 public_data.csv')
df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


In [3]:
df['QScore'].value_counts()

3A    51481
2A    10576
2B    10096
1B       16
1A       16
Name: QScore, dtype: int64

In [4]:
df.isnull().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [5]:
df.shape

(72186, 12)

In [6]:
#for simplicity we will drop the missing values
df=df.dropna()

In [7]:
df.shape

(51713, 12)

In [8]:
df['QScore'].value_counts()

3A    51473
2A      224
1A       16
Name: QScore, dtype: int64

An obvious change in our target variable after removing the missing values is that there
are only three classes left #and from the distribution of the 3 classes, we can see that
there is an obvious imbalance between the classes. #There are methods that can be applied to
handle this imbalance such as oversampling and undersampling.
#Oversampling involves increasing the number of instances in the class with fewer instances
while undersampling #involves reducing the data points in the class with more instances.
#For now, we will convert this to a binary classification problem by combining class '2A'
and '1A'.

In [9]:
df['QScore']=df['QScore'].replace('1A','2A')

In [10]:
df['QScore'].value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [11]:
df_2A=df[df['QScore']=='2A']
df_3A=df[df['QScore']=='3A'].sample(350)

In [12]:
data_df=df_2A.append(df_3A)

In [13]:
data_df

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
1536,Algeria,2016,4,AreaPerCap,2.072989e-01,8.112722e-01,0.048357265,2.258528e-02,2.998367e-02,0.000000e+00,1.119497e+00,2A
1537,Algeria,2016,4,AreaTotHA,8.417600e+06,3.294260e+07,1963600,9.171000e+05,1.217520e+06,0.000000e+00,4.545842e+07,2A
1538,Algeria,2016,4,BiocapPerCap,2.021916e-01,2.636077e-01,0.027166736,7.947991e-03,2.924496e-02,0.000000e+00,5.301590e-01,2A
1539,Algeria,2016,4,BiocapTotGHA,8.210214e+06,1.070408e+07,1103135.245,3.227369e+05,1.187524e+06,0.000000e+00,2.152769e+07,2A
1540,Algeria,2016,4,EFConsPerCap,6.280528e-01,1.810332e-01,0.162800822,1.472910e-02,2.924496e-02,1.391455e+00,2.407316e+00,2A
...,...,...,...,...,...,...,...,...,...,...,...,...
18517,El Salvador,1978,60,EFConsTotGHA,1.426707e+06,8.448342e+05,1511534.046,2.235366e+04,3.120571e+05,1.703008e+06,5.820495e+06,3A
49917,Portugal,1983,174,BiocapTotGHA,2.421672e+06,7.510571e+05,8675585.105,7.848406e+05,1.520129e+05,0.000000e+00,1.278517e+07,3A
67457,"Venezuela, Bolivarian Republic of",2015,236,EFProdTotGHA,3.463971e+06,1.732704e+07,2547600.481,2.128542e+06,9.241772e+05,5.039439e+07,7.678571e+07,3A
372,Afghanistan,1982,2,EFConsPerCap,3.792774e-01,4.080230e-01,0.111879558,8.700000e-05,2.658155e-02,7.858304e-02,1.004432e+00,3A


In [14]:
import sklearn.utils

In [15]:
data_df=sklearn.utils.shuffle(data_df)

In [16]:
data_df

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
1536,Algeria,2016,4,AreaPerCap,2.072989e-01,8.112722e-01,0.048357265,2.258528e-02,2.998367e-02,0.000000e+00,1.119497e+00,2A
59426,Switzerland,1969,211,AreaPerCap,1.135413e-01,1.717734e-01,0.164812729,2.318310e-02,3.764946e-02,0.000000e+00,5.109600e-01,3A
52569,Romania,1980,183,EFProdTotGHA,2.195579e+07,2.394070e+06,11545181.71,6.564297e+05,2.738149e+06,6.794059e+07,1.072302e+08,3A
31554,Kazakhstan,2008,108,BiocapPerCap,1.167128e+00,1.922014e+00,0.247901342,5.780042e-02,3.343834e-02,0.000000e+00,3.428283e+00,3A
51021,Timor-Leste,2016,176,BiocapTotGHA,2.148567e+05,6.888098e+04,538809.031,1.066421e+06,4.894987e+04,0.000000e+00,1.937918e+06,2A
...,...,...,...,...,...,...,...,...,...,...,...,...
65467,Ukraine,2016,230,AreaTotHA,3.267800e+07,8.837000e+06,9678800,1.028120e+07,1.440340e+06,0.000000e+00,6.291534e+07,2A
23965,Ghana,1997,81,EFConsTotGHA,6.497406e+06,8.948022e+05,10144243.14,3.941338e+06,8.214522e+05,3.147551e+06,2.544679e+07,3A
22023,Djibouti,2016,72,EFProdTotGHA,1.562820e+04,2.030450e+05,148983.9107,2.447015e+04,1.949681e+05,2.536166e+05,8.407119e+05,2A
42453,Morocco,2016,143,EFConsTotGHA,2.034400e+07,5.634301e+06,5215279.784,2.980132e+06,9.241300e+05,2.488221e+07,5.998005e+07,2A


In [17]:
data_df=data_df.reset_index(drop=True)

In [18]:
data_df.shape

(590, 12)

In [19]:
data_df['QScore'].value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [20]:
data_df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Algeria,2016,4,AreaPerCap,0.2072989,0.8112722,0.048357265,0.02258528,0.02998367,0.0,1.119497,2A
1,Switzerland,1969,211,AreaPerCap,0.1135413,0.1717734,0.164812729,0.0231831,0.03764946,0.0,0.51096,3A
2,Romania,1980,183,EFProdTotGHA,21955790.0,2394070.0,11545181.71,656429.7,2738149.0,67940587.2,107230200.0,3A
3,Kazakhstan,2008,108,BiocapPerCap,1.167128,1.922014,0.247901342,0.05780042,0.03343834,0.0,3.428283,3A
4,Timor-Leste,2016,176,BiocapTotGHA,214856.7,68880.98,538809.031,1066421.0,48949.87,0.0,1937918.0,2A


In [21]:
data_df.drop(columns=['country','year','country_code'], inplace=True)

In [22]:
data_df.head()

Unnamed: 0,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,AreaPerCap,0.2072989,0.8112722,0.048357265,0.02258528,0.02998367,0.0,1.119497,2A
1,AreaPerCap,0.1135413,0.1717734,0.164812729,0.0231831,0.03764946,0.0,0.51096,3A
2,EFProdTotGHA,21955790.0,2394070.0,11545181.71,656429.7,2738149.0,67940587.2,107230200.0,3A
3,BiocapPerCap,1.167128,1.922014,0.247901342,0.05780042,0.03343834,0.0,3.428283,3A
4,BiocapTotGHA,214856.7,68880.98,538809.031,1066421.0,48949.87,0.0,1937918.0,2A


In [23]:
X=data_df.drop('QScore',axis=1)
y=data_df['QScore']

In [24]:
#split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3 , random_state=0)

In [25]:
y_train.value_counts()

3A    246
2A    167
Name: QScore, dtype: int64

There is still an imbalance in the class distribution. For this, we use SMOTE only on the


training data to handle this.

In [26]:
X_train.head()

Unnamed: 0,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total
285,EFProdPerCap,0.035724,0.465888,0.443201339,0.003682909,0.023631,0.01736005,0.9894872
113,BiocapTotGHA,388536.6242,3673.652281,1221099.829,665265.0,146756.4949,0.0,2425332.0
18,EFProdTotGHA,103564.4246,7070.478866,867208.3107,62625.41,27456.31084,1661044.0,2728969.0
76,AreaTotHA,6000.0,50000.0,0.0,3017600.0,10474.30038,0.0,3084074.0
206,EFConsTotGHA,436824.8463,234720.4956,222127.9262,43060.74,62943.44893,7069367.0,8069045.0


In [27]:
#Encode Categorical Variable
from sklearn.preprocessing import LabelEncoder

In [28]:
label=LabelEncoder()

In [29]:
X_train['record']=label.fit_transform(X_train['record'])

In [30]:
X_train.head()

Unnamed: 0,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total
285,6,0.035724,0.465888,0.443201339,0.003682909,0.023631,0.01736005,0.9894872
113,3,388536.6242,3673.652281,1221099.829,665265.0,146756.4949,0.0,2425332.0
18,7,103564.4246,7070.478866,867208.3107,62625.41,27456.31084,1661044.0,2728969.0
76,1,6000.0,50000.0,0.0,3017600.0,10474.30038,0.0,3084074.0
206,5,436824.8463,234720.4956,222127.9262,43060.74,62943.44893,7069367.0,8069045.0


In [31]:
X_test['record']=label.transform(X_test['record'])

In [32]:
X_test.head()

Unnamed: 0,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total
225,6,0.24579,0.00266,0.137422988,0.194233,0.050238,0.253608,0.8839511
14,3,949409.6353,829583.4632,285609.1338,46478.27422,161324.4177,0.0,2272405.0
85,0,0.141464,0.768372,0.439728993,0.011913,0.039568,0.0,1.401045
418,0,0.147361,0.085142,1.274181543,0.844706,0.039067,0.0,2.390458
132,3,103564.4246,109479.5858,2180095.181,753434.6272,27456.31084,0.0,3174030.0


In [33]:
import imblearn

In [34]:
from imblearn.over_sampling import SMOTE

In [35]:
smote=SMOTE(random_state=1)

In [36]:
X_train_balance,y_balance=smote.fit_resample(X_train, y_train)

In [38]:
y_balance.value_counts()

2A    246
3A    246
Name: QScore, dtype: int64

In [40]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(X_train_balance.drop(columns=[ 'record' ]))
normalised_train_df = pd.DataFrame(normalised_train_df, columns=X_train_balance.drop(columns=[ 'record' ]).columns)
normalised_train_df[ 'record' ] = X_train_balance[ 'record']

In [41]:
normalised_train_df.head()

Unnamed: 0,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,record
0,9.117221e-12,4.504243e-10,2.247636e-10,6.03365e-12,5.614787e-11,1.392559e-12,4.115255e-11,6
1,0.0001054224,3.551721e-06,0.0006192644,0.001089893,0.0003486938,0.0,0.0001201297,3
2,2.810033e-05,6.835804e-06,0.0004397931,0.0001025982,6.523627e-05,0.0001332428,0.0001351692,7
3,1.627991e-06,4.834046e-05,0.0,0.004943685,2.488697e-05,0.0,0.000152758,1
4,0.0001185245,0.0002269299,0.0001126492,7.054571e-05,0.0001495538,0.0005670784,0.0003996698,5


In [42]:
X_test = X_test.reset_index(drop= True )
normalised_test_df = scaler.transform(X_test.drop(columns=[ 'record' ]))
normalised_test_df = pd.DataFrame(normalised_test_df, columns=X_test.drop(columns=[ 'record' ]).columns)
normalised_test_df[ 'record' ] = X_test[ 'record' ]

In [43]:
normalised_test_df.head()

Unnamed: 0,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,record
0,6.611487e-11,2.571592e-12,6.969223e-11,3.182083e-10,1.193648e-10,2.034346e-11,3.592521e-11,6
1,0.0002576051,0.0008020489,0.0001448429,7.61446e-05,0.0003833072,0.0,0.000112555,3
2,3.780776e-11,7.428688e-10,2.230027e-10,1.951645e-11,9.401318e-11,0.0,6.153752e-11,0
3,3.940786e-11,8.231599e-11,6.461841e-10,1.383868e-09,9.282417e-11,0.0,1.105444e-10,0
4,2.810033e-05,0.0001058459,0.001105606,0.00123434,6.523627e-05,0.0,0.0001572137,3


In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
log_reg=LogisticRegression()

In [47]:
log_reg.fit(normalised_train_df,y_balance)

In [48]:
from sklearn.metrics import confusion_matrix,classification_report,f1_score, accuracy_score,precision_score,recall_score

In [49]:
from sklearn.model_selection import cross_val_score

In [53]:
score=cross_val_score(log_reg,normalised_train_df,y_balance,cv=5, scoring='f1_macro')

In [54]:
score.mean()

0.476354163331952

In [55]:
from sklearn.model_selection import KFold

In [62]:
kf=KFold(n_splits=5)
kf.split(normalised_train_df)

f1_scores =[]
#run for every split
for train_index, test_index in kf.split(normalised_train_df):
    
    x_train, x_test = normalised_train_df.iloc[train_index],normalised_train_df.iloc[test_index]
    yy_train, yy_test = y_balance[train_index],y_balance[test_index]
    model = LogisticRegression().fit(x_train, yy_train)
    #save result to list
    f1_scores.append(f1_score(y_true=yy_test, y_pred=model.predict(x_test),pos_label= '2A' )* 100 )
     

In [63]:
print(f1_scores)

[55.88235294117647, 55.88235294117647, 58.2089552238806, 57.97101449275361, 0.0]


In [65]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits= 5 , shuffle= True , random_state= 1 )
f1_scores = []
#run for every split
for train_index, test_index in skf.split(normalised_train_df, y_balance):
    x_train, x_test = np.array(normalised_train_df)[train_index],np.array(normalised_train_df)[test_index]
    yy_train, yy_test = y_balance[train_index], y_balance[test_index]
    model = LogisticRegression().fit(x_train, yy_train)
    #save result to list
    f1_scores.append(f1_score(y_true=yy_test, y_pred=model.predict(x_test), pos_label= '2A' ))

In [66]:
print(f1_scores)

[0.42, 0.6666666666666666, 0.5420560747663551, 0.43298969072164945, 0.5142857142857142]


In [67]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balance, cv=loo, scoring= 'f1_macro' )
average_score = scores.mean() * 100 


In [68]:
average_score

46.138211382113816

In [71]:
new_predictions=log_reg.predict(normalised_test_df)

In [72]:
new_predictions

array(['3A', '2A', '2A', '2A', '2A', '2A', '2A', '2A', '3A', '2A', '3A',
       '2A', '2A', '2A', '3A', '2A', '2A', '2A', '3A', '3A', '2A', '2A',
       '2A', '2A', '2A', '2A', '2A', '2A', '3A', '3A', '2A', '3A', '2A',
       '3A', '3A', '3A', '2A', '2A', '3A', '2A', '3A', '2A', '2A', '2A',
       '3A', '3A', '3A', '3A', '2A', '2A', '2A', '3A', '3A', '3A', '2A',
       '2A', '2A', '3A', '3A', '3A', '3A', '2A', '2A', '2A', '3A', '2A',
       '2A', '2A', '2A', '2A', '2A', '3A', '3A', '2A', '3A', '3A', '2A',
       '3A', '3A', '2A', '3A', '3A', '2A', '3A', '2A', '3A', '2A', '2A',
       '3A', '3A', '2A', '2A', '2A', '2A', '3A', '2A', '2A', '2A', '3A',
       '2A', '3A', '2A', '2A', '2A', '2A', '2A', '2A', '3A', '2A', '2A',
       '2A', '2A', '2A', '2A', '3A', '2A', '2A', '3A', '3A', '2A', '2A',
       '2A', '3A', '2A', '3A', '2A', '2A', '3A', '2A', '2A', '2A', '2A',
       '2A', '2A', '3A', '3A', '2A', '2A', '3A', '2A', '2A', '2A', '2A',
       '2A', '2A', '3A', '3A', '3A', '2A', '3A', '3

In [75]:
confusion_matrix(y_test,new_predictions)

array([[48, 25],
       [60, 44]], dtype=int64)

In [76]:
accuracy_score(y_test,new_predictions)

0.519774011299435

In [77]:
print(classification_report(y_test,new_predictions))

              precision    recall  f1-score   support

          2A       0.44      0.66      0.53        73
          3A       0.64      0.42      0.51       104

    accuracy                           0.52       177
   macro avg       0.54      0.54      0.52       177
weighted avg       0.56      0.52      0.52       177



In [85]:
precision_score(y_test,new_predictions, pos_label='2A')

0.4444444444444444

In [86]:
recall_score(y_test,new_predictions,pos_label='2A')

0.6575342465753424

In [87]:
f1_score(y_test,new_predictions,pos_label='2A')

0.5303867403314917