In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv('NFA 2019 public_data.csv')
df.head()

  df = pd.read_csv('NFA 2019 public_data.csv')


Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


In [22]:
df.shape

(72186, 12)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72186 entries, 0 to 72185
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         72186 non-null  object 
 1   year            72186 non-null  int64  
 2   country_code    72186 non-null  int64  
 3   record          72186 non-null  object 
 4   crop_land       51714 non-null  float64
 5   grazing_land    51714 non-null  float64
 6   forest_land     51714 non-null  object 
 7   fishing_ground  51713 non-null  float64
 8   built_up_land   51713 non-null  float64
 9   carbon          51713 non-null  float64
 10  total           72177 non-null  float64
 11  QScore          72185 non-null  object 
dtypes: float64(6), int64(2), object(4)
memory usage: 6.6+ MB


In [24]:
df.isnull().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [25]:
df.isna().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [26]:
#check distribution of target variable
df['QScore'].value_counts()

QScore
3A    51481
2A    10576
2B    10096
1A       16
1B       16
Name: count, dtype: int64

In [23]:
#for simplicity, we will drop the rows with missing values.
df = df.dropna()
df.isna().sum()

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

In [None]:
#An obvious change in our target variable after removing the missing values is that there are only three classes left #and from the distribution of the 3 classes, we can see that there is an obvious imbalance between the classes. #There are methods that can be applied to handle this imbalance such as oversampling and undersampling.
#Oversampling involves increasing the number of instances in the class with fewer instances while undersampling #involves reducing the data points in the class with more instances.
#For now, we will convert this to a binary classification problem by combining class '2A' and '1A'.

In [25]:
df['QScore'] = df['QScore'].replace(['1A'], '2A')
df.QScore.value_counts()

QScore
3A    51473
2A      240
Name: count, dtype: int64

In [27]:
df_2A = df[df.QScore=='2A']
df_3A = df[df.QScore=='3A'].sample(350)
data_df = pd.concat([df_2A, df_3A], ignore_index=True)

In [29]:
import sklearn.utils
data_df = sklearn.utils.shuffle(data_df)
data_df = data_df.reset_index(drop=True)
data_df.shape
data_df.QScore.value_counts()

QScore
3A    350
2A    240
Name: count, dtype: int64

In [31]:
#more preprocessing
data_df = data_df.drop(columns=['country_code', 'country', 'year'])
X = data_df.drop(columns='QScore')
y = data_df['QScore']

In [39]:
#split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
y_train.value_counts()

QScore
3A    250
2A    163
Name: count, dtype: int64

In [None]:
#There is still an imbalance in the class distribution.
#For this, we use SMOTE only on the training data to handle this.

In [53]:
#encode categorical variable
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
x_train.record = encoder.fit_transform(x_train.record)
x_test.record = encoder.transform(x_test.record)
x_test.record

225    5
14     6
85     1
418    7
132    6
      ..
346    4
369    5
140    3
533    5
171    5
Name: record, Length: 177, dtype: int64

In [59]:
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
x_train_balanced, y_balanced = smote.fit_resample(x_train, y_train)

In [61]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(x_train_balanced.drop(columns=['record']))
normalised_train_df = pd.DataFrame(normalised_train_df, columns=x_train_balanced.drop(columns=['record']).columns)
normalised_train_df['record'] = x_train_balanced['record']

In [67]:
x_test = x_test.reset_index(drop=True)
normalised_test_df = scaler.transform(x_test.drop(columns=['record']))
normalised_test_df = pd.DataFrame(normalised_test_df, columns=x_test.drop(columns=['record']).columns)
normalised_test_df['record'] = x_test['record']

In [69]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(normalised_train_df, y_balanced)

In [75]:
#cross validation
from sklearn.model_selection import cross_val_score
scores=cross_val_score(log_reg, normalised_train_df, y_balanced, cv=5, scoring='f1_macro')
scores

array([0.48003098, 0.50248756, 0.46524064, 0.43227899, 0.48583527])

In [123]:
# K-Fold cross validation
from sklearn.model_selection import KFold
kf=KFold(n_splits=5)
kf.split(normalised_train_df)


#run for every split
for train_index, test_index in kf.split(normalised_train_df):
    x_train, x_test=normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
    y_train, y_test=y_balanced.iloc[train_index], y_balanced.iloc[test_index]

model = LogisticRegression().fit(x_train, y_train)

#save result to list
from sklearn.metrics import f1_score
f1_scores=[]
f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label="2A")*100)
print(f1_scores)

[0.0]


In [None]:
# Stratified KFold Cross Validation
