In [100]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.utils
import imblearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut, RandomizedSearchCV  
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
%matplotlib inline

ModuleNotFoundError: No module named 'lightgbm'

In [5]:
data_df = pd.read_csv( 'https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae' )

In [10]:
data_df['QScore'].value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [7]:
data_df.dropna(inplace = True)

In [9]:
data_df['QScore']=data_df['QScore'].replace(['1A'], '2A')

In [12]:
data_df2A = data_df[data_df['QScore']=='2A']
data_df3A = data_df[data_df['QScore']=='3A'].sample(350)
data_df = data_df2A.append(data_df3A)

In [15]:
data_df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Georgia,2016,73,AreaTotHA,447057.2,1946943.0,2822400.0,267700.0,139657.0,0.0,5623757.0,2A
1,Hungary,2016,97,EFConsTotGHA,6264528.0,603974.4,3899932.798,203463.8,1559617.0,22696890.0,35228410.0,2A
2,Nicaragua,1986,157,EFProdTotGHA,1303620.0,2665690.0,2326240.595,2792.917,124571.3,746658.5,7169574.0,3A
3,Colombia,2015,44,AreaPerCap,0.07453239,0.8515884,1.213006798,0.1629009,0.02833873,0.0,2.330367,3A
4,Canada,1975,33,EFProdTotGHA,53428520.0,12063650.0,82078397.52,10514600.0,804855.1,141010900.0,299901000.0,3A


In [14]:
data_df = sklearn.utils.shuffle(data_df)
data_df = data_df.reset_index(drop = True)
data_df.QScore.value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [50]:
data_df.isnull().sum()

record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Georgia,2016,73,AreaTotHA,4.470572e+05,1.946943e+06,2822400,2.677000e+05,1.396570e+05,0.000000e+00,5.623757e+06,2A
1,Hungary,2016,97,EFConsTotGHA,6.264528e+06,6.039744e+05,3899932.798,2.034638e+05,1.559617e+06,2.269689e+07,3.522841e+07,2A
2,Nicaragua,1986,157,EFProdTotGHA,1.303620e+06,2.665690e+06,2326240.595,2.792917e+03,1.245713e+05,7.466585e+05,7.169574e+06,3A
3,Colombia,2015,44,AreaPerCap,7.453239e-02,8.515884e-01,1.213006798,1.629009e-01,2.833873e-02,0.000000e+00,2.330367e+00,3A
4,Canada,1975,33,EFProdTotGHA,5.342852e+07,1.206365e+07,82078397.52,1.051460e+07,8.048551e+05,1.410109e+08,2.999010e+08,3A
...,...,...,...,...,...,...,...,...,...,...,...,...
585,Montenegro,2015,273,EFProdPerCap,6.088749e-02,1.100655e-01,0.720174,7.739580e-03,3.727246e-02,1.309918e+00,2.246057e+00,3A
586,Cuba,1995,49,AreaTotHA,4.047929e+06,2.636071e+06,2246500,5.737800e+06,3.256930e+05,0.000000e+00,1.499399e+07,3A
587,Djibouti,2016,72,AreaTotHA,2.000000e+03,1.700000e+06,5600,2.305000e+05,2.495080e+04,0.000000e+00,1.963051e+06,2A
588,Iraq,2016,103,AreaPerCap,1.424632e-01,1.075194e-01,0.022175868,3.456748e-03,3.237784e-02,0.000000e+00,3.079930e-01,2A


In [18]:
data_df.drop(columns = ['country_code', 'country', 'year'], inplace = True)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(data_df.drop(columns = 'QScore'), data_df.QScore, test_size = 0.3, random_state = 0)

In [80]:
y_train.value_counts(), y_test.value_counts()

(3A    251
 2A    162
 Name: QScore, dtype: int64, 3A    99
 2A    78
 Name: QScore, dtype: int64)

In [81]:
encoder = LabelEncoder()
X_train.record = encoder.fit_transform(X_train.record)
X_test.record = encoder.transform(X_test.record)

In [82]:
smote = SMOTE(random_state = 1)
X_train_balanced, y_train_balanced = smote.fit_sample(X_train, y_train)

In [83]:
y_train_balanced.value_counts()

3A    251
2A    251
Name: QScore, dtype: int64

In [84]:
y_test.isnull().sum()

0

In [85]:
y_train.value_counts()

3A    251
2A    162
Name: QScore, dtype: int64

In [86]:
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(X_train_balanced.drop(columns = ['record']))
normalised_train_df = pd.DataFrame(normalised_train_df, columns = X_train_balanced.drop(columns = ['record']).columns)
normalised_train_df['record'] = X_train_balanced['record']
normalised_train_df.head()

Unnamed: 0,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,record
0,1.030634e-09,1.743366e-09,3.516377e-09,1.677543e-09,2.730936e-10,1.897582e-11,5.816242e-10,4
1,2.9226e-10,4.52446e-08,2.521384e-08,1.229333e-09,4.372915e-10,0.0,4.021844e-09,2
2,0.0005401644,0.00517306,0.001195259,0.0002708644,0.0001122876,0.000347676,0.0006638164,5
3,4.769046e-10,3.184285e-09,2.070291e-09,2.090118e-09,1.767847e-10,2.7765e-10,6.231933e-10,6
4,1.328665e-10,5.368281e-10,7.25578e-09,5.88506e-09,1.65081e-10,0.0,7.331758e-10,2


In [87]:
X_test = X_test.reset_index(drop = True)
normalised_test_df = scaler.fit_transform(X_test.drop(columns = ['record']))
normalised_test_df = pd.DataFrame(normalised_test_df, columns = X_test.drop(columns = ['record']).columns)
normalised_test_df['record'] = X_test.record

In [88]:
log_reg = LogisticRegression()
log_reg.fit(normalised_train_df, y_train_balanced)

LogisticRegression()

In [46]:
scores = cross_val_score(log_reg, normalised_train_df, y_train_balanced, cv= 5 , scoring= 'f1_macro' )
scores
#prints

array([0.52779221, 0.55453087, 0.56892231, 0.52616191, 0.50980392])

In [89]:
new_predictions = log_reg.predict(normalised_test_df)
cnf_mat = confusion_matrix(y_true=y_test, y_pred=new_predictions, labels=['2A', '3A'])
cnf_mat

array([[75,  3],
       [82, 17]], dtype=int64)

In [90]:
accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)
print('Accuracy: {}'.format(round(accuracy*100), 2))

Accuracy: 52.0


In [91]:
f1 = f1_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('F1: {}'.format(round(f1*100), 2))

F1: 64.0


In [97]:
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('Precision: {}'.format(round(precision*100), 2)) 

Precision: 48.0


In [94]:
recall = recall_score(y_true=y_test, y_pred=new_predictions, pos_label='3A')
print('Recall: {}'.format(round(recall*100), 2))

Recall: 17.0


In [95]:
17/(17+82)

0.1717171717171717

In [72]:
kf = KFold(n_splits=5)
kf.split(normalised_train_df) 
f1_scores = []
#run for every split
for train_index, test_index in kf.split(normalised_train_df):
    x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
    y_train, y_test = y_train_balanced[train_index], y_train_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
  #save result to list
    f1_scores.append(f1_score(y_test, model.predict(x_test), 
                   pos_label='2A')*100)
f1_scores

[54.687500000000014,
 48.33333333333334,
 55.28455284552846,
 58.823529411764696,
 0.0]

In [75]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
f1_scores = []
#run for every split
for train_index, test_index in skf.split(normalised_train_df, y_train_balanced):
    x_train, x_test = np.array(normalised_train_df)[train_index], np.array(normalised_train_df)[test_index]
    y_train, y_test  = y_train_balanced[train_index], y_train_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
  #save result to list
    f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A'))
f1_scores

[0.6212121212121212,
 0.5283018867924528,
 0.5050505050505051,
 0.5663716814159292,
 0.576]

In [77]:
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_train_balanced, cv=loo, 
                         scoring='f1_macro')
average_score = scores.mean() * 100

In [78]:
average_score

51.39442231075697

In [99]:
dec_tree = DecisionTreeClassifier()
dec_tree.fit(normalised_train_df, y_train_balanced)

DecisionTreeClassifier()