In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import matplotlib.pyplot as plt

In [None]:
# Single seed value shared by both global random number generators below.
SEED = 1
# Seed NumPy's legacy global generator and the stdlib generator; the two
# calls are independent of each other.
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Load the three per-country training tables.
Kenya = pd.read_csv("GeoAI/Agricultural Plastic/Data/Kenya_training.csv")
Spain = pd.read_csv("GeoAI/Agricultural Plastic/Data/Spain_training.csv")
# Rename 'Lat'/'Lon' in the VNM table to the lowercase 'lat'/'lon' names
# that are selected as feature columns further down.
VNM = pd.read_csv("GeoAI/Agricultural Plastic/Data/VNM_training.csv").rename(
    columns={'Lat': 'lat', 'Lon': 'lon'}
)
# Stack the three tables row-wise; each table keeps its own index values.
df = pd.concat([Kenya, Spain, VNM], axis=0)

In [None]:
# Columns used as model inputs: coordinates plus the *_p50 band columns.
feature_names = [
    'lat', 'lon',
    'blue_p50', 'green_p50', 'nir_p50', 'nira_p50',
    're1_p50', 're2_p50', 're3_p50', 'red_p50',
    'swir1_p50', 'swir2_p50',
    'VV_p50', 'VH_p50',
]
X_ = df[feature_names]
# Labels are shifted down by one here; the submission step adds one back.
y_ = df['TARGET'] - 1

In [None]:
# Hold out 10% of the rows for evaluation. SEED is 1, the same value the
# split was originally hard-coded with.
X_train, X_eval, y_train, y_eval = train_test_split(
    X_,
    y_,
    test_size=0.1,
    random_state=SEED,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with the library's default hyper-parameters; fit() returns
# the fitted estimator, so construction and training are chained.
RFR = RandomForestClassifier().fit(X_train, y_train)
# Score on the held-out split, shown as the cell's output.
RFR.score(X_eval, y_eval)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with the library's default hyper-parameters; fit()
# returns the fitted estimator, so construction and training are chained.
GBR = GradientBoostingClassifier().fit(X_train, y_train)
# Score on the held-out split, shown as the cell's output.
GBR.score(X_eval, y_eval)

In [None]:
from catboost import CatBoostClassifier

# CatBoost with default hyper-parameters; silent=True is passed to fit()
# to suppress its training log.
CAT = CatBoostClassifier()
CAT.fit(X_train, y_train, silent=True)
# Score on the held-out split, printed to stdout.
cat_accuracy = CAT.score(X_eval, y_eval)
print(cat_accuracy)

In [None]:
from lightgbm import LGBMClassifier

# LightGBM with default hyper-parameters apart from force_col_wise=True.
LGB = LGBMClassifier(force_col_wise=True)
LGB.fit(X_train, y_train)
# Score on the held-out split, printed to stdout.
lgb_accuracy = LGB.score(X_eval, y_eval)
print(lgb_accuracy)

In [None]:
from xgboost import XGBClassifier

# XGBoost with default hyper-parameters. The held-out split is handed to
# fit() as an evaluation set, with verbose=False.
XGB = XGBClassifier()
eval_pairs = [(X_eval, y_eval)]
XGB.fit(X_train, y_train, eval_set=eval_pairs, verbose=False)
# Score on the held-out split, printed to stdout.
xgb_accuracy = XGB.score(X_eval, y_eval)
print(xgb_accuracy)

## Submission

In [None]:
# Load the per-country tables to predict on. Note that this rebinds the
# names Kenya / Spain / VNM that previously held the training tables.
Kenya = pd.read_csv("GeoAI/Agricultural Plastic/Data/Kenya_testing.csv")
Spain = pd.read_csv("GeoAI/Agricultural Plastic/Data/Spain_validation.csv")
# Rename 'Lon'/'Lat' in the VNM table to the lowercase names used as
# feature columns.
VNM = pd.read_csv("GeoAI/Agricultural Plastic/Data/VNM_testing.csv").rename(
    columns={'Lon': 'lon', 'Lat': 'lat'}
)

# Parallel lists: country[k] is the ID prefix for the rows of files[k].
country = ['Kenya', 'Spain', 'VNM']
files = [Kenya, Spain, VNM]
# Empty frame that the prediction step fills in.
submission = pd.DataFrame()

In [None]:
# Build the submission with a weighted soft vote over the five fitted models.
#
# Fixes relative to the previous version of this step:
#   * the estimators expose predict_proba(), not pred_proba();
#   * the CatBoost term was computed from GBR instead of CAT;
#   * the loop body was split across two cells, so the second cell started
#     with an unexpected indent and only the last country could ever be used;
#   * rounding the (n_rows, n_classes) probability matrix does not yield one
#     label per row -- the label is now the argmax of the averaged matrix;
#   * the weighted sum is normalized by the sum of the weights, not by 5;
#   * the result is rebuilt from scratch, so re-running the cell no longer
#     appends duplicate rows to an existing `submission` frame.

# (model, weight) pairs. The weights are the ones used originally;
# presumably each model's hold-out accuracy -- TODO confirm.
ensemble = [
    (RFR, 0.96),
    (GBR, 0.96),
    (CAT, 0.97),
    (LGB, 0.98),
    (XGB, 0.98),
]
total_weight = sum(weight for _, weight in ensemble)

# Averaging probability columns is only meaningful if every model orders
# its classes the same way.
class_labels = np.asarray(RFR.classes_)
assert all(
    np.array_equal(np.asarray(model.classes_), class_labels)
    for model, _ in ensemble
), "models disagree on class order"

feature_cols = ['lat', 'lon', 'blue_p50', 'green_p50', 'nir_p50', 'nira_p50',
                're1_p50', 're2_p50', 're3_p50', 'red_p50', 'swir1_p50',
                'swir2_p50', 'VV_p50', 'VH_p50']

per_country = []
for name, frame in zip(country, files):
    features = frame[feature_cols]
    # Weighted average of the class-probability matrices.
    proba = sum(
        model.predict_proba(features) * weight for model, weight in ensemble
    ) / total_weight
    # Most probable class per row, mapped back to the training label values.
    pred = class_labels[np.argmax(proba, axis=1)]
    per_country.append(pd.DataFrame({
        # Submission IDs are "<country>_<original ID>".
        "ID": [name + '_' + str(x) for x in frame['ID']],
        # Training labels were TARGET - 1, so add the offset back.
        "TARGET": pred + 1,
    }))

submission = pd.concat(per_country, axis=0)
submission.to_csv('submission_2.csv', index=False)

## Verificação

In [None]:
# Compare the TARGET column of two saved submissions row by row.
consolidado_0 = pd.read_csv('submission_0.csv')
consolidado_2 = pd.read_csv('submission_2.csv')

# Start from submission_0's TARGET column (renamed to 's0'), then attach
# submission_2's TARGET as 's2'; the assignment aligns on the row index.
c = consolidado_0[['TARGET']].rename(columns={'TARGET': 's0'})
c['s2'] = consolidado_2['TARGET']

# True wherever the two submissions disagree.
c['diferente'] = c['s0'] != c['s2']

# Count how many rows differ.
num_dif = c['diferente'].sum()
num_dif

# Fraction of rows that differ (displayed as the cell output).
num_dif / len(c)