In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Introduction 
<img src="https://image.cnbcfm.com/api/v1/image/107088080-1657718778529-gettyimages-1236448511-AFP_9PR2JK.jpeg?v=1657718907&w=929&h=523&vtcrop=y" width="400" height="200" />


This notebook analyses the [Amazônia Rainforest Deforestation](https://www.kaggle.com/datasets/diegosilvadefrana/brazilian-deforestation-from-2000-to-2021) dataset, which contains satellite data about deforestation in the Amazônia rainforest. The dataset consists of the following tables.

* Counties.csv

| Column Name | Type | Description |
| --- | --- | --- |
| Nome_Microrregião | string | County Name |
| Código Município Completo | int | County id; its first two digits identify the state in which the county is located |

--------------------------------------------------------------------


* data.csv
| Column Name | Type | Description |
| --- | --- | --- |
| ano | int | Year |
| id_municipio | int | County id; the same id used in the Counties.csv table |
| area | int | Total area measured |
| desmatado | float | Total area deforested |
| incremento | float | Area measured afterwards, i.e. the increment over the previous measurement |
| floresta | float | total forest area |
| nuvem | float | area covered by the clouds |
| nao_observado | float | non-measured area |
| nao_floresta | float | non-forest area  |
| hidrografia | float | hydrographic area |

----------------------------------------------------------
* states.csv

| Column Name | Type | Description |
| --- | --- | --- |
| estados_id | int | State id |
| Estados | string | State name |

# Data Transformation 

In [None]:
# Main measurements table (data.csv).
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/brazilian-deforestation-from-2000-to-2021/data.csv",
)
# County lookup table; this file is read with ';' as the field separator.
mun = pd.read_csv(
    filepath_or_buffer="/kaggle/input/brazilian-deforestation-from-2000-to-2021/Counties.csv",
    sep=";",
)
# State lookup table (state id -> state name).
states = pd.read_csv(
    filepath_or_buffer="/kaggle/input/brazilian-deforestation-from-2000-to-2021/states.csv",
)

#### The first thing we're going to do is add two new columns to the data.csv table: these columns will contain the county name and the state. We'll need the Counties.csv and states.csv tables to build them.

In [None]:
def call(number):
    """Return the rows of ``states`` whose ``estados_id`` matches ``number``.

    Only the first two characters of ``str(number)`` are used: they are
    converted to an integer and compared against ``states["estados_id"]``.
    The result is the filtered ``states`` dataframe (possibly empty).
    """
    state_code = int(str(number)[:2])
    is_match = states["estados_id"] == state_code
    return states[is_match]

In [None]:
def transform(df):
    """Look up the county name and the state name for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id_municipio`` column. Each id is matched against
        ``mun["Código Município Completo"]``; its first two characters are
        matched against ``states["estados_id"]`` (via ``call``).

    Returns
    -------
    tuple of (list, list)
        ``(munic, esta)``: the county names and the state names, in the row
        order of ``df``.

    Raises
    ------
    IndexError
        If an id is not present in ``mun`` or its state prefix is not present
        in ``states`` (the same exception type the row-by-row filtering used
        to raise in that situation).
    """
    # Build the id -> name lookup once instead of re-filtering ``mun`` several
    # times per row. Keeping the first occurrence of each id mirrors the
    # previous ``.index[0]`` behaviour.
    county_names = (
        mun.drop_duplicates(subset="Código Município Completo")
        .set_index("Código Município Completo")["Nome_Microrregião"]
        .to_dict()
    )
    # Two-character state prefix -> state name, filled lazily so ``states`` is
    # filtered once per state rather than twice per row.
    state_names = {}

    munic = []
    esta = []
    # Iterate over the values, not over positional labels:
    # ``df["id_municipio"][i]`` only works with a default 0..n-1 index.
    for county_id in df["id_municipio"].tolist():
        if county_id not in county_names:
            raise IndexError(f"county id {county_id!r} not found in the counties table")

        prefix = str(county_id)[0:2]
        if prefix not in state_names:
            matches = call(county_id)["Estados"]
            if matches.empty:
                raise IndexError(f"no state found for county id {county_id!r}")
            state_names[prefix] = matches.iloc[0]

        munic.append(county_names[county_id])
        esta.append(state_names[prefix])
    return munic, esta

In [None]:
def stats_year(df, nome, Mean):
    """Sum the ``nome`` column per year (``ano``), optionally mean-normalised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"ano"`` and ``nome``.
    nome : str
        Name of the column to aggregate.
    Mean : bool
        If falsy, the raw yearly totals are returned. If truthy, every yearly
        total is divided by the mean of all yearly totals, so the result is a
        ratio relative to the average year.

    Returns
    -------
    tuple of (numpy.ndarray, pandas.Index)
        The yearly values and the matching years, both in ascending year
        order (the order produced by ``groupby``).
    """
    # ``totals`` instead of ``sum``: do not shadow the builtin.
    totals = df[["ano", nome]].groupby("ano")[nome].sum()
    if Mean:
        totals = totals / totals.mean()
    # copy=True hands the caller a fresh array that is independent of ``totals``.
    return totals.to_numpy(copy=True), totals.index

In [None]:
# Resolve county and state names for every row, then attach them to the
# dataset as two new columns.
lista_mun, lista_est = transform(df)
df["municipios"], df["estados"] = lista_mun, lista_est

# Getting Statistical Insights
In this section we'll study the correlation between some variables. We'll also analyze the relationship between deforestation and forest area.

In [None]:
# Correlation heatmap of the measurement columns.
new = df[["ano","area","desmatado","floresta","nuvem","nao_observado","nao_floresta","hidrografia","estados","municipios"]]
# ``estados`` and ``municipios`` hold names, not numbers. Recent pandas
# versions raise when ``corr()`` meets non-numeric columns (older ones
# silently dropped them), so restrict the frame to numeric columns explicitly.
corr = new.select_dtypes(include="number").corr().round(2)
# Hide the upper triangle (including the diagonal): it mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(9, 5))

# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, ax=ax)

As we can see from the correlation matrix above, there is a clear correlation between area and floresta (forest), which is expected: the larger the area, the more forest it can contain. There is also some correlation between area and desmatado (deforestation), which is again expected.

In [None]:
# Yearly totals of deforested area and forest area. With Mean=True,
# stats_year divides each yearly total by the mean of the yearly totals,
# so both curves are ratios around 1.0 and can share one axis.
lista1, anos1 = stats_year(df, nome="desmatado", Mean=True)
lista2, anos2 = stats_year(df, nome="floresta", Mean=True)

plt.figure(figsize=(9, 5))
curvas = (
    (anos1, lista1, "r", "Deforestation Rate of Change"),
    (anos2, lista2, "g", "Forest Area Rate of Change"),
)
for eixo_x, eixo_y, estilo, legenda in curvas:
    plt.plot(list(eixo_x), eixo_y, estilo, label=legenda)
plt.ylabel("Percentage %")
plt.legend()

The line plot shows an inverse relationship (anti-correlation) between deforestation and forest area. However, this relationship becomes unclear from 2005 onwards, because the forest area starts to vary erratically; this may be due to socio-political responses and biased measurements.

# Finding the Counties with the Most Deforestation
Let's now find the counties with the most deforestation. For that we need to create a new dataset with the deforestation values and their respective counties.

In [None]:
# For every year, collect the ten rows with the largest deforested area.
# sorted(): the years must be processed in chronological order, and a bare
# set() gives no ordering guarantee.
anos = sorted(set(df["ano"]))
year = []
desmat = []
munic = []
esta = []
for ano in anos:
    top10 = (
        df[df["ano"] == ano]
        .sort_values(by="desmatado", ascending=False)
        .head(10)
    )
    # One array per year in each list; the source column is paired with the
    # list it feeds so the slicing logic is written only once.
    for destino, coluna in (
        (year, "ano"),
        (desmat, "desmatado"),
        (munic, "municipios"),
        (esta, "estados"),
    ):
        destino.append(np.array(top10[coluna].tolist()))

In [None]:
def _flatten(chunks):
    """Join a list of per-year arrays into one flat array (empty list -> empty array)."""
    # Concatenation also works when some year contributed fewer than ten rows,
    # where ``np.array(chunks).reshape(1, -1)`` would fail on the ragged input.
    return np.concatenate(chunks) if len(chunks) else np.array([])


# Long-format table of the per-year top-10 rows: one row per (year, county).
dic = dict()
dic["ano"] = _flatten(year)
dic["estado"] = _flatten(esta)
dic["desmatado"] = _flatten(desmat)
dic["municipio"] = _flatten(munic)
novo = pd.DataFrame(dic)
novo

In [None]:
# Distinct county names that appear in the top-10 table, in order of first
# appearance.
muns = list(dict.fromkeys(novo["municipio"]))
# For each of those names, the raw (non-normalised) yearly deforestation
# totals computed over the full dataset.
mat = [
    stats_year(df[df["municipios"] == nome_mun], "desmatado", False)[0]
    for nome_mun in muns
]

In [None]:
plt.figure(figsize=(15, 8))
# x values: the years in chronological order. Each series in ``mat`` comes
# from a groupby on "ano" and is therefore in ascending year order, so the
# x axis must be sorted too; a bare set() gives no ordering guarantee.
anos_plot = sorted(set(novo.ano))
# (dx, dy) nudges applied to the text label of specific counties; every other
# label sits exactly at the end of its line. Presumably these two are moved to
# keep the labels readable.
label_offsets = {
    "São Félix do Xingu": (-4, -1000),
    "Arinos": (-4, 0),
}
for nome_mun, serie in zip(muns, mat):
    dx, dy = label_offsets.get(nome_mun, (0, 0))
    plt.plot(anos_plot, serie)
    plt.text(anos_plot[-1] + dx, serie[-1] + dy, str(nome_mun), fontsize=10)
plt.ylabel("Deforestation (Km^2)", fontsize=20)
plt.show()

As shown above, the counties São Félix do Xingu, Altamira and Pindaré are the ones with an upward deforestation trend in recent years. Some others (Paragominas, Arinos, Imperatriz, etc.) have reached a plateau, but none of them has shown a downward trend yet.

# Predicting Future Deforestation
Let's now use a machine learning algorithm to estimate the total deforestation of each state for 2022 and 2023. But first we need to build our training set X from the previous dataset.

In [None]:
# Distinct state names, in order of first appearance in the dataset.
estd = list(Counter(df["estados"]))
mat = []   # one array of yearly deforestation totals per state
ano = []   # year of every (state, year) observation, flattened
est = []   # state of every (state, year) observation, flattened
for estado in estd:
    data = df[df["estados"] == estado]
    valores, anos_estado = stats_year(data, "desmatado", False)
    mat.append(valores)
    # Take the years from the aggregated data itself rather than from a
    # hard-coded range, so ``ano``/``est`` always line up with the values in
    # ``mat`` whatever period the dataset covers.
    ano.extend(anos_estado.tolist())
    est.extend([estado] * len(anos_estado))

In [None]:
# Long-format table: one row per (year, state) with its deforestation total.
# ``d`` keeps the state names as text; only the copy inside ``X`` is encoded.
d = {
    "ano": ano,
    "estado": est,
    "desmatado": np.asarray(mat).reshape(-1),
}

X = pd.DataFrame(d)
# State names in order of first appearance ...
labels = list(dict.fromkeys(X["estado"]))

X["estado"] = LabelEncoder().fit_transform(X["estado"])
# Split the target off; ``pop`` removes the column from X in place.
Y = X.pop("desmatado")
# ... and their integer codes, collected in the same order, so that
# labels[k] corresponds to labels_encod[k].
labels_encod = list(dict.fromkeys(X["estado"]))

## Model
The model I chose is an ensemble of three models: KNeighborsRegressor, SGDRegressor and BaggingRegressor. According to the law of large numbers, combining these three models should give better precision than using any of them separately.

In [None]:
# Standardise both features (year and encoded state). The fitted scaler is
# kept because the prediction inputs must go through the same transformation.
# NOTE: X is rebound to the scaled array, so re-running this cell alone would
# refit the scaler on already-scaled data.
scaler = StandardScaler()
X = scaler.fit_transform(X)

## Fine tune Hyperparameters
Before we train our ensemble, let's tune the hyperparameters of the three models separately using GridSearchCV, and then use VotingRegressor to train the combined model.

In [None]:
# Fixed seed for the stochastic estimators (bagging resamples the data, SGD
# shuffles it and draws a random validation split for early stopping).
# Without it every run trains a different model and reports a different score.
SEED = 42

KN = KNeighborsRegressor()
bag = BaggingRegressor(random_state=SEED)

# Each base model is wrapped in its own 2-fold grid search, so its
# hyperparameter is tuned whenever the ensemble is fitted.
mod = GridSearchCV(estimator=KN, param_grid={'n_neighbors': list(range(1, 11))}, cv=2)

mod2 = GridSearchCV(estimator=bag, param_grid={'n_estimators': [100, 120, 130, 150, 180]}, cv=2)

mod3 = GridSearchCV(
    estimator=SGDRegressor(max_iter=1200, early_stopping=True, random_state=SEED),
    param_grid={'penalty': ["l1", "l2"]},
    cv=2,
)

# The ensemble combines the predictions of the three tuned models.
vot = VotingRegressor(estimators=[("kn", mod), ("bag", mod2), ("est", mod3)])

In [None]:
# Hold out 10% of the rows for evaluation. The seed is written as the integer
# 1: the previous ``random_state=True`` was a boolean used as a seed, which is
# interpreted as that same integer, so the split itself is unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

In [None]:
# Train the ensemble on the training split, then report R^2 on the held-out
# rows (the bare last expression is the cell's displayed output).
vot.fit(X_train, y_train)
y_pred = vot.predict(X_test)
metrics.r2_score(y_test, y_pred)

##### Thus our R^2 is close to 0.89, which isn't bad considering the training set size. Now let's predict the future deforestation of each state for 2022 and 2023. #####

In [None]:
# One [year, encoded state] row per state for each of 2022 and 2023, grouped
# by state, then scaled with the scaler fitted on the training features.
m = [
    [ano_futuro, cod_estado]
    for cod_estado in labels_encod
    for ano_futuro in range(2022, 2024)
]
pred = scaler.transform(m)

In [None]:
# Run the fitted ensemble on the scaled prediction inputs held in ``pred``.
predic = vot.predict(pred)

In [None]:
# Rebuild the historical table from ``d`` and define the x axis for the
# forecast plot: every year from 2000 through 2023.
Df = pd.DataFrame(data=d)
ano = list(range(2000, 2024))

In [None]:
plt.figure(figsize=(15, 8))
for posicao, estado in enumerate(labels):
    # Historical yearly totals of this state ...
    historico = Df.loc[Df["estado"] == estado, "desmatado"].tolist()
    # ... followed by its two predictions, which sit next to each other in
    # ``predic`` (two entries per state, in the order of ``labels``).
    previsoes = [predic[2 * posicao], predic[2 * posicao + 1]]
    plt.plot(ano, historico + previsoes, label=estado)
# Dashed vertical line at 2021; the points to its right are the predictions.
plt.axvline(2021, color="k", linestyle="--")
plt.legend()
plt.xticks(ano, rotation=45)
plt.show()

# Final Conclusions
From the findings shown previously we can conclude that:

1. Amazônia deforestation is currently on an upward trend.
2. The counties São Félix do Xingu, Altamira and Pindaré currently have some of the highest deforestation rates.
3. The states Pará and Mato Grosso currently have some of the highest deforestation rates.