In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor

In [2]:
df = pd.read_csv('crop_yield.csv')
df.head()

Unnamed: 0,Crop,Precipitation,SpecificHumidity_g_kg,RelativeHumidity_%,Temperature_C,Yield
0,"Cocoa, beans",2248.92,17.72,83.4,26.01,11560
1,"Cocoa, beans",1938.42,17.54,82.11,26.11,11253
2,"Cocoa, beans",2301.54,17.81,82.79,26.24,9456
3,"Cocoa, beans",2592.35,17.61,85.07,25.56,9321
4,"Cocoa, beans",2344.72,17.61,84.12,25.76,8800


### Pré-Processamento dos dados

Como já realizamos a análise exploratória dos dados, podemos partir para a preparação dos dados para o algoritmo de Machine Learning.

**Tratamento dos dados categóricos**

O parâmetro *drop-first=True* serve para evitar multicolinearidade entre os dados, ou seja, variáveis que se correlacionam entre si.

In [3]:
df = pd.get_dummies(df, columns=['Crop'], drop_first=True)
df.head()

Unnamed: 0,Precipitation,SpecificHumidity_g_kg,RelativeHumidity_%,Temperature_C,Yield,Crop_Oil palm fruit,"Crop_Rice, paddy","Crop_Rubber, natural"
0,2248.92,17.72,83.4,26.01,11560,False,False,False
1,1938.42,17.54,82.11,26.11,11253,False,False,False
2,2301.54,17.81,82.79,26.24,9456,False,False,False
3,2592.35,17.61,85.07,25.56,9321,False,False,False
4,2344.72,17.61,84.12,25.76,8800,False,False,False


**Separando variáveis independentes (X) da variável dependente (y)**

In [4]:
X = df[['Crop_Oil palm fruit', 'Crop_Rice, paddy']]
y = df['Yield']

**Separação dos dados em treino e teste**

Método HoldOut, porém também faremos validação cruzada.

In [5]:
SEED = 80

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

**Normalização**

In [6]:
normalization = MinMaxScaler()

X_train = normalization.fit_transform(X_train)
X_test = normalization.transform(X_test)

### Treinamento com vários modelos

In [7]:
def view_scores(y_true, y_pred, title):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(title)
    print(f'MSE: {mse}')
    print(f'R² Score: {r2*100:.2f}')

#### Modelo Baseline

In [9]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

model_lr_predict = model_lr.predict(X_test)
view_scores(y_test, model_lr_predict, 'Modelo Baseline (Logistic Regression)')

Modelo Baseline (Logistic Regression)
MSE: 157739757.5
R² Score: 97.61
