In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('crop_yield.csv')
df.head()

Unnamed: 0,Crop,Precipitation,SpecificHumidity_g_kg,RelativeHumidity_%,Temperature_C,Yield
0,"Cocoa, beans",2248.92,17.72,83.4,26.01,11560
1,"Cocoa, beans",1938.42,17.54,82.11,26.11,11253
2,"Cocoa, beans",2301.54,17.81,82.79,26.24,9456
3,"Cocoa, beans",2592.35,17.61,85.07,25.56,9321
4,"Cocoa, beans",2344.72,17.61,84.12,25.76,8800


### Pré-Processamento dos dados

Como já realizamos a análise exploratória dos dados, podemos partir para a preparação dos dados para o algoritmo de Machine Learning.

**Tratamento dos dados categóricos**

O parâmetro *drop-first=True* serve para evitar multicolinearidade entre os dados, ou seja, variáveis que se correlacionam entre si.

In [3]:
df = pd.get_dummies(df, columns=['Crop'], drop_first=True)

**Combinação das 2 features de humidade**

In [4]:
df['HumidityCombined'] = df['RelativeHumidity_%'] * df['SpecificHumidity_g_kg'] # Combinação das features

df.drop(['SpecificHumidity_g_kg', 'RelativeHumidity_%'], axis=1, inplace=True) # Excluindo valores de humidade anteriores
df.head()

Unnamed: 0,Precipitation,Temperature_C,Yield,Crop_Oil palm fruit,"Crop_Rice, paddy","Crop_Rubber, natural",HumidityCombined
0,2248.92,26.01,11560,False,False,False,1477.848
1,1938.42,26.11,11253,False,False,False,1440.2094
2,2301.54,26.24,9456,False,False,False,1474.4899
3,2592.35,25.56,9321,False,False,False,1498.0827
4,2344.72,25.76,8800,False,False,False,1481.3532


**Separando variáveis independentes (X) da variável dependente (y)**

In [None]:
X = df[['Crop_Oil palm fruit', 'Crop_Rice, paddy']] # TODO: Acrescentar mais dados
y = df['Yield']

**Separação dos dados em treino e teste**

Método HoldOut, porém também faremos validação cruzada.

In [6]:
SEED = 80

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

**Normalização**

In [7]:
normalization = MinMaxScaler()

X_train = normalization.fit_transform(X_train)
X_test = normalization.transform(X_test)

### Treinamento com vários modelos

In [8]:
def view_scores(y_true, y_pred, title):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(title)
    print(f'MSE: {mse}')
    print(f'RMSE: {np.sqrt(mse)}')
    print(f'R² Score: {r2*100:.2f}')

In [9]:
def cros_validate_view_scores(results): # TODO: Ajustar para problemas de regressão linear com métricas MSE e MRSE
   mse_scores = -results.mean() # Converte para valores positivos
   
   print(f'MSE: {mse_scores}')
   print(f'RMSE: {np.sqrt(mse_scores)}')

In [10]:
kfold = KFold(n_splits=10, shuffle=True, random_state=SEED)

#### Modelo Baseline

In [11]:
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

model_lr_predict = model_lr.predict(X_test)
view_scores(y_test, model_lr_predict, 'Modelo Baseline (Logistic Regression)')

Modelo Baseline (Logistic Regression)
MSE: 114944076.08889213
RMSE: 10721.197511887007
R² Score: 98.26


**Validação cruzada**

OBS: Para problemas de regressão, é necessário utilizar o KFold.

In [12]:
pipeline_lr = Pipeline([
    ('scaler', MinMaxScaler()),
    ('estimator', LinearRegression())
])

results_lr = cross_val_score(pipeline_lr, X, y,
                            cv=kfold,
                            scoring='neg_mean_squared_error')
cros_validate_view_scores(results_lr)

MSE: 65204414.20665129
RMSE: 8074.925027927584


#### Voting Regressor

In [13]:
base_models = [
    ('dt', DecisionTreeRegressor(random_state=SEED)),
    ('rt', RandomForestRegressor(n_estimators=100, random_state=SEED)),
    ('knn', KNeighborsRegressor()),
    ('svc', SVR()),
    ('lr', LinearRegression())
]

model_bg = VotingRegressor(estimators=base_models)
model_bg.fit(X_train, y_train)

model_bg_predict = model_bg.predict(X_test)
view_scores(y_test, model_bg_predict, 'Voting Regressor')

Voting Regressor
MSE: 617446224.7055215
RMSE: 24848.465238431156
R² Score: 90.63


#### Stacking

In [14]:
model_st = StackingRegressor(estimators=base_models,
                             final_estimator=DecisionTreeRegressor())
model_st.fit(X_train, y_train)

model_st_predict = model_st.predict(X_test)
view_scores(y_test, model_st_predict, 'Stacking')

Stacking
MSE: 131583153.44264556
RMSE: 11470.97003058789
R² Score: 98.00


#### XGBoost

In [15]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

model_xgb_predict = model_xgb.predict(X_test)
view_scores(y_test, model_xgb_predict, 'XGBoost Regressor')

XGBoost Regressor
MSE: 114944096.49725339
RMSE: 10721.198463663164
R² Score: 98.26


#### Floresta Aleatória

In [16]:
model_rf = RandomForestRegressor(random_state=SEED)
model_rf.fit(X_train, y_train)

model_rf_predict = model_rf.predict(X_test)
view_scores(y_test, model_rf_predict, 'Floresta Aleatória')

Floresta Aleatória
MSE: 114603676.16290611
RMSE: 10705.310652330745
R² Score: 98.26


#### Bagging Regressor

In [17]:
model_bg = BaggingRegressor(estimator=LinearRegression(), random_state=SEED)
model_bg.fit(X_train, y_train)

model_bg_predict = model_bg.predict(X_test)
view_scores(y_test, model_bg_predict, 'Bagging Regressor')

Bagging Regressor
MSE: 114400311.71535671
RMSE: 10695.80813755355
R² Score: 98.26
