In [16]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    explained_variance_score,
    r2_score,
    mean_squared_error)

# Modelo de regresión lineal sin agregación en H3

El modelo de este notebook utiliza técnicas convencionales de aprendizaje automático para una regresión lineal.

El tratamiento del componente geoespacial, es decir, los elementos de malla hexagonales de H3, se incorpora al modelo utilizando _One Hot Encoding_ de los elementos de la malla.

## Carga de datos

In [2]:
# Data file locations, resolved relative to the parent of the working directory
DATA_DIR = os.path.join(
    os.path.dirname(os.getcwd()),
    'data',
    'datos-produccion-maiz')
DATA_PATH = os.path.join(DATA_DIR, 'agg-maize-panel-rcp2p6.csv')
H3_CATALOGUE = os.path.join(DATA_DIR, '01_h3_cells_catalogue.csv')

# Load the panel data and the H3 cell catalogue
data = pd.read_csv(DATA_PATH)
cat_h3 = pd.read_csv(H3_CATALOGUE)
h3_res = 'hex_3'

# Attach the H3 cell at the chosen resolution to every panel row.
# `DataFrame.join(on='id')` matches the caller's `id` column against the
# *index* of `other`, so the catalogue is indexed by `id` first; joining
# against its default positional index would pair rows by position, not by id.
# `validate='m:1'` makes a duplicated catalogue id fail loudly instead of
# silently multiplying panel rows.
data = (
    data
    .join(
        other=cat_h3.set_index('id')[[h3_res]]
        ,on='id'
        ,how='left'
        ,validate='m:1')
    .drop(columns=['id','lon','lat'])
    .dropna(axis=0))

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 94221 entries, 0 to 94222
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   mean_precip    94221 non-null  float64
 1   mean_precip_2  94221 non-null  float64
 2   mean_temp      94221 non-null  float64
 3   mean_temp_2    94221 non-null  float64
 4   mean_yield     94221 non-null  float64
 5   period         94221 non-null  object 
 6   hex_3          94221 non-null  object 
dtypes: float64(5), object(2)
memory usage: 5.8+ MB


## Partición en Train y Test

In [3]:
# Target variable
target = 'mean_yield'

# Column positions of the predictor arrays produced below:
# 0: mean_precip
# 1: mean_precip_2
# 2: mean_temp
# 3: mean_temp_2
# 4...: hex_3 (one-hot encoded afterwards)
def _split_xy(frame):
    """Return the (predictors, target) NumPy arrays for the rows of `frame`."""
    predictors = frame.drop(columns=['period', target]).to_numpy()
    response = frame[[target]].to_numpy()
    return predictors, response

# The first two periods form the training set, the last period the test set
train_rows = data[data['period'].isin(['2005-2035', '2035-2065'])]
test_rows = data[data['period'] == '2065-2099']

x_train, y_train = _split_xy(train_rows)
x_test, y_test = _split_xy(test_rows)

print(
    f"""Datasets dimensions:
    x_train: {x_train.shape}
    y_train: {y_train.shape}
    x_test: {x_test.shape}
    y_test: {y_test.shape}""")

Datasets dimensions:
    x_train: (62814, 5)
    y_train: (62814, 1)
    x_test: (31407, 5)
    y_test: (31407, 1)


## Ingeniería de características

In [4]:
# Position of the H3 cell column in the predictor arrays built by the split
H3_COL = 4

# This cell overwrites x_train/x_test, so it must only run on the raw
# 5-column arrays; fail with a clear message if it is re-run out of order.
assert x_train.shape[1] == x_test.shape[1] == H3_COL + 1, (
    "x_train/x_test are not in their raw 5-column layout; "
    "re-run the train/test split cell first")

# One-hot encoder for the H3 cells; the first category is dropped and
# categories unseen during fit raise an error at transform time
ohe = OneHotEncoder(
    drop='first',
    handle_unknown='error',
    sparse_output=False)

# Fit on every distinct cell listed in the catalogue
ohe.fit(cat_h3[h3_res].unique().reshape((-1,1)))

# Replace the H3 column with its one-hot columns in both splits.
# The raw arrays mix floats and strings, so they are object-dtype; the numeric
# remainder is cast to float64 so the stacked design matrix is a plain float
# array instead of an object array.
x_train = np.hstack((
    np.delete(arr=x_train, obj=H3_COL, axis=1).astype(np.float64),
    ohe.transform(X=x_train[:,H3_COL].reshape((-1,1))) ))
x_test = np.hstack((
    np.delete(arr=x_test, obj=H3_COL, axis=1).astype(np.float64),
    ohe.transform(X=x_test[:,H3_COL].reshape((-1,1))) ))

# Inspect the resulting dimensions
x_train.shape, x_test.shape

## Modelo de regresión lineal

Entrenamiento de modelo

Tiempo de procesamiento: `6min 48s`

In [10]:
%%time

# Iniciar instancia de modelo
ols = LinearRegression(
    fit_intercept=True,
    n_jobs=-1)

# Entrenamiento
ols.fit(X=x_train, y=y_train)

Coeficiente de determinación para datos de entrenamiento

$R^2 = 0.956334$

In [13]:
# Coefficient of determination of the fitted model on the training split
ols.score(x_train, y_train)

0.9563346525301238

Otras métricas de desempeño dentro del conjunto de entrenamiento

In [17]:
# In-sample predictions (note: `y_pred` is reassigned later for the test split)
y_pred = ols.predict(x_train)

In [20]:
# Performance metrics on the training split.
# NOTE: despite its name, `rmse_train` stores the mean squared error; the
# square root is only taken when the value is printed.
var_score_train = explained_variance_score(y_train, y_pred)
r2_train = r2_score(y_train, y_pred)
rmse_train = mean_squared_error(y_train, y_pred)

print(
    f"""Métricas en train:
    varianza explicada: {100*var_score_train:.2f} %
    coef. determinación: {r2_train:.6f}
    RMSE: {np.sqrt(rmse_train):.6f}""")

Métricas en train:
    varianza explicada: 95.63 %
    coef. determinación: 0.956335
    RMSE: 0.700559


## Desempeño en Test

In [21]:
# Predictions for the test split (overwrites the training predictions in `y_pred`)
y_pred = ols.predict(x_test)

In [22]:
# Performance metrics on the test split (`y_pred` must hold the test predictions).
# NOTE: despite its name, `rmse_test` stores the mean squared error; the
# square root is only taken when the value is printed.
var_score_test = explained_variance_score(y_true=y_test, y_pred=y_pred)
r2_test = r2_score(y_true=y_test, y_pred=y_pred)
rmse_test = mean_squared_error(y_true=y_test, y_pred=y_pred)

# The header previously read "train" although these are the test metrics
print(
    f"""Métricas en test:
    varianza explicada: {100*var_score_test:.2f} %
    coef. determinación: {r2_test:.6f}
    RMSE: {np.sqrt(rmse_test):.6f}""")

Métricas en train:
    varianza explicada: 95.03 %
    coef. determinación: 0.950279
    RMSE: 0.739847


# Normalidad geoespacial de residuales

## Datos expandidos

In [28]:
# Rows of the test period, extended with the model prediction and its residual.
# `y_pred` is expected to hold the test-split predictions at this point.
test_frame = data.loc[data['period'] == '2065-2099']
with_pred = test_frame.assign(mean_yield_pred=y_pred)

# Residual is defined here as prediction minus observed yield
residual_test = with_pred.assign(
    residuals=with_pred['mean_yield_pred'].sub(with_pred['mean_yield']))

residual_test.head()

Unnamed: 0,mean_precip,mean_precip_2,mean_temp,mean_temp_2,mean_yield,period,hex_3,mean_yield_pred,residuals
62816,190.444427,54862.181776,7.80471,112.890992,3.524696,2065-2099,831284fffffffff,3.435467,-0.089229
62817,106.560491,16800.366451,6.571009,102.951142,2.802876,2065-2099,8312a2fffffffff,2.735325,-0.06755
62818,125.62808,23525.656879,6.060377,113.34065,2.55703,2065-2099,83129afffffffff,2.525273,-0.031758
62819,293.91236,118260.797323,10.605312,127.296821,4.206531,2065-2099,831298fffffffff,4.616246,0.409715
62820,285.46642,112295.478936,10.128092,127.814835,4.747389,2065-2099,83129cfffffffff,4.377287,-0.370101
