In [311]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

In [312]:
data = pd.read_csv('dados_nir.csv')
display(data)

Unnamed: 0,carbono_organico,carbono_total,nitrogenio_total,argila,areia,lodo,densidade_aparente,ph_agua,carbonato,potassio,...,2532,2534,2536,2538,2540,2542,2544,2546,2548,2550
0,0.5200,0.51502,0.02649,49.93611,22.9,27.2,,4.81,,0.88137,...,0.138690,0.138212,0.137727,0.137237,0.136740,0.136235,0.135727,0.135212,0.134692,0.134163
1,0.1500,0.15451,0.02046,36.05959,18.6,45.3,1.67131,6.37,,0.30398,...,0.394498,0.393718,0.392915,0.392090,0.391228,0.390332,0.389390,0.388402,0.387355,0.386245
2,0.8956,0.90996,0.09325,21.46390,10.7,67.8,1.17019,7.78,0.11663,0.80226,...,0.507267,0.506657,0.506060,0.505472,0.504900,0.504338,0.503792,0.503257,0.502737,0.502232
3,0.3800,0.37688,0.04956,35.94715,2.1,62.0,1.32571,7.11,,0.44368,...,0.590587,0.589850,0.589120,0.588398,0.587682,0.586973,0.586272,0.585575,0.584888,0.584205
4,0.1800,0.17545,0.03677,54.09450,3.5,42.4,1.22003,7.05,,0.96416,...,0.454657,0.453862,0.453073,0.452287,0.451502,0.450708,0.449912,0.449105,0.448285,0.447450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,1.3900,1.38599,0.15400,29.21183,8.6,62.2,1.45599,6.50,,0.87103,...,0.544790,0.544550,0.544365,0.544230,0.544145,0.544105,0.544115,0.544165,0.544260,0.544400
1972,1.2800,1.28361,0.12422,35.45140,11.4,53.1,,5.44,,0.81826,...,0.419525,0.419150,0.418815,0.418510,0.418235,0.417995,0.417785,0.417600,0.417430,0.417290
1973,1.8400,1.83704,0.19499,24.78274,17.2,58.0,1.18506,6.80,,1.60070,...,0.484470,0.484375,0.484325,0.484315,0.484345,0.484420,0.484535,0.484685,0.484870,0.485095
1974,1.3900,1.38972,0.15441,27.38510,14.7,57.9,1.50977,7.10,,0.98071,...,0.402505,0.402295,0.402130,0.402000,0.401915,0.401865,0.401850,0.401880,0.401940,0.402030


In [313]:
# Preprocessing
for coluna in data.columns:
    if data[coluna].isnull().sum() > 0:
        print(f'Coluna {coluna} tem {data[coluna].isnull().sum()} valores nulos')

Coluna carbono_organico tem 2 valores nulos
Coluna densidade_aparente tem 891 valores nulos
Coluna carbonato tem 1311 valores nulos


In [314]:
data['carbono_organico'] = data['carbono_organico'].fillna(data['carbono_organico'].mean())
data['densidade_aparente'] = data['densidade_aparente'].fillna(data['densidade_aparente'].mean())

In [315]:
data['carbonato'].describe()

count    665.000000
mean       6.139778
std        9.602257
min        0.006560
25%        0.285940
50%        1.908130
75%        8.230870
max       89.030870
Name: carbonato, dtype: float64

In [316]:
# Devido à grande quantidade de valores nulos e altíssima amplitude de valores, a coluna 'carbonato' foi removida
data = data.drop(columns=['carbonato'])

In [317]:
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns) # Mesma coisa que um SNV
display(data_scaled)

Unnamed: 0,carbono_organico,carbono_total,nitrogenio_total,argila,areia,lodo,densidade_aparente,ph_agua,potassio,1350,...,2532,2534,2536,2538,2540,2542,2544,2546,2548,2550
0,-0.560288,-0.639876,-0.786675,2.192281,-0.681641,-0.546280,0.000000,-1.161806,0.437307,-1.781693,...,-2.166210,-2.167962,-2.169890,-2.171962,-2.174174,-2.176519,-2.178938,-2.181440,-2.183989,-2.186594
1,-0.694008,-0.767960,-0.819380,1.184382,-0.834740,0.352421,2.098890,0.067182,-0.346772,1.450560,...,-0.144298,-0.148599,-0.153238,-0.158202,-0.163606,-0.169435,-0.175766,-0.182606,-0.190045,-0.198114
2,-0.424544,-0.499560,-0.424592,0.124248,-1.116014,1.469591,-0.724599,1.177999,0.329878,0.648978,...,0.747025,0.743994,0.740902,0.737699,0.734450,0.731092,0.727661,0.724128,0.720513,0.716816
3,-0.610885,-0.688955,-0.661551,1.176216,-1.422212,1.181609,0.151656,0.650164,-0.157064,1.874119,...,1.405587,1.401502,1.397292,1.392956,1.388461,1.383817,1.379004,1.373998,1.368830,1.363441
4,-0.683165,-0.760521,-0.730920,2.494319,-1.372366,0.208430,-0.443783,0.602895,0.549733,1.768842,...,0.331194,0.326736,0.322169,0.317451,0.312580,0.307474,0.302172,0.296622,0.290797,0.284685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,-0.245866,-0.330434,-0.095106,0.687007,-1.190784,1.191540,0.885700,0.169598,0.423265,0.933072,...,1.043609,1.043479,1.043611,1.043954,1.044502,1.045205,1.046094,1.047083,1.048203,1.049450
1972,-0.285620,-0.366808,-0.256622,1.140208,-1.091091,0.739707,0.000000,-0.665484,0.351605,0.505768,...,0.053513,0.052397,0.051439,0.050559,0.049758,0.049075,0.048469,0.047903,0.047298,0.046776
1973,-0.083234,-0.170182,0.127210,0.365307,-0.884586,0.983002,-0.640816,0.405942,1.414136,0.273566,...,0.566839,0.567894,0.569139,0.570527,0.572056,0.573759,0.575592,0.577512,0.579515,0.581638
1974,-0.245866,-0.329108,-0.092882,0.554325,-0.973597,0.978036,1.188716,0.642286,0.572207,-0.280319,...,-0.081013,-0.080814,-0.080416,-0.079897,-0.079177,-0.078335,-0.077369,-0.076200,-0.074944,-0.073598


In [318]:
x = data_scaled.iloc[:, 9:]
display(x)

Unnamed: 0,1350,1352,1354,1356,1358,1360,1362,1364,1366,1368,...,2532,2534,2536,2538,2540,2542,2544,2546,2548,2550
0,-1.781693,-1.786271,-1.792078,-1.799107,-1.807060,-1.815228,-1.822814,-1.829311,-1.834629,-1.839293,...,-2.166210,-2.167962,-2.169890,-2.171962,-2.174174,-2.176519,-2.178938,-2.181440,-2.183989,-2.186594
1,1.450560,1.441641,1.431983,1.422215,1.412764,1.404124,1.396606,1.390261,1.384773,1.379200,...,-0.144298,-0.148599,-0.153238,-0.158202,-0.163606,-0.169435,-0.175766,-0.182606,-0.190045,-0.198114
2,0.648978,0.653814,0.657928,0.661224,0.663653,0.665580,0.667292,0.669185,0.671601,0.674717,...,0.747025,0.743994,0.740902,0.737699,0.734450,0.731092,0.727661,0.724128,0.720513,0.716816
3,1.874119,1.875644,1.876491,1.876701,1.876466,1.876199,1.876224,1.876749,1.877667,1.878725,...,1.405587,1.401502,1.397292,1.392956,1.388461,1.383817,1.379004,1.373998,1.368830,1.363441
4,1.768842,1.766171,1.762363,1.757535,1.752039,1.746457,1.741213,1.736251,1.731271,1.725436,...,0.331194,0.326736,0.322169,0.317451,0.312580,0.307474,0.302172,0.296622,0.290797,0.284685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,0.933072,0.932844,0.934015,0.936444,0.939679,0.943099,0.946053,0.948153,0.949309,0.949724,...,1.043609,1.043479,1.043611,1.043954,1.044502,1.045205,1.046094,1.047083,1.048203,1.049450
1972,0.505768,0.506979,0.509481,0.512237,0.514282,0.514726,0.513379,0.510530,0.506972,0.503793,...,0.053513,0.052397,0.051439,0.050559,0.049758,0.049075,0.048469,0.047903,0.047298,0.046776
1973,0.273566,0.274757,0.277118,0.280656,0.285010,0.289460,0.293291,0.296010,0.297451,0.297873,...,0.566839,0.567894,0.569139,0.570527,0.572056,0.573759,0.575592,0.577512,0.579515,0.581638
1974,-0.280319,-0.279857,-0.279019,-0.277871,-0.276553,-0.275194,-0.273962,-0.272724,-0.271452,-0.270048,...,-0.081013,-0.080814,-0.080416,-0.079897,-0.079177,-0.078335,-0.077369,-0.076200,-0.074944,-0.073598


In [319]:
y = data_scaled.iloc[:, :9]
display(y)

Unnamed: 0,carbono_organico,carbono_total,nitrogenio_total,argila,areia,lodo,densidade_aparente,ph_agua,potassio
0,-0.560288,-0.639876,-0.786675,2.192281,-0.681641,-0.546280,0.000000,-1.161806,0.437307
1,-0.694008,-0.767960,-0.819380,1.184382,-0.834740,0.352421,2.098890,0.067182,-0.346772
2,-0.424544,-0.499560,-0.424592,0.124248,-1.116014,1.469591,-0.724599,1.177999,0.329878
3,-0.610885,-0.688955,-0.661551,1.176216,-1.422212,1.181609,0.151656,0.650164,-0.157064
4,-0.683165,-0.760521,-0.730920,2.494319,-1.372366,0.208430,-0.443783,0.602895,0.549733
...,...,...,...,...,...,...,...,...,...
1971,-0.245866,-0.330434,-0.095106,0.687007,-1.190784,1.191540,0.885700,0.169598,0.423265
1972,-0.285620,-0.366808,-0.256622,1.140208,-1.091091,0.739707,0.000000,-0.665484,0.351605
1973,-0.083234,-0.170182,0.127210,0.365307,-0.884586,0.983002,-0.640816,0.405942,1.414136
1974,-0.245866,-0.329108,-0.092882,0.554325,-0.973597,0.978036,1.188716,0.642286,0.572207


In [320]:
variancia_explicada = 0
num_componentes = 0

while variancia_explicada < 0.999999:
    num_componentes += 1
    Pca = PCA(n_components=num_componentes, random_state=42)
    Pca.fit(x, y)
    variancia_explicada = sum(Pca.explained_variance_ratio_)
    
x = pd.DataFrame(Pca.transform(x))

print(f'Número de componentes principais: {num_componentes}')
print(f'Variância explicada: {variancia_explicada}')

Número de componentes principais: 37
Variância explicada: 0.9999990364500078


In [321]:
display(x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,-52.140600,-2.349065,1.739515,0.551455,-0.469251,-0.383462,-0.472800,0.134597,0.024432,-0.266204,...,0.007798,-0.021575,0.024915,0.047033,0.010464,-0.001328,0.000003,-0.000529,-0.003603,0.000346
1,21.395444,-12.007842,1.293277,2.919085,-0.054140,-1.699247,0.040907,-0.773450,-0.139886,-0.089241,...,-0.027049,0.016408,0.012258,-0.037085,0.012612,0.018866,0.010425,-0.001925,0.007901,-0.001726
2,23.194669,0.266656,-3.417727,-0.464179,-0.263919,-0.044051,-0.126278,-0.175132,0.093893,0.050268,...,-0.000909,0.019683,0.006598,-0.004072,0.000576,0.006579,0.009767,0.008619,-0.004869,0.003148
3,43.423628,-2.134162,-0.760617,-0.639729,-1.075625,-0.200045,-0.356512,-0.026994,0.171909,0.010964,...,-0.010640,0.000862,-0.013235,0.004573,0.008183,0.001139,-0.005489,0.004519,-0.010047,0.002817
4,30.004477,-9.221446,-0.953964,-0.815489,-1.548030,-0.583379,-0.160463,-0.131603,0.054381,0.013891,...,-0.010664,0.016975,-0.004886,0.011324,0.005148,-0.004006,-0.009406,-0.004389,-0.006460,0.001736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,26.576706,1.034465,-1.115663,-0.229946,-0.093411,-0.162910,-0.097657,-0.095668,0.074882,0.049412,...,-0.006106,0.006652,-0.013703,0.021860,0.005397,-0.004459,0.005333,0.001337,0.002087,0.004835
1972,9.007369,-3.074298,-0.524513,-0.364168,-0.575661,-0.170925,-0.113006,-0.102870,0.081861,0.025053,...,-0.002243,0.005612,-0.015023,0.008120,0.000526,-0.015237,0.000733,0.002407,0.012199,-0.001032
1973,12.646861,1.691644,-1.816820,-0.365278,0.160449,-0.033995,0.016354,-0.133282,-0.052650,0.066236,...,-0.004479,0.011864,-0.008018,0.013079,-0.000009,-0.000976,0.003440,0.007281,0.005121,-0.000351
1974,-3.074252,1.109029,-1.494544,-0.362437,-0.038650,-0.191830,-0.100419,-0.061350,-0.045879,0.050614,...,-0.013191,0.006098,-0.007692,0.005747,-0.001640,-0.003833,-0.000109,0.008519,0.004250,-0.001545


In [322]:
# Transformando em tensores para poder utilizar na NN
x_tensor = torch.tensor(x.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

In [323]:
x_tensor_cnn = x_tensor.unsqueeze(1)

In [324]:
x_train, x_test, y_train, y_test = train_test_split(x_tensor_cnn, y_tensor, test_size=0.3, random_state=42)

In [325]:
tam_batch = 32
dataset_treino = TensorDataset(x_train, y_train)

loader_treino = DataLoader(dataset_treino, batch_size=tam_batch, shuffle=True)

In [338]:
class RedeNeuralConvolucional(nn.Module):
    def __init__(self):
        super(RedeNeuralConvolucional, self).__init__()

        self.bloco_convolucional = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(16),
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.AdaptiveAvgPool1d(1))
        
        self.regressor = nn.Sequential(
            nn.Flatten(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 9))
        
    def forward(self, x):
        x_1 = self.bloco_convolucional(x)
        predict = self.regressor(x_1)
        return predict

In [339]:
rede = RedeNeuralConvolucional()
loss_funct = nn.MSELoss()
optimizer = torch.optim.Adam(rede.parameters(), lr=1e-3)

In [340]:
for epoch in range(50):
    rede.train()
    loss_atual = 0.0
    for batch_x, batch_y in loader_treino:
        outputs = rede(batch_x)
        loss = loss_funct(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_atual += loss.item()

In [343]:
rede.eval()

with torch.no_grad():
    y_pred = rede(x_test).cpu().numpy()
    y_test = y_test.cpu().numpy()

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAPE: {mape}')
print(f'R2: {r2}')

MSE: 0.9200844168663025
RMSE: 0.9592103362083435
MAPE: 64095655034880.0
R2: 0.2638789713382721
