In [8]:
from google.colab import drive
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
!cp "/content/drive/My Drive/BaseDeDatos/ClimateEngineTmin.csv" "/content/"


In [11]:
Tmin = pd.read_csv("ClimateEngineTmin.csv",sep=';')


In [12]:
Tmin.columns = [None] * len(Tmin.columns)
Tmin.columns = ['Fecha','Temperatura_Minima']


In [13]:
Tmin['Fecha'] = pd.to_datetime(Tmin['Fecha'], format='%Y-%m-%d:%I %p')
Tmin['Fecha'] = pd.to_datetime(Tmin['Fecha'])
Tmin.set_index('Fecha', inplace=True)
Tmin = Tmin.asfreq('D')

In [14]:
Tmin['Temperatura_Minima'] = Tmin['Temperatura_Minima'].str.replace(',', '.')
Tmin['Temperatura_Minima'] = Tmin['Temperatura_Minima'].astype(float)

In [15]:
Tmin.isnull().sum()

Temperatura_Minima    0
dtype: int64

In [16]:
Tmin

Unnamed: 0_level_0,Temperatura_Minima
Fecha,Unnamed: 1_level_1
2000-01-01,6.45
2000-01-02,6.55
2000-01-03,6.65
2000-01-04,7.45
2000-01-05,6.25
...,...
2022-12-27,6.45
2022-12-28,7.55
2022-12-29,4.75
2022-12-30,3.25


In [17]:
completo = Tmin.copy()

In [18]:
completo

Unnamed: 0_level_0,Temperatura_Minima
Fecha,Unnamed: 1_level_1
2000-01-01,6.45
2000-01-02,6.55
2000-01-03,6.65
2000-01-04,7.45
2000-01-05,6.25
...,...
2022-12-27,6.45
2022-12-28,7.55
2022-12-29,4.75
2022-12-30,3.25


In [19]:
fechas_a_eliminar = pd.to_datetime(['2011-08-04','2011-08-05','2011-08-06','2011-08-07','2011-08-08','2011-08-09','2011-08-10',
                                    '2011-09-04','2011-10-04','2011-11-04','2012-11-05','2013-11-06','2014-11-07','2015-11-08',
                                    '2022-06-04','2022-07-04','2022-07-14','2022-07-24','2022-07-25'])

# Utilizar loc para establecer el valor de la columna como NaN en las fechas especificadas
imputar = completo.copy()
imputar.loc[fechas_a_eliminar, 'Temperatura_Minima'] = None


In [20]:
imputar.isnull().sum()

Temperatura_Minima    19
dtype: int64

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
import tensorflow as tf

In [22]:
# Establecer la semilla para TensorFlow
tf.random.set_seed(123)

imputarTmin = imputar.copy()
# Supongamos que df es tu DataFrame, y 'a_rellenar' es la columna que quieres completar
series = imputarTmin['Temperatura_Minima']

#Eliminar los valores de nan
series_droppednan = series.dropna()

# Crear secuencias de los datos
sequence_length = 15
sequences = []
for i in range(len(series_droppednan) - sequence_length):
    sequences.append(series_droppednan[i:i+sequence_length].values)
sequences = np.array(sequences)

# Ejemplos y etiquetas
X = sequences[:, :-1]
Y = sequences[:, -1]

# Redimensionar los ejemplos para el LSTM
X = np.expand_dims(X, -1)

# Crear el modelo
model = Sequential()
model.add(LSTM(200, activation='relu', input_shape=(X.shape[1], 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Entrenar el modelo
model.fit(X, Y, epochs=60, verbose=1)

# Utilizar el modelo para introducir valores faltantes
for i in range(len(series)):
    if pd.isnull(series[i]):
        input = np.expand_dims(np.expand_dims(series[i-sequence_length+1:i].values, -1), 0)
        series[i] = model.predict(input)[0, 0]

# Ahora series debería tener los valores faltantes llenados

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [23]:
# Crea un nuevo DataFrame con la serie
rnn_imputed = pd.DataFrame({'Temperatura_Minima': series.values}, index=series.index)


In [24]:
imputar.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8401 entries, 2000-01-01 to 2022-12-31
Freq: D
Data columns (total 1 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Temperatura_Minima  8382 non-null   float64
dtypes: float64(1)
memory usage: 389.3 KB


In [25]:
linear_imputed = imputar.copy()

# Rellenar valores faltantes utilizando interpolate de pandas
linear_imputed['Temperatura_Minima'] = linear_imputed['Temperatura_Minima'].interpolate(method='linear')


In [26]:
imputar.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8401 entries, 2000-01-01 to 2022-12-31
Freq: D
Data columns (total 1 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Temperatura_Minima  8382 non-null   float64
dtypes: float64(1)
memory usage: 389.3 KB


In [27]:
from sklearn.metrics import mean_squared_error

In [28]:
# Identificar dónde 'imputar' originalmente tenía valores NaN para 'Temperatura_Maxima'
nan_indices = imputar['Temperatura_Minima'].isna()
print(nan_indices)

Fecha
2000-01-01    False
2000-01-02    False
2000-01-03    False
2000-01-04    False
2000-01-05    False
              ...  
2022-12-27    False
2022-12-28    False
2022-12-29    False
2022-12-30    False
2022-12-31    False
Freq: D, Name: Temperatura_Minima, Length: 8401, dtype: bool


In [29]:
# Asegúrate de que los índices de ambos DataFrame alinean
completo_aligned = completo.loc[nan_indices]
rnn_imputed_aligned = rnn_imputed.loc[nan_indices]
linear_imputed_aligned = linear_imputed.loc[nan_indices]

In [30]:
# Calcular RMSE para la comparación sólo en los puntos donde habían valores nulos originalmente
rmse_rnn = np.sqrt(mean_squared_error(completo_aligned['Temperatura_Minima'], rnn_imputed_aligned['Temperatura_Minima']))
rmse_linear = np.sqrt(mean_squared_error(completo_aligned['Temperatura_Minima'], linear_imputed_aligned['Temperatura_Minima']))

In [31]:
print(f'RMSE para la imputación usando RNN: {rmse_rnn}')
print(f'RMSE para la imputación usando regresión lineal: {rmse_linear}')

RMSE para la imputación usando RNN: 1.8213319105573322
RMSE para la imputación usando regresión lineal: 2.3478309013897207
