In [81]:
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('DatosClimaticosCalculations')

def scan_table(table):
    response = table.scan()
    data = response['Items']
    while 'LastEvaluatedKey' in response:
        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
        data.extend(response['Items'])
    return data

items = scan_table(table)
df = pd.DataFrame(items)
df.head(10)

Unnamed: 0,date,heat_index,windSpeed,wind_chill,vapor_pressure,humidity,time,dew_point,temperature
0,2024-10-12,39.1,1.57,21.44,6.0,85.75,00:00,12.65,15.5
1,2024-10-12,39.16,1.25,21.68,6.0,86.07,01:00,12.67,15.46
2,2024-10-12,39.06,0.68,22.24,6.0,85.38,02:00,12.66,15.58
3,2024-10-12,38.89,1.01,22.04,6.0,84.55,03:00,12.6,15.69
4,2024-10-12,38.8,1.07,21.93,5.0,84.52,04:00,12.51,15.61
5,2024-10-12,39.04,1.06,21.74,6.0,86.13,05:00,12.55,15.32
6,2024-10-12,39.22,0.63,21.88,5.0,87.73,06:00,12.52,14.97
7,2024-10-12,39.51,0.21,22.0,5.0,89.75,07:00,12.54,14.59
8,2024-10-12,40.01,0.23,21.68,5.0,92.73,08:00,12.66,14.11
9,2024-10-12,40.38,0.9,20.73,5.0,95.3,09:00,12.69,13.63


In [82]:
X = df[['heat_index', 'windSpeed', 'wind_chill', 'vapor_pressure', 'humidity', 'dew_point']]
y = df['temperature']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [83]:
y = df.iloc[:, 8]
X = df.iloc[:, [1, 2, 3, 4, 5, 7]]

print(X.head())
print(y.head())

  heat_index windSpeed wind_chill vapor_pressure humidity dew_point
0       39.1      1.57      21.44            6.0    85.75     12.65
1      39.16      1.25      21.68            6.0    86.07     12.67
2      39.06      0.68      22.24            6.0    85.38     12.66
3      38.89      1.01      22.04            6.0    84.55      12.6
4       38.8      1.07      21.93            5.0    84.52     12.51
0     15.5
1    15.46
2    15.58
3    15.69
4    15.61
Name: temperature, dtype: object


In [84]:
# Configurar Pandas para mostrar todas las columnas sin truncar y alineadas
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Seleccionar las columnas con .iloc
y = df.iloc[:, 8]
X = df.iloc[:, [1, 2, 3, 4, 5, 7]]

# Combinar X e y en un solo DataFrame para imprimir juntos
combined = pd.concat([X, y], axis=1)

# Imprimir las primeras filas del DataFrame combinado
print(combined.head())

  heat_index windSpeed wind_chill vapor_pressure humidity dew_point temperature
0       39.1      1.57      21.44            6.0    85.75     12.65        15.5
1      39.16      1.25      21.68            6.0    86.07     12.67       15.46
2      39.06      0.68      22.24            6.0    85.38     12.66       15.58
3      38.89      1.01      22.04            6.0    84.55      12.6       15.69
4       38.8      1.07      21.93            5.0    84.52     12.51       15.61


In [85]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)

[21.10204936 20.47267149 15.60753856 17.76869153 17.86493817 16.17316692
 20.43014537 13.360313   15.37327653 13.66584461 20.79170571 15.71066028
 14.01950157 15.86405739 18.3181768  18.76088113 16.20208535 15.50180849
 17.24126708 14.82733036 17.21251435 21.61040647 16.31847037 20.79934397
 16.60161179 15.09880809 19.67678426 20.18022581 16.56311671 19.45005899
 18.0026794  16.5103556  18.08477849 20.79113264 16.90889258 18.09034726
 16.56104227]


In [86]:
print(y_test)

44      21.1
47     20.47
4      15.61
55     17.77
26     17.86
64     16.17
73     20.43
10     13.36
40     15.37
108    13.67
18     20.79
62     15.71
11     14.02
36     15.86
90     18.32
118    18.76
110     16.2
0       15.5
89     17.24
104    14.83
65     17.21
45     21.61
31     16.32
70      20.8
42      16.6
12      15.1
15     19.68
115    20.18
76     16.56
98     19.45
24      18.0
78     16.51
22     18.08
97     20.79
56     16.91
111    18.09
30     16.56
Name: temperature, dtype: object


In [87]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Error Cuadrático Medio (MSE): {mse:.2f}')
print(f'Raíz del Error Cuadrático Medio (RMSE): {rmse:.2f}')
print(f'Coeficiente de Determinación (R²): {r2:.2f}')

Error Cuadrático Medio (MSE): 0.00
Raíz del Error Cuadrático Medio (RMSE): 0.00
Coeficiente de Determinación (R²): 1.00


In [88]:
comparison_df = pd.DataFrame({
    'Valores Reales': y_test.reset_index(drop=True),
    'Valores Predichos': y_pred
})

print(comparison_df.head(10))  # Muestra las primeras 10 filas

  Valores Reales  Valores Predichos
0           21.1          21.102049
1          20.47          20.472671
2          15.61          15.607539
3          17.77          17.768692
4          17.86          17.864938
5          16.17          16.173167
6          20.43          20.430145
7          13.36          13.360313
8          15.37          15.373277
9          13.67          13.665845


In [89]:
import boto3
import pandas as pd
from io import StringIO

# Crear el DataFrame de comparación inicial con las primeras predicciones obtenidas
comparison_df = pd.DataFrame({
    'hora': pd.date_range(start="2024-10-17 00:00", periods=len(y_test), freq='3H'),
    'Valores Reales': y_test.reset_index(drop=True),
    'Valores Predichos': y_pred
})

# Configurar el DataFrame para predicciones futuras con variabilidad
future_predictions = []
ultima_prediccion = y_pred[-1]  # Tomamos la última predicción como punto de partida
patron_ciclo = y_pred[-24:]  # Usar las últimas 24 predicciones para crear un patrón de variabilidad

# Generar predicciones futuras sin repetición estática
fecha_inicio = "2024-10-17 00:00"
fecha_fin = "2025-01-17 21:00"
horas_prediccion = pd.date_range(start=fecha_inicio, end=fecha_fin, freq='3H')[len(y_test):]

# Iterar sobre las horas de predicción
for idx, hora in enumerate(horas_prediccion):
    # Utilizar la predicción en el patrón cíclico para evitar repetición
    prediccion_temperatura = patron_ciclo[idx % len(patron_ciclo)]
    
    # Agregar la predicción al DataFrame
    future_predictions.append({
        'hora': hora,
        'Valores Predichos': round(prediccion_temperatura, 2)
    })
    
    # Actualizar la última predicción
    ultima_prediccion = prediccion_temperatura

# Convertir las predicciones futuras en DataFrame
future_predictions_df = pd.DataFrame(future_predictions)
final_predictions_df['Valores Predichos'] = final_predictions_df['Valores Predichos'].round(2)

# Concatenar las predicciones reales y las futuras
final_predictions_df = pd.concat([comparison_df[['hora', 'Valores Predichos']], future_predictions_df], ignore_index=True)

# Mostrar el DataFrame final con todas las predicciones
print(final_predictions_df.head(10))

# Guardar el DataFrame final en un archivo CSV y subirlo a S3
csv_buffer = StringIO()
final_predictions_df.to_csv(csv_buffer, index=False)

# Especificar el nombre del bucket y el nombre del archivo en S3
bucket_name = 'temp-predicha'
file_name = 'predicciones_temperatura_completas-prueba3meses.csv'

# Conectar con S3 y subir el archivo
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket_name, file_name).put(Body=csv_buffer.getvalue())

print(f'Archivo {file_name} guardado en el bucket {bucket_name}')

                 hora  Valores Predichos
0 2024-10-17 00:00:00          21.102049
1 2024-10-17 03:00:00          20.472671
2 2024-10-17 06:00:00          15.607539
3 2024-10-17 09:00:00          17.768692
4 2024-10-17 12:00:00          17.864938
5 2024-10-17 15:00:00          16.173167
6 2024-10-17 18:00:00          20.430145
7 2024-10-17 21:00:00          13.360313
8 2024-10-18 00:00:00          15.373277
9 2024-10-18 03:00:00          13.665845
Archivo predicciones_temperatura_completas-prueba3meses.csv guardado en el bucket temp-predicha


  'hora': pd.date_range(start="2024-10-17 00:00", periods=len(y_test), freq='3H'),
  horas_prediccion = pd.date_range(start=fecha_inicio, end=fecha_fin, freq='3H')[len(y_test):]
