# Datos

In [34]:
import pandas as pd
import glob
import os

# ==============================
# 1️⃣ Ruta donde están tus CSV
# ==============================
ruta = "./data_entrenamiento/"  # <-- cambia por tu carpeta
archivos = glob.glob(os.path.join(ruta, "*.csv"))

# ==============================
# 2️⃣ Leer todos los CSV y procesar
# ==============================
dfs = []

for archivo in archivos:
    # Obtener ticker desde el nombre del archivo (sin extensión)
    ticker = os.path.splitext(os.path.basename(archivo))[0]
    
    # Leer CSV
    df = pd.read_csv(archivo)
    
    # Detectar columnas que empiecen con "Sector_"
    sector_cols = [c for c in df.columns if c.startswith("Sector_")]
    
    # Detectar el sector cuyo valor es 1
    sector_name = df.loc[0, sector_cols].idxmax().replace("Sector_", "")
    
    # Agregar columnas Ticker y Sector
    df["Ticker"] = ticker
    df["Sector"] = sector_name
    
    dfs.append(df)


In [35]:
# ==============================
# 3️⃣ Combinar todo en un solo DataFrame
# ==============================
final_df = pd.concat(dfs, ignore_index=True)


In [37]:
result = final_df[["Ticker", "Sector"]].copy()

In [39]:
# ==============================
# 2️⃣ Eliminar duplicados por Ticker
# ==============================
result = result.drop_duplicates(subset=["Ticker"])

In [41]:
# ==============================
# 3️⃣ Limpiar nombres de Ticker
# ==============================
result["Ticker"] = result["Ticker"].str.replace("_completo_arreglado", "", regex=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result["Ticker"] = result["Ticker"].str.replace("_completo_arreglado", "", regex=False)


In [42]:
result

Unnamed: 0,Ticker,Sector
0,GOOGL,communication-services
830,SRE,utilities
1660,STT,financial-services
2490,PGR,financial-services
3320,LEN,consumer-cyclical
...,...,...
414477,BIIB,healthcare
415307,FDS,financial-services
416222,ADM,consumer-defensive
417052,KMI,energy


In [44]:
# ==============================
# 4️⃣ Resultado final
# ==============================
print(result.head())

# (Opcional) Guardar a nuevo archivo
result.to_csv("dataset_limpio.csv", index=False)


     Ticker                  Sector
0     GOOGL  communication-services
830     SRE               utilities
1660    STT      financial-services
2490    PGR      financial-services
3320    LEN       consumer-cyclical


# COMPLETO

In [60]:
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

# ==============================
# 1️⃣ Cargar el CSV
# ==============================
df = pd.read_csv("./V. SP500 Completo/resultados_predicciones_futuras_Completo.csv")

# ==============================
# 2️⃣ Calcular el RMSE solo para Pred_Average
# ==============================
# Real = Rentabilidad
rmse = np.sqrt(mean_squared_error(df["Rentabilidad"], df["Pred_GRU"]))

print(rmse)


26.865038448131177


In [61]:
result = df[["Ticker", "Pred_GRU", "Rentabilidad", "Dif_GRU"]].copy()

In [63]:
result["Dif_GRU"] = result["Dif_GRU"].abs()

In [64]:
result

Unnamed: 0,Ticker,Pred_GRU,Rentabilidad,Dif_GRU
0,LULU,36.402821,-10.777444,47.180264
1,KVUE,29.556635,-33.328167,62.884802
2,CAG,29.803476,-8.866028,38.669504
3,LYB,35.983067,-5.266232,41.249298
4,PAYC,23.221273,-19.705873,42.927146
...,...,...,...,...
497,GOOGL,-24.274979,43.212649,67.487627
498,GS,-25.356680,10.261436,35.618116
499,LRCX,-27.823957,59.123488,86.947445
500,TER,-18.959181,70.902560,89.861741


In [66]:
result_sorted = result.sort_values(by="Dif_GRU", ascending=True)

In [40]:
result_sorted

Unnamed: 0,Ticker,Pred_LSTM,Rentabilidad,Dif_LSTM
175,EIX,2.398582,2.347295,0.051287
222,WMB,0.633603,0.815066,0.181462
197,AIG,3.436965,3.686664,0.249699
77,CRM,8.593882,8.296026,0.297855
104,FTNT,14.294834,14.780872,0.486038
...,...,...,...,...
437,WBD,-11.570107,89.291749,100.861855
500,TER,-37.714661,70.902560,108.617221
461,INTC,-12.217705,102.276180,114.493885
483,WDC,-17.074862,102.021578,119.096439


In [67]:
import pandas as pd

df_sector = pd.read_csv("dataset_limpio.csv")        # contiene Ticker, Sector

# ==============================
# 2️⃣ Hacer merge por Ticker
# ==============================
df_final = pd.merge(result_sorted, df_sector, on="Ticker", how="left")

# ==============================
# 3️⃣ Ver resultado
# ==============================
print(df_final)


    Ticker   Pred_GRU  Rentabilidad     Dif_GRU                  Sector
0       PG  -1.376492     -1.347636    0.028856      consumer-defensive
1      LKQ   8.276737      8.454633    0.177895       consumer-cyclical
2      SHW  -0.807946     -1.212518    0.404572         basic-materials
3      AIG   4.173892      3.686664    0.487228      financial-services
4     SMCI  11.913442     11.334907    0.578535              technology
..     ...        ...           ...         ...                     ...
497    TER -18.959181     70.902560   89.861741              technology
498    WBD -12.726489     89.291749  102.018238  communication-services
499   INTC -19.970196    102.276180  122.246376              technology
500    WDC -26.314707    102.021578  128.336285              technology
501     MU -33.036709    100.152083  133.188792              technology

[502 rows x 5 columns]


In [68]:
df_final.to_csv("diferencias_mse_Completo.csv", index=False)

# SIN TECNOLOGIA

In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

# ==============================
# 1️⃣ Cargar el CSV
# ==============================
df = pd.read_csv("./V. Sin Tecnologia/resultados_predicciones_futuras_sin_Tecnologia.csv")

# ==============================
# 2️⃣ Calcular el RMSE solo para Pred_Average
# ==============================
# Real = Rentabilidad
rmse = np.sqrt(mean_squared_error(df["Rentabilidad"], df["Pred_GRU"]))

In [44]:
rmse

26.191405363391883

In [69]:
result = df[["Ticker", "Pred_GRU", "Rentabilidad", "Dif_GRU"]].copy()

In [70]:
result["Dif_GRU"] = result["Dif_GRU"].abs()

In [71]:
result

Unnamed: 0,Ticker,Pred_GRU,Rentabilidad,Dif_GRU
0,LULU,36.402821,-10.777444,47.180264
1,KVUE,29.556635,-33.328167,62.884802
2,CAG,29.803476,-8.866028,38.669504
3,LYB,35.983067,-5.266232,41.249298
4,PAYC,23.221273,-19.705873,42.927146
...,...,...,...,...
497,GOOGL,-24.274979,43.212649,67.487627
498,GS,-25.356680,10.261436,35.618116
499,LRCX,-27.823957,59.123488,86.947445
500,TER,-18.959181,70.902560,89.861741


In [73]:
result_sorted = result.sort_values(by="Dif_GRU", ascending=True)

In [74]:
import pandas as pd

df_sector = pd.read_csv("dataset_limpio.csv")        # contiene Ticker, Sector

# ==============================
# 2️⃣ Hacer merge por Ticker
# ==============================
df_final = pd.merge(result_sorted, df_sector, on="Ticker", how="left")

# ==============================
# 3️⃣ Ver resultado
# ==============================
print(df_final)

    Ticker   Pred_GRU  Rentabilidad     Dif_GRU                  Sector
0       PG  -1.376492     -1.347636    0.028856      consumer-defensive
1      LKQ   8.276737      8.454633    0.177895       consumer-cyclical
2      SHW  -0.807946     -1.212518    0.404572         basic-materials
3      AIG   4.173892      3.686664    0.487228      financial-services
4     SMCI  11.913442     11.334907    0.578535              technology
..     ...        ...           ...         ...                     ...
497    TER -18.959181     70.902560   89.861741              technology
498    WBD -12.726489     89.291749  102.018238  communication-services
499   INTC -19.970196    102.276180  122.246376              technology
500    WDC -26.314707    102.021578  128.336285              technology
501     MU -33.036709    100.152083  133.188792              technology

[502 rows x 5 columns]


In [75]:
df_final.to_csv("diferencias_mse_Sin_Tecnologia.csv", index=False)

# REVISAR RMSE SIN TECNOLOGIA

In [76]:
df_final

Unnamed: 0,Ticker,Pred_GRU,Rentabilidad,Dif_GRU,Sector
0,PG,-1.376492,-1.347636,0.028856,consumer-defensive
1,LKQ,8.276737,8.454633,0.177895,consumer-cyclical
2,SHW,-0.807946,-1.212518,0.404572,basic-materials
3,AIG,4.173892,3.686664,0.487228,financial-services
4,SMCI,11.913442,11.334907,0.578535,technology
...,...,...,...,...,...
497,TER,-18.959181,70.902560,89.861741,technology
498,WBD,-12.726489,89.291749,102.018238,communication-services
499,INTC,-19.970196,102.276180,122.246376,technology
500,WDC,-26.314707,102.021578,128.336285,technology


In [77]:
import pandas as pd

# Supongamos que ya tienes tu DataFrame cargado como df

# ==============================
# 1️⃣ Quitar todos los del sector "technology"
# ==============================
df_data = df_final[df_final["Sector"] != "technology"]
df_data = df_data[df_data["Sector"] != None]

In [78]:
# ==============================
# 2️⃣ Eliminar tickers específicos
# ==============================
tickers_a_eliminar = ["GOOGL", "GOOG", "AMZN", "META", "TSLA"]  # agrega los que quieras
df_data = df_data[~df_data["Ticker"].isin(tickers_a_eliminar)]

# ==============================
# 3️⃣ Mostrar resultado
# ==============================
print(df_data)


    Ticker   Pred_GRU  Rentabilidad     Dif_GRU                  Sector
0       PG  -1.376492     -1.347636    0.028856      consumer-defensive
1      LKQ   8.276737      8.454633    0.177895       consumer-cyclical
2      SHW  -0.807946     -1.212518    0.404572         basic-materials
3      AIG   4.173892      3.686664    0.487228      financial-services
5       NI  -0.582593      0.051309    0.633902               utilities
..     ...        ...           ...         ...                     ...
486   KVUE  29.556635    -33.328167   62.884802      consumer-defensive
488    FDS  34.699360    -30.798026   65.497386      financial-services
491    TTD  26.498880    -43.077100   69.575980  communication-services
493    APP -28.655161     45.728729   74.383890                     NaN
498    WBD -12.726489     89.291749  102.018238  communication-services

[416 rows x 5 columns]


In [80]:
rmse = np.sqrt(mean_squared_error(df_data["Rentabilidad"], df_data["Pred_GRU"]))

In [81]:
rmse

22.65587002174198

# RMSE por SECTORES

RMSE ENTRENAMIENTO COMPLETO

In [57]:
import pandas as pd
import numpy as np

# Cargar CSV
df = pd.read_csv("./V. SP500 Completo/diferencias_mse_Completo.csv")

# Calcular RMSE por sector
rmse_por_sector_completo = (
    df.groupby("Sector")
      .apply(lambda g: np.sqrt(np.mean((g["Pred_Average"] - g["Rentabilidad"])**2)))
      .reset_index(name="RMSE")
)

print(rmse_por_sector_completo)


                    Sector       RMSE
0          basic-materials  24.836167
1   communication-services  37.804231
2        consumer-cyclical  23.234383
3       consumer-defensive  25.783526
4                   energy  20.159555
5       financial-services  18.658365
6               healthcare  23.152368
7              industrials  20.845860
8              real-estate  19.553449
9               technology  40.848383
10               utilities  15.282074


  .apply(lambda g: np.sqrt(np.mean((g["Pred_Average"] - g["Rentabilidad"])**2)))


In [32]:
import pandas as pd
import numpy as np

# Cargar CSV
df = pd.read_csv("./diferencias_mse_Completo.csv")

# Calcular RMSE por sector
rmse_por_sector_completo = (
    df.groupby("Sector")
      .apply(lambda g: np.sqrt(np.mean((g["Pred_TFT"] - g["Rentabilidad"])**2)))
      .reset_index(name="RMSE")
)

print(rmse_por_sector_completo)

                    Sector       RMSE
0          basic-materials  26.219955
1   communication-services  39.291978
2        consumer-cyclical  24.963826
3       consumer-defensive  25.484792
4                   energy  19.789374
5       financial-services  19.477572
6               healthcare  24.097791
7              industrials  21.324299
8              real-estate  19.222447
9               technology  41.961499
10               utilities  17.104689


  .apply(lambda g: np.sqrt(np.mean((g["Pred_TFT"] - g["Rentabilidad"])**2)))


In [58]:
import pandas as pd
import numpy as np

# Cargar CSV
df = pd.read_csv("./diferencias_mse_Completo.csv")

# Calcular RMSE por sector
rmse_por_sector_completo = (
    df.groupby("Sector")
      .apply(lambda g: np.sqrt(np.mean((g["Pred_LSTM"] - g["Rentabilidad"])**2)))
      .reset_index(name="RMSE")
)

print(rmse_por_sector_completo)

                    Sector       RMSE
0          basic-materials  24.866173
1   communication-services  37.848403
2        consumer-cyclical  22.658706
3       consumer-defensive  27.708125
4                   energy  22.221748
5       financial-services  19.036770
6               healthcare  23.155178
7              industrials  21.853480
8              real-estate  20.869536
9               technology  40.571326
10               utilities  16.031628


  .apply(lambda g: np.sqrt(np.mean((g["Pred_LSTM"] - g["Rentabilidad"])**2)))


In [82]:
import pandas as pd
import numpy as np

# Cargar CSV
df = pd.read_csv("./diferencias_mse_Completo.csv")

# Calcular RMSE por sector
rmse_por_sector_completo = (
    df.groupby("Sector")
      .apply(lambda g: np.sqrt(np.mean((g["Pred_GRU"] - g["Rentabilidad"])**2)))
      .reset_index(name="RMSE")
)

print(rmse_por_sector_completo)

                    Sector       RMSE
0          basic-materials  25.571459
1   communication-services  37.026386
2        consumer-cyclical  23.417727
3       consumer-defensive  24.775851
4                   energy  19.117372
5       financial-services  18.870009
6               healthcare  23.998658
7              industrials  21.497625
8              real-estate  19.801377
9               technology  40.996995
10               utilities  13.786849


  .apply(lambda g: np.sqrt(np.mean((g["Pred_GRU"] - g["Rentabilidad"])**2)))


RMSE ENTRENAMIENTO SIN TECNOLOGIA

In [33]:
import pandas as pd
import numpy as np

# Cargar CSV
df = pd.read_csv("./V. Sin Tecnologia/diferencias_mse_Sin_Tecnologia.csv")

# Calcular RMSE por sector
rmse_por_sector_sin_Tech = (
    df.groupby("Sector")
      .apply(lambda g: np.sqrt(np.mean((g["Pred_Average"] - g["Rentabilidad"])**2)))
      .reset_index(name="RMSE")
)

print(rmse_por_sector_sin_Tech)


                    Sector       RMSE
0          basic-materials  23.598339
1   communication-services  36.906222
2        consumer-cyclical  21.428216
3       consumer-defensive  22.622007
4                   energy  17.612718
5       financial-services  18.283008
6               healthcare  21.993873
7              industrials  19.595090
8              real-estate  19.218856
9               technology  38.064950
10               utilities  12.549327


  .apply(lambda g: np.sqrt(np.mean((g["Pred_Average"] - g["Rentabilidad"])**2)))


In [34]:
import pandas as pd
import numpy as np

# Cargar CSV
df = pd.read_csv("./diferencias_mse_Sin_Tecnologia.csv")

# Calcular RMSE por sector
rmse_por_sector_sin_Tech = (
    df.groupby("Sector")
      .apply(lambda g: np.sqrt(np.mean((g["Pred_TFT"] - g["Rentabilidad"])**2)))
      .reset_index(name="RMSE")
)

print(rmse_por_sector_sin_Tech)

                    Sector       RMSE
0          basic-materials  23.190499
1   communication-services  37.215077
2        consumer-cyclical  23.310448
3       consumer-defensive  23.208278
4                   energy  18.750466
5       financial-services  18.831448
6               healthcare  22.585067
7              industrials  19.696092
8              real-estate  19.739563
9               technology  37.219077
10               utilities  13.045717


  .apply(lambda g: np.sqrt(np.mean((g["Pred_TFT"] - g["Rentabilidad"])**2)))


In [59]:
import pandas as pd
import numpy as np

# Cargar CSV
df = pd.read_csv("./diferencias_mse_Sin_Tecnologia.csv")

# Calcular RMSE por sector
rmse_por_sector_sin_Tech = (
    df.groupby("Sector")
      .apply(lambda g: np.sqrt(np.mean((g["Pred_LSTM"] - g["Rentabilidad"])**2)))
      .reset_index(name="RMSE")
)

print(rmse_por_sector_sin_Tech)

                    Sector       RMSE
0          basic-materials  25.862477
1   communication-services  37.938325
2        consumer-cyclical  21.009306
3       consumer-defensive  22.425316
4                   energy  17.830855
5       financial-services  19.883598
6               healthcare  22.889235
7              industrials  21.372686
8              real-estate  19.867149
9               technology  40.240761
10               utilities  13.520057


  .apply(lambda g: np.sqrt(np.mean((g["Pred_LSTM"] - g["Rentabilidad"])**2)))


In [83]:
import pandas as pd
import numpy as np

# Cargar CSV
df = pd.read_csv("./diferencias_mse_Sin_Tecnologia.csv")

# Calcular RMSE por sector
rmse_por_sector_sin_Tech = (
    df.groupby("Sector")
      .apply(lambda g: np.sqrt(np.mean((g["Pred_GRU"] - g["Rentabilidad"])**2)))
      .reset_index(name="RMSE")
)

print(rmse_por_sector_sin_Tech)

                    Sector       RMSE
0          basic-materials  25.571459
1   communication-services  37.026386
2        consumer-cyclical  23.417727
3       consumer-defensive  24.775851
4                   energy  19.117372
5       financial-services  18.870009
6               healthcare  23.998658
7              industrials  21.497625
8              real-estate  19.801377
9               technology  40.996995
10               utilities  13.786849


  .apply(lambda g: np.sqrt(np.mean((g["Pred_GRU"] - g["Rentabilidad"])**2)))
