In [3]:
import pandas as pd
import plotly.express as px

import matplotlib.pyplot as plt

In [2]:
# Read the CSV, parse its "Date" column into datetimes and order the rows by date.
route_data = "./Datos ecopetrol/Datos_analisis_EC_traducidos_totalmente.csv"
data = (
    pd.read_csv(route_data)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .sort_values(by="Date")
)

# Row counts per distinct "Date" value, per weekly bin and per monthly bin.
# Each result is a two-column frame: "Date" and "count".
weekly_bins = pd.Grouper(key="Date", freq="W")
monthly_bins = pd.Grouper(key="Date", freq="M")
news_per_day = data.groupby("Date").size().reset_index(name="count")
news_per_week = data.groupby(weekly_bins).size().reset_index(name="count")
news_per_month = data.groupby(monthly_bins).size().reset_index(name="count")

In [None]:
# Bar chart with one bar per distinct "Date" value in `news_per_day`.
fig = px.bar(
    news_per_day,
    x="Date",
    y="count",
    title="Cantidad de noticias por día",
)
# Both axis titles are set in a single layout update.
fig.update_layout(xaxis_title="Fecha", yaxis_title="Cantidad de noticias")
fig.show()

In [11]:
def line_chart_st(data, variables_list, title, width=600, height=500, y_title=""):
    """Show a dark-themed matplotlib line chart with one line per variable.

    Parameters
    ----------
    data
        Object supporting ``data.index`` and ``data[var]`` (presumably a
        pandas DataFrame -- confirm against callers). The index is used as
        the x values.
    variables_list
        Iterable of keys into ``data``; each one is drawn as a separate line
        and used as its legend label.
    title
        Text of the chart title.
    width, height
        Figure size; divided by 100 to obtain inches (i.e. the values are
        pixels when the figure is rendered at 100 dpi).
    y_title
        Label of the y axis.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # The style sheet has to be active *before* the figure is created:
    # applying it afterwards only affects later figures, so the first chart
    # would be drawn with white text on the default light figure background.
    # Using a context manager also keeps the dark style from leaking into
    # every other matplotlib figure created afterwards.
    with plt.style.context('dark_background'):
        fig, ax = plt.subplots(figsize=(width / 100, height / 100))

        for var in variables_list:
            ax.plot(data.index, data[var], label=var)

        # Title, axis labels and tick labels are all forced to white.
        ax.set_title(title, color='white')
        ax.set_xlabel('Date', color='white')
        ax.set_ylabel(y_title, color='white')
        ax.tick_params(axis='x', colors='white')
        ax.tick_params(axis='y', colors='white')

        # Only build a legend when at least one line was drawn.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()

        ax.set_facecolor('#121212')  # plot-area background colour
        fig.tight_layout()

        plt.show()

In [26]:
# Keep only the monthly bins dated 2012-07-01 or later; the bare last
# expression renders the resulting frame as the cell output.
from_mid_2012 = news_per_month["Date"] >= "2012-07-01"
plot_data = news_per_month.loc[from_mid_2012]
plot_data

Unnamed: 0,Date,count
5,2012-07-31,15
6,2012-08-31,9
7,2012-09-30,12
8,2012-10-31,15
9,2012-11-30,37
...,...,...
141,2023-11-30,77
142,2023-12-31,28
143,2024-01-31,39
144,2024-02-29,32


In [5]:
# Bar chart of `plot_data`, which is built from the monthly aggregation
# (`news_per_month`), so the title and x-axis label say month, not week.
# The title text is defined once so the two places that use it cannot drift.
chart_title = "Cantidad de noticias por mes"

fig = px.bar(plot_data, x='Date', y='count', title=chart_title, text='count')
fig.update_xaxes(title='Mes')
fig.update_yaxes(title='Cantidad de noticias')
# Print each bar's count above it.
# NOTE(review): the d3 format ".2s" keeps two significant digits, so counts of
# 100 or more would be shown rounded (e.g. 123 -> 120) -- confirm this is intended.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(
    # Hide bar labels that would have to be rendered smaller than 8 px.
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    width=800,
    height=500,
    # Centred title anchored near the top of the figure.
    title={
        'text': chart_title,
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 25}
    }
)
fig.show()

In [28]:
# Restrict the rows to 2012-07-01 <= Date < 2023-12-31 before re-aggregating.
in_window = (data["Date"] >= "2012-07-01") & (data["Date"] < "2023-12-31")
data = data.loc[in_window]

# Recompute the per-date, per-week and per-month row counts on the filtered rows.
news_per_day = data.groupby("Date").size().reset_index(name="count")
news_per_week = data.groupby(pd.Grouper(key="Date", freq="W")).size().reset_index(name="count")
news_per_month = data.groupby(pd.Grouper(key="Date", freq="M")).size().reset_index(name="count")

# Split sizes: 70% of the weekly bins for training, 15% for validation and
# whatever remains for testing.
total_rows = len(news_per_week)
train_size = int(total_rows * 0.7)
val_size = int(total_rows * 0.15)
val_end = train_size + val_size

# Positional slices of the weekly counts (train, then validation, then test).
train_weekly = news_per_week.iloc[:train_size]
val_weekly = news_per_week.iloc[train_size:val_end]
test_weekly = news_per_week.iloc[val_end:]

# Latest bin date of each split, formatted as "YYYY-MM-DD" strings.
date1, date2, date3 = (
    split["Date"].max().strftime("%Y-%m-%d")
    for split in (train_weekly, val_weekly, test_weekly)
)
print(f"Train until date for week data: {date1}")
print(date2)
print(date3)

Train until date for week data: 2020-07-12
2022-04-03
2023-12-31


In [None]:
# Dividir los datos en conjuntos de entrenamiento, validación y prueba
train_weekly = data.loc[data['Date'] <= date1]
val_weekly = data.loc[(data['Date'] > date1) & (data['Date'] <= date2)]
test_weekly = data.loc[data["Date"] > date2 ]

train_weekly["Split"] = "Train"
val_weekly["Split"] = "Validation"
test_weekly["Split"] = "Test"

week_data = pd.concat([train_weekly,val_weekly,test_weekly], ignore_index=True)

In [None]:
#week_data.to_csv("./Datos/week_data_split.csv", index=False)

In [29]:
# Restrict the rows to 2012-07-01 <= Date < 2023-12-31 before re-aggregating.
in_window = (data["Date"] >= "2012-07-01") & (data["Date"] < "2023-12-31")
data = data.loc[in_window]

# Recompute the per-date, per-week and per-month row counts on the filtered rows.
news_per_day = data.groupby("Date").size().reset_index(name="count")
news_per_week = data.groupby(pd.Grouper(key="Date", freq="W")).size().reset_index(name="count")
news_per_month = data.groupby(pd.Grouper(key="Date", freq="M")).size().reset_index(name="count")

# Split sizes: 70% of the per-date rows for training, 15% for validation and
# whatever remains for testing.
total_rows = len(news_per_day)
train_size = int(total_rows * 0.7)
val_size = int(total_rows * 0.15)
val_end = train_size + val_size

# Positional slices of the per-date counts (train, then validation, then test).
train_daily = news_per_day.iloc[:train_size]
val_daily = news_per_day.iloc[train_size:val_end]
test_daily = news_per_day.iloc[val_end:]

# Latest date of each split, formatted as "YYYY-MM-DD" strings.
date1, date2, date3 = (
    split["Date"].max().strftime("%Y-%m-%d")
    for split in (train_daily, val_daily, test_daily)
)
print(f"Train until date for daily data: {date1}")
print(date2)
print(date3)

Train until date for daily data: 2020-10-29
2022-06-26
2023-12-26


In [None]:
# Dividir los datos en conjuntos de entrenamiento, validación y prueba
train_daily = data.loc[data['Date'] <= date1]
val_daily = data.loc[(data['Date'] > date1) & (data['Date'] <= date2)]
test_daily = data.loc[data["Date"] > date2 ]

train_daily["Split"] = "Train"
val_daily["Split"] = "Validation"
test_daily["Split"] = "Test"

day_data = pd.concat([train_daily,val_daily,test_daily], ignore_index=True)

In [None]:
#day_data.to_csv("./Datos/day_data_split.csv", index=False)