In [32]:
'''
1. Obtener Datos de la BD Fecha | ID | Ingreso  >> renombrar columnas
2. Pivot Table por ID
3. ¿Hay fechas faltantes? Llenar fechas
3.5 Checar valores atípicos
4. Manejo de nulos (son al principio o no hubo ingresos)
5. Agrupar por semana *
6. 
'''
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
import pandas as pd
import numpy as np

In [33]:
# load data
def load_data(path: str = '../model/dataquery.csv') -> pd.DataFrame:
    '''
    Read the raw query export and reshape it to one column per company.

    Args:
        path [str]: CSV file to read. Its first column is consumed as the row
            index, and the remaining columns (exactly three) are renamed, in
            order, to 'date', 'company' and 'data'. Defaults to the location
            that used to be hard-coded, so existing calls keep working.

    Returns:
        pivot_data [pd.DataFrame]: One row per distinct date, a 'date' column
            plus one 'company_<id>' column per company. Date/company pairs that
            are absent from the file come out as NaN.

    Raises:
        ValueError: Raised by pandas when the file does not have exactly three
            data columns, or when a (date, company) pair appears more than once
            (pivot cannot reshape duplicates).
    '''
    data = pd.read_csv(path, index_col=0)
    # Columns are renamed by position, not by header name.
    data.columns = ['date', 'company', 'data']
    data['date'] = pd.to_datetime(data['date'])

    pivot_data = data.pivot(index='date', columns='company', values='data')
    pivot_data.columns = [f'company_{col}' for col in pivot_data.columns]

    # Return 'date' as a regular column rather than as the index.
    return pivot_data.reset_index()

# Build the wide per-company frame; the bare name on the last line makes the
# notebook render it as this cell's output.
pivot_data = load_data()
pivot_data

Unnamed: 0,date,company_1,company_2,company_3,company_4,company_5,company_6
0,2020-11-01,9.002395e+05,,2.740852e+04,,,
1,2020-11-02,7.580464e+05,,1.102843e+05,,,
2,2020-11-03,7.018318e+05,,1.185705e+05,,,
3,2020-11-04,5.149224e+05,,1.210628e+05,,,
4,2020-11-05,7.544703e+05,,1.639952e+05,,,
...,...,...,...,...,...,...,...
1516,2024-12-27,2.200914e+06,2.594027e+06,1.213341e+06,6978.373338,50096.2245,653910.662472
1517,2024-12-28,2.312137e+06,2.603804e+06,1.103839e+06,6675.982785,35172.2700,665208.517454
1518,2024-12-29,2.174856e+06,2.783804e+06,1.272284e+06,6604.603972,32966.6400,587478.563432
1519,2024-12-30,2.713556e+06,2.780263e+06,1.378832e+06,6360.479553,32294.2095,660170.179135


In [34]:
def fill_missing_dates(data: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
    """
    Insert a row of nulls for every calendar day missing between the first and last date.

    The input frame is left untouched; all work happens on a copy, so the
    calling cell can be re-run safely.

    Parameters:
        data (pd.DataFrame): DataFrame containing the date column and a column for each company ID.
        date_column (str): Name of the column containing the dates. Defaults to 'date'.

    Returns:
        pd.DataFrame: New DataFrame with one row per day from the earliest to the
            latest date, `date_column` as a regular column, and NaN in every
            other column for the days that were added. An empty input is
            returned as an empty copy.

    Raises:
        KeyError: If `date_column` is not a column of `data`.
        ValueError: Raised by pandas if the date column contains duplicates
            (a duplicated index cannot be reindexed).
    """
    # Work on a copy: the datetime conversion and the index change below must
    # not leak into the caller's frame.
    indexed = data.copy()
    indexed[date_column] = pd.to_datetime(indexed[date_column])
    indexed = indexed.set_index(date_column)

    # With no rows there is no min/max date to span, and pd.date_range would
    # reject the resulting NaT bounds.
    if indexed.empty:
        return indexed.reset_index()

    # Daily range covering everything between the first and last observed date.
    complete_date_range = pd.date_range(
        start=indexed.index.min(), end=indexed.index.max(), freq='D'
    )

    # Reindexing adds the absent days as all-NaN rows.
    df_reindexed = indexed.reindex(complete_date_range)

    # The new index has no name; restore it so reset_index recreates the column.
    df_reindexed.index.name = date_column

    return df_reindexed.reset_index()

# Hand the helper its own copy so pivot_data keeps its 'date' column no matter
# how the helper treats its argument; this keeps the cell safe to re-run.
complete_dates = fill_missing_dates(pivot_data.copy())
complete_dates

Unnamed: 0,date,company_1,company_2,company_3,company_4,company_5,company_6
0,2020-11-01,9.002395e+05,,2.740852e+04,,,
1,2020-11-02,7.580464e+05,,1.102843e+05,,,
2,2020-11-03,7.018318e+05,,1.185705e+05,,,
3,2020-11-04,5.149224e+05,,1.210628e+05,,,
4,2020-11-05,7.544703e+05,,1.639952e+05,,,
...,...,...,...,...,...,...,...
1517,2024-12-27,2.200914e+06,2.594027e+06,1.213341e+06,6978.373338,50096.2245,653910.662472
1518,2024-12-28,2.312137e+06,2.603804e+06,1.103839e+06,6675.982785,35172.2700,665208.517454
1519,2024-12-29,2.174856e+06,2.783804e+06,1.272284e+06,6604.603972,32966.6400,587478.563432
1520,2024-12-30,2.713556e+06,2.780263e+06,1.378832e+06,6360.479553,32294.2095,660170.179135


In [36]:
def handle_outliers(dataframe: pd.DataFrame, iqr_multiplier: float = 3.0) -> pd.DataFrame:
    '''
    Replaces outliers by NaN so they can be treated like missing values.

    A value is an outlier when it lies outside
    [Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR], with the quartiles
    computed per column over the whole series. Values exactly on a bound are kept.

    Args:
        dataframe [pd.DataFrame]: Columns 'date' and one for each company ID.
        iqr_multiplier [float]: How many interquartile ranges beyond the
            quartiles a value may lie before it is discarded. Defaults to 3.

    Returns:
        [pd.DataFrame]: New frame with the same columns, outliers replaced by
            NaN values. The input frame is not modified.
    '''
    # Copy first, as fill_missing_data does, so the caller's frame is untouched.
    cleaned = dataframe.copy()

    for column in cleaned.columns:
        if column == 'date':
            continue

        series = cleaned[column]

        # Quartiles ignore NaN; an all-NaN column yields NaN bounds and stays all-NaN.
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = iqr_multiplier * (q3 - q1)

        # NOTE(review): the quartiles span the full history, so on a strongly
        # trending series the most recent values can fall outside the bounds
        # and be blanked - confirm this is intended.
        # between() is inclusive on both ends and False for NaN, so where()
        # keeps in-range values and writes NaN everywhere else.
        cleaned[column] = series.where(series.between(q1 - spread, q3 + spread))

    return cleaned

# Run outlier removal on a private copy so complete_dates is left as it was,
# then display the result.
no_outliers = handle_outliers(complete_dates.copy())
no_outliers
    

Unnamed: 0,date,company_1,company_2,company_3,company_4,company_5,company_6
0,2020-11-01,9.002395e+05,,2.740852e+04,,,
1,2020-11-02,7.580464e+05,,1.102843e+05,,,
2,2020-11-03,7.018318e+05,,1.185705e+05,,,
3,2020-11-04,5.149224e+05,,1.210628e+05,,,
4,2020-11-05,7.544703e+05,,1.639952e+05,,,
...,...,...,...,...,...,...,...
1517,2024-12-27,2.200914e+06,2.594027e+06,,6978.373338,50096.2245,653910.662472
1518,2024-12-28,2.312137e+06,2.603804e+06,1.103839e+06,6675.982785,35172.2700,665208.517454
1519,2024-12-29,2.174856e+06,2.783804e+06,,6604.603972,32966.6400,587478.563432
1520,2024-12-30,2.713556e+06,2.780263e+06,,6360.479553,32294.2095,660170.179135


In [37]:
def fill_missing_data(data: pd.DataFrame) -> pd.DataFrame:
    '''
    Fill every NaN in the company columns with that column's own mean.

    Args:
        data [pd.DataFrame]: Column 'date' plus one numeric column per company ID.

    Returns:
        [pd.DataFrame]: Copy of `data` in which each non-'date' column has its
            NaN values replaced by the mean of its non-null values. The input
            frame is not modified.
    '''
    # Operate on a duplicate so the frame handed in by the caller stays intact.
    imputed = data.copy()

    for name in imputed.columns:
        # The date column is not a series to impute.
        if name == 'date':
            continue

        series = imputed[name]
        # mean() skips NaN, so the average comes from observed values only.
        imputed[name] = series.fillna(series.mean())

    return imputed

# Mean-impute a private copy of the outlier-free frame, then display the result.
no_missing_values = fill_missing_data(no_outliers.copy())
no_missing_values

Unnamed: 0,date,company_1,company_2,company_3,company_4,company_5,company_6
0,2020-11-01,9.002395e+05,1.429850e+06,2.740852e+04,8161.320082,12620.00149,265817.198772
1,2020-11-02,7.580464e+05,1.429850e+06,1.102843e+05,8161.320082,12620.00149,265817.198772
2,2020-11-03,7.018318e+05,1.429850e+06,1.185705e+05,8161.320082,12620.00149,265817.198772
3,2020-11-04,5.149224e+05,1.429850e+06,1.210628e+05,8161.320082,12620.00149,265817.198772
4,2020-11-05,7.544703e+05,1.429850e+06,1.639952e+05,8161.320082,12620.00149,265817.198772
...,...,...,...,...,...,...,...
1517,2024-12-27,2.200914e+06,2.594027e+06,2.411223e+05,6978.373338,50096.22450,653910.662472
1518,2024-12-28,2.312137e+06,2.603804e+06,1.103839e+06,6675.982785,35172.27000,665208.517454
1519,2024-12-29,2.174856e+06,2.783804e+06,2.411223e+05,6604.603972,32966.64000,587478.563432
1520,2024-12-30,2.713556e+06,2.780263e+06,2.411223e+05,6360.479553,32294.20950,660170.179135
