In [None]:
from datetime import datetime
from typing import List, Sequence

import pandas as pd

from Utils.DataLoader import DataLoader
from Utils.DuckDb import DuckDb

In [None]:
# Shared DuckDb helper instance; the query functions in this notebook obtain
# their database connection from it through get_connection().
DUCK_DB_UTILS = DuckDb()

In [None]:
def get_user_list(limit: int = 9999999999) -> List[str]:
    """Fetch distinct ``msno`` values from ``main.user_logs``.

    Args:
        limit: Value bound to the SQL ``LIMIT`` clause, i.e. the maximum
            number of distinct ids to fetch. The default is a very large
            number that acts as "no practical limit".

    Returns:
        The first column of every fetched row, in the order the database
        returned them (the query has no ``ORDER BY``).
    """
    sql = '''
        SELECT DISTINCT(msno)
        FROM main.user_logs ul
        LIMIT ?
    '''
    connection = DUCK_DB_UTILS.get_connection()
    fetched = connection.execute(sql, [limit]).fetchall()
    # Every fetched row is a one-element sequence; keep only the msno itself.
    return [record[0] for record in fetched]

In [None]:
def get_dataset_by_users(msnos: List[str]) -> pd.DataFrame:
    """Load the joined user-logs / transactions / members rows for some users.

    The query inner-joins ``main.user_logs`` with ``main.transactions`` and
    ``main.members`` on ``msno`` and ``safra``, keeps only the requested users
    and transactions whose ``is_cancel`` is False, and orders the rows by
    ``ul.msno, ul.safra``. A calculated ``cost`` column
    (``50 + 0.0051 * num_unq + 0.0001 * total_secs``) is selected first.

    Args:
        msnos: User ids bound to the ``ul.msno IN ?`` placeholder.

    Returns:
        The query result as a DataFrame (``fetch_df``).

    NOTE(review): ``msno`` and ``safra`` are selected from all three tables,
    so each appears three times in the result. How the repeated names are
    disambiguated in the DataFrame is decided by DuckDB - confirm before
    selecting those columns by name.
    """
    sql = '''
        SELECT
            ----------------------
            -- Calculated fields --
            ----------------------
            50 + (0.0051 * num_unq) + (0.0001 * ul.total_secs) AS cost,
            --	t.actual_amount_paid - cost AS net_profit,
            ----------------------
            -- User Logs fields --
            ----------------------
            ul.msno,
            ul.safra,
            ul.num_25,
            ul.num_50,
            ul.num_75,
            ul.num_985,
            ul.num_100,
            ul.num_unq,
            ul.total_secs,
            ul.total_hours,
            -------------------------
            -- Transactions fields --
            -------------------------
            t.msno,
            t.payment_method_id,
            t.payment_plan_days,
            t.plan_list_price,
            t.actual_amount_paid,
            t.is_auto_renew,
            t.is_cancel,
            t.safra,
            t.transaction_date_year,
            t.transaction_date_month,
            t.transaction_date_day,
            t.transaction_date_day_of_week,
            t.transaction_date_day_of_year,
            t.membership_expire_date_year,
            t.membership_expire_date_month,
            t.membership_expire_date_day,
            t.membership_expire_date_day_of_week,
            t.membership_expire_date_day_of_year,
            t.discount,
            t.price_per_month,
            ---------------------
            -- Members columns --
            ---------------------
            m.msno,
            m.safra,
            m.city,
            m.registered_via,
            m.is_active,
            m.registration_init_time_year,
            m.registration_init_time_month,
            m.registration_init_time_day,
            m.registration_init_time_day_of_week,
            m.registration_init_time_day_of_year
        FROM
            main.user_logs ul
        INNER JOIN
            main.transactions t ON
            t.msno == ul.msno
            AND t.safra == ul.safra
        INNER JOIN
            main.members m ON
            m.msno = ul.msno AND m.safra = ul.safra
        WHERE
            ul.msno IN ?
            AND
            t.is_cancel = False
        ORDER BY
            ul.msno, ul.safra
    '''

    connection = DUCK_DB_UTILS.get_connection()
    # The whole id list is bound as ONE parameter, hence the 1-tuple.
    return connection.execute(sql, (msnos,)).fetch_df()

In [None]:
# Number of distinct users pulled for this analysis; raise it (or call
# get_user_list() without arguments) for a fuller run.
USER_SAMPLE_SIZE = 10_000

users_msno = get_user_list(USER_SAMPLE_SIZE)
# Show only the count and a short preview: printing every id would flood the
# cell output (and the saved notebook) with thousands of user identifiers.
print(f'{len(users_msno)} users loaded; first 5: {users_msno[:5]}')

df = get_dataset_by_users(users_msno)

In [None]:
def calc_past_months(
    df: pd.DataFrame,
    users_msno: List[str],
    safra_offsets: Sequence[int] = (-1, 1),
) -> pd.DataFrame:
    """Attach the cost of neighbouring safras to the rows of each user.

    For every row, the ``cost`` of the same user's row at ``safra + offset``
    is copied into a column named after the offset (``-1`` -> ``cost-1M``,
    ``1`` -> ``cost+1M``). A row is enriched only when *all* requested
    neighbouring safras exist for that user; otherwise it is kept unchanged,
    so the extra columns are NaN for it in the returned frame. When a user
    has several rows for the same safra, the first one supplies the cost.

    Args:
        df: Frame holding at least the columns ``msno``, ``safra`` and
            ``cost``.
        users_msno: Users to process. Output rows follow the order of this
            list (repeated ids are processed once); ids that do not occur in
            ``df`` contribute no rows.
        safra_offsets: Integer offsets added to a row's ``safra`` to locate
            the neighbouring rows. Defaults to the previous and next safra.

    Returns:
        A new DataFrame built from the processed rows, keeping the original
        row labels as index. If no row could be enriched, the extra cost
        columns are absent; if there are no rows at all, the frame is empty.

    NOTE(review): neighbours are found with plain integer arithmetic on
    ``safra``. If ``safra`` is encoded as YYYYMM, ``+1`` / ``-1`` will not
    cross a year boundary (e.g. 201612 -> 201613) - confirm the encoding.
    """
    offsets = list(safra_offsets)
    # Column title per offset, e.g. -1 -> 'cost-1M' and 1 -> 'cost+1M'.
    titles = {offset: f'cost{offset:+d}M' for offset in offsets}

    # De-duplicate the requested users while keeping their order.
    wanted = dict.fromkeys(users_msno)

    # Partition the frame in a single pass instead of re-scanning the whole
    # frame once per user.
    frames_by_user = {
        msno: user_df
        for msno, user_df in df.groupby('msno', sort=False)
        if msno in wanted
    }

    rows = []
    for msno in wanted:
        user_df = frames_by_user.get(msno)
        if user_df is None:
            continue

        # safra -> cost of the FIRST row with that safra. Null safras are
        # dropped so they can never be matched as a neighbour.
        first_per_safra = user_df.dropna(subset=['safra']).drop_duplicates('safra')
        cost_by_safra = dict(zip(first_per_safra['safra'], first_per_safra['cost']))

        for _, row in user_df.iterrows():
            current_safra = row['safra']
            neighbours = [current_safra + offset for offset in offsets]

            # Enrich only when every requested neighbour exists for the user.
            if all(safra in cost_by_safra for safra in neighbours):
                for offset, safra in zip(offsets, neighbours):
                    row[titles[offset]] = cost_by_safra[safra]

            rows.append(row)

    return pd.DataFrame(rows)

In [None]:
# Enrich the joined dataset with each user's neighbouring-safra costs
# ('cost-1M' / 'cost+1M', filled only where both adjacent safras exist).
treated_df = calc_past_months(df, users_msno)


In [None]:
# Preview the first rows of the enriched dataset.
treated_df.head()