In [1]:
from Utils.DataLoader import DataLoader
from Utils.DuckDb import DuckDb
from datetime import datetime
import pandas as pd
from typing import List
import math
import numpy as np

In [2]:
DUCK_DB_UTILS = DuckDb()

In [3]:
MININUM_USERS_TO_CONSIDER = 1_500_000
BATCH_SIZE_TO_GET_DATA_FROM_DATABASE = 50_000
PROCESS_USERS_BATCH_SIZE = 10_000

# MININUM_USERS_TO_CONSIDER = 10_000
# BATCH_SIZE_TO_GET_DATA_FROM_DATABASE = 10_000
# PROCESS_USERS_BATCH_SIZE = 10_000

In [4]:
# Max safra = 201702, so we need to consider three months before
MAX_SAFRA_TO_CONSIDER = 201611

In [5]:
datetime_string_identifier = datetime.now().strftime('%Y_%m_%d_%Hh%Mm')
DATABASE_NEW_TABLE = 'treated_churn_dataset_' + datetime_string_identifier
TEMP_TABLE_NAME = 'temp_' + datetime_string_identifier

In [6]:
DUCK_DB_CONN = DUCK_DB_UTILS.get_connection()

In [7]:
def get_user_list(limit: int = 9999999999, offset: int = 0) -> List[str]:
    conn = DUCK_DB_UTILS.get_connection()
    query = '''
        SELECT DISTINCT(msno)
        FROM main.user_logs ul
        ORDER BY msno
        LIMIT ?
        OFFSET ?
    '''
    query_results = conn.execute(query, [limit, offset]).fetchall()
    result = list(
        map(
            lambda qr: qr[0], query_results
        )
    )
    return result

In [8]:
def get_dataset_by_users(msnos: List[str]) -> pd.DataFrame:
    query = '''
        SELECT
            ----------------------
            -- Calculated fields --
            ----------------------
            50 + (0.0051 * num_unq) + (0.0001 * ul.total_secs) AS cost,
            --	t.actual_amount_paid - cost AS net_profit,
            ----------------------
            -- User Logs fields --
            ----------------------
            ul.msno,
            ul.safra,
            ul.num_25,
            ul.num_50,
            ul.num_75,
            ul.num_985,
            ul.num_100,
            ul.num_unq,
            ul.total_secs,
            ul.total_hours,
            -------------------------
            -- Transactions fields --
            -------------------------
            t.msno,
            t.payment_method_id,
            t.payment_plan_days,
            t.plan_list_price,
            t.actual_amount_paid,
            t.is_auto_renew,
            t.is_cancel,
            t.safra,
            t.transaction_date_year,
            t.transaction_date_month,
            t.transaction_date_day,
            t.transaction_date_day_of_week,
            t.transaction_date_day_of_year,
            t.membership_expire_date_year,
            t.membership_expire_date_month,
            t.membership_expire_date_day,
            t.membership_expire_date_day_of_week,
            t.membership_expire_date_day_of_year,
            t.discount,
            t.price_per_month,
            ---------------------
            -- Members columns --
            ---------------------
            m.msno as members_msno,
            m.safra as members_safra,
            m.city,
            m.registered_via,
            m.is_active,
            m.registration_init_time_year,
            m.registration_init_time_month,
            m.registration_init_time_day,
            m.registration_init_time_day_of_week,
            m.registration_init_time_day_of_year
        FROM
            main.user_logs ul
        INNER JOIN
            main.transactions t ON
            t.msno == ul.msno
            AND t.safra == ul.safra
        LEFT JOIN
            main.members m ON
            m.msno = ul.msno AND m.safra = ul.safra
        WHERE
            ul.msno IN ?
        ORDER BY
            ul.msno,
            ul.safra
    '''

    conn = DUCK_DB_UTILS.get_connection()
    query_results = conn.execute(query, (msnos,)).fetch_df()
    return query_results

In [9]:
def create_database_table(df: pd.DataFrame):
    DUCK_DB_CONN.register('df_view', df)
    DUCK_DB_CONN.execute(
        f"CREATE TABLE IF NOT EXISTS {DATABASE_NEW_TABLE} AS SELECT * FROM df_view WHERE FALSE"
    )

In [10]:
def upload_dataframe_to_duck_db(df: pd.DataFrame):
    print(f'{datetime.now()} Inserindo {len(df)} registros na tabela {DATABASE_NEW_TABLE}')

    DUCK_DB_CONN.register('df', df)
    DUCK_DB_CONN.execute(
        f"INSERT INTO {DATABASE_NEW_TABLE} SELECT * FROM df"
    )

    print(f'{datetime.now()} Inseridos com sucesso')

In [11]:
def truncate_table():
    DUCK_DB_CONN.execute(
        f"TRUNCATE {DATABASE_NEW_TABLE}"
    )

In [12]:
def get_dataset() -> pd.DataFrame:
    all_dfs: List[pd.DataFrame] = []

    count = 0
    while count < MININUM_USERS_TO_CONSIDER:
        print(f'Processando count: {count}')

        users_msno = get_user_list(limit=BATCH_SIZE_TO_GET_DATA_FROM_DATABASE, offset=count)
        count += BATCH_SIZE_TO_GET_DATA_FROM_DATABASE

        all_dfs.append(
            get_dataset_by_users(users_msno)
        )

    print(f'Qtd. de dataframes: {len(all_dfs)}')

    all_dfs = list(
        filter(
            lambda df: df.__len__() > 0, all_dfs
        )
    )

    print(f'Qtd. de dataframes pós remoção dos vazios: {len(all_dfs)}')

    result = pd.concat(all_dfs)
    return result

In [13]:
# full_dataframe = get_dataset()
full_dataframe = pd.read_csv('./full-dataframe-cached-complete.csv')

In [None]:
full_dataframe.to_csv(f'./full-dataframe-cached-{datetime_string_identifier}.csv', index=False)

In [15]:
users_msno = list(full_dataframe['msno'].unique())

In [16]:
def get_next_safras(safra: int, month_qty: int) -> int:
    str_safra = str(safra)
    year, month = int(str_safra[:4]), int(str_safra[4:])

    month -= 1

    month += month_qty

    year += month // 12
    month = month % 12

    month += 1

    formatted_month = f'0{month}' if month < 10 else (month)
    return int(f'{year}{formatted_month}')


In [17]:
def get_previous_months_cols_values(user_df: pd.DataFrame, row: pd.Series, cols: List[str]) -> pd.DataFrame:
    current_safra = row['safra']

    safras_to_consider = [
        # -5, # previous safras
        # -4,
        # -3,
        -2,
        -1,

        # +1, # next safra
    ]

    if not all_safras_exist(user_df, current_safra, safras_to_consider):
        # print(f'Linha sem safras {safras_to_consider}')
        return row
    
    for col in cols:
        for safra_modifier in safras_to_consider:
            safra = get_next_safras(current_safra, safra_modifier)
            safra_row = user_df[user_df['safra'] == safra].reset_index()
            title = f'{col}{safra_modifier if safra_modifier < 0 else f"+{safra_modifier}"}M'
            row[title] = safra_row[col][0]

    # print(f'Linha {row} ajustada')
    return row


def all_safras_exist(user_df: pd.DataFrame, current_safra: int, safras_to_consider: List[int]) -> bool:
    for safra_modifier in safras_to_consider:
        next_safra = get_next_safras(current_safra, safra_modifier)
        filtered = user_df[user_df['safra'] == next_safra]

        if len(filtered) == 0:
            return False
        
    return True


In [18]:
# Key: user msno, value: row to copy if the current values are missing
USER_ROW_TO_COPY_MEMBER_INFO = {}

In [19]:
def calc_churn(df: pd.DataFrame, users_msno: List[str]) -> pd.DataFrame:
    global USER_ROW_TO_COPY_MEMBER_INFO

    df_by_users = {}

    print(f'Separando DataFrames por usuários')
    for index, usr in enumerate(users_msno):
        if index % 1000 == 0:
            print(f'-> {index} / {len(users_msno)}')
            
        df_by_users[usr] = df[df['msno'] == usr]


    def __calc_row_churn(user_df: pd.DataFrame, row: pd.Series):
        months_to_consider_churn = 3

        is_churn = False
        no_churn_information = False
        for m in range(months_to_consider_churn, 0, -1):
            next_safra = get_next_safras(row['safra'], m)

            # print('#' * 100)
            # print(f'User: {row["msno"]}')
            # print(f'Current safra: {row["safra"]}')
            # print(f'Next safra: {next_safra}')

            # Can't obtain info from this safra so on
            if row["safra"] > MAX_SAFRA_TO_CONSIDER:
                print('current safra > MAX_SAFRA_TO_CONSIDER!')
                no_churn_information = True
                break

            # print(f'Recuperando informações da safra {next_safra}')

            # print(user_df['safra'])
            next_safra_row = user_df[user_df['safra'] == next_safra].reset_index()

            # No more payment info, consider churn
            if len(next_safra_row) == 0:
                print(f'Safra {next_safra} não encontrada, pulando')
                is_churn = True
                break

            # Canceled, is churn
            if next_safra_row['is_cancel'][0] == True:
                print(f'Safra {next_safra} encontrada com is_cancel, marcando como churn!')
                is_churn = True
                break

        row['is_churn'] = is_churn
        row['no_churn_information'] = no_churn_information
        return row
    

    def __calc_row_churn_v2(user_df: pd.DataFrame, row: pd.Series):
        '''
        Consider only the third month ahead
        '''
        months_to_consider_churn = 3

        is_churn = False
        no_churn_information = False

        # print('#' * 100)
        # print(f'User: {row["msno"]}')
        # print(f'Current safra: {row["safra"]}')

        next_safra = get_next_safras(row['safra'], months_to_consider_churn)
        # print(f'Next safra: {next_safra}')

        # Can't obtain info from this safra so on
        if row["safra"] > MAX_SAFRA_TO_CONSIDER:
            # print('current safra > MAX_SAFRA_TO_CONSIDER!')
            no_churn_information = True

        else:
            next_safra_row = user_df[user_df['safra'] == next_safra].reset_index()

            # No more payment info, consider churn
            if len(next_safra_row) == 0:
                # print(f'Safra {next_safra} não encontrada, pulando')
                is_churn = True

            # Canceled, is churn
            elif next_safra_row['is_cancel'][0] == True:
                # print(f'Safra {next_safra} encontrada com is_cancel, marcando como churn!')
                is_churn = True

        row['is_churn'] = is_churn
        row['no_churn_information'] = no_churn_information
        return row



    def __fill_out_members_data_if_needed(user_df: pd.DataFrame, row: pd.Series):
        # Sometimes, we don't have the member information from all safras,
        # so we'll copy the values from the closer month

        members_msno_col = 'members_msno'

        # Already filled out
        if not row[members_msno_col] == None:
            row['_filled_out_members_info'] = 'Infos já existentes'
            return row

        # No user info available in any safras
        safras_df = user_df[~user_df[members_msno_col].isna()]
        if len(safras_df) == 0:
            row['_filled_out_members_info'] = 'Sem infos'
            return row

        members_cols_to_consider = [
            'city', 'registered_via', 'is_active', 'registration_init_time_year',
            'registration_init_time_month', 'registration_init_time_day',
            'registration_init_time_day_of_week', 'registration_init_time_day_of_year',
            'members_msno'
        ]

        if row['msno'] not in USER_ROW_TO_COPY_MEMBER_INFO:
            current_safra = row['safra']

            min_safra = safras_df['safra'].min()
            max_safra = safras_df['safra'].max()

            safra_to_consider = min_safra if current_safra < max_safra else max_safra

            row_to_copy = safras_df[safras_df['safra'] == safra_to_consider]
            row_to_copy = row_to_copy.reset_index()
            USER_ROW_TO_COPY_MEMBER_INFO[row['msno']] = row_to_copy
        else:
            row_to_copy = USER_ROW_TO_COPY_MEMBER_INFO[row['msno']]

        row['_filled_out_members_info'] = 'Copiadas'
        for col in members_cols_to_consider:
            row[col] = row_to_copy[col][0]

        return row


    rows = []
    users_qty = len(df_by_users.values())
    count = 0
    for msno, user_df in df_by_users.items():
        # print(f'Processando usuário {msno}')

        count += 1
        if count % 500 == 0:
            print(f'{datetime.now()} Processando usuário {count}/{users_qty} ({msno})')

        for _, user_row in user_df.iterrows():
            user_row = __fill_out_members_data_if_needed(user_df, user_row)
            # user_row = __calc_row_churn(user_df, user_row)
            user_row = __calc_row_churn_v2(user_df, user_row)

            user_row = get_previous_months_cols_values(
                user_df,
                user_row,
                ['num_unq', 'total_secs', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100']
            )

            rows.append(user_row)
            # print(user_row)

        # Resetting this cache
        USER_ROW_TO_COPY_MEMBER_INFO = {}

    result = pd.DataFrame(rows)
    return result

In [20]:
def process_dataframe(full_dataframe: pd.DataFrame, users_msno: List[str]):
    # Process in batches
    total_iter = math.floor(len(full_dataframe) / PROCESS_USERS_BATCH_SIZE)

    print(f'Total de iterações: {total_iter}')
    print(f'Tamanho do Dataframe completo: {len(full_dataframe)}')

    result = None
    for i in range(0, total_iter):
        start = PROCESS_USERS_BATCH_SIZE * i
        end = PROCESS_USERS_BATCH_SIZE * (i + 1)

        print(f'{datetime.now()} Processando usuários ({start} / {end})')

        batch_users = users_msno[start:end]

        print(f'{datetime.now()} Separando um DF apenas com os usuários em questão')
        users_df = full_dataframe[full_dataframe['msno'].isin(batch_users)]

        print(f'Tamanho do Dataframe dos usuários do batch: {len(users_df)}')

        treated_df = calc_churn(
            users_df,
            batch_users
        )

        print(f'Tamanho da Dataframe: {len(treated_df)}')
        if len(treated_df) == 0:
            continue

        print('Removendo linhas sem informações de membros suficientes')
        treated_df = treated_df[~treated_df['members_msno'].isna()]

        # print('Removendo linhas sem informações suficientes para calcular o churn')
        # treated_df = treated_df[treated_df['no_churn_information'] == False]

        print(f'Tamanho da Dataframe pós filtros: {len(treated_df)}')

        create_database_table(treated_df)
        if i == 0:
            print('Truncando tabela')
            truncate_table()

        upload_dataframe_to_duck_db(treated_df)

        print('#' * 50)

        result = treated_df

    # Return the last processed one for debug purposes
    return result

In [21]:
treated_df = process_dataframe(
    full_dataframe,
    users_msno
)

Total de iterações: 450
Tamanho do Dataframe completo: 4505078
2025-03-16 12:05:26.409700 Processando usuários (0 / 10000)
2025-03-16 12:05:26.409771 Separando um DF apenas com os usuários em questão
Tamanho do Dataframe dos usuários do batch: 88711
Separando DataFrames por usuários
-> 0 / 10000
-> 1000 / 10000
-> 2000 / 10000
-> 3000 / 10000
-> 4000 / 10000
-> 5000 / 10000
-> 6000 / 10000
-> 7000 / 10000
-> 8000 / 10000
-> 9000 / 10000
2025-03-16 12:06:23.423551 Processando usuário 500/10000 (+/E2nO8eLJPkPPpGxPku1WC7CzgEMkoe1qaToS/tIkE=)
2025-03-16 12:06:50.430619 Processando usuário 1000/10000 (+0MZZ5TW5El2B1kb+5hyH+OG+Fl9W3Nim/AoXqrSWqI=)
2025-03-16 12:07:16.699206 Processando usuário 1500/10000 (+1VHsZGC0OKuc2X8mUNpUmHgtE1XIxBckQyGiGwyx3s=)
2025-03-16 12:07:43.969137 Processando usuário 2000/10000 (+2Xud/e/yNe16BlzzyWEo0DRVM896O5yX8wVCbs/1SA=)
2025-03-16 12:08:09.538693 Processando usuário 2500/10000 (+3dY1322IjbX1OhRijMHXQ3d5H1sHjs2uTLhM5WK9yg=)
2025-03-16 12:08:34.227061 Processa

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2025-03-16 16:06:53.517959 Inseridos com sucesso
##################################################
2025-03-16 16:06:53.519605 Processando usuários (240000 / 250000)
2025-03-16 16:06:53.519963 Separando um DF apenas com os usuários em questão
Tamanho do Dataframe dos usuários do batch: 88082
Separando DataFrames por usuários
-> 0 / 10000
-> 1000 / 10000
-> 2000 / 10000
-> 3000 / 10000
-> 4000 / 10000
-> 5000 / 10000
-> 6000 / 10000
-> 7000 / 10000
-> 8000 / 10000
-> 9000 / 10000
2025-03-16 16:07:49.061823 Processando usuário 500/10000 (6gY/htVKWRCuA5Ql2FEYw0JKDLSWY+Q44wqC4DxmQO8=)
2025-03-16 16:08:16.954986 Processando usuário 1000/10000 (6hiZW8QiUN82yLOgtACU8pnmRrx/1Aq365lX8SXPcPU=)
2025-03-16 16:08:42.552229 Processando usuário 1500/10000 (6isPUK1PnErQmYth9wy4IDF4cwh2H3nYiHePJO+p9p8=)
2025-03-16 16:10:23.005646 Processando usuário 2000/10000 (6k0/Yqi5l6ldoLVvxKHFQxU7BLjQZQ+NMGgPsPfqPDI=)
2025-03-16 16:09:34.626832 Processando usuário 2500/10000 (6lCBmYKtbMf8Cq8dwSczfMVYP1AS3qggUW1fMS

In [22]:
DUCK_DB_CONN.close()

In [25]:
treated_df[['safra', 'is_churn', 'is_cancel', 'no_churn_information', 'num_unq', 'num_unq-1M', 'num_unq-2M', 'total_secs', 'total_secs-1M', 'total_secs-2M', 'num_25', 'num_25-1M', 'num_25-2M', 'num_50', 'num_50-1M', 'num_50-2M', 'num_75', 'num_75-1M', 'num_75-2M', 'num_985', 'num_985-1M', 'num_985-2M', 'num_100', 'num_100-1M', 'num_100-2M']]

Unnamed: 0,safra,is_churn,is_cancel,no_churn_information,num_unq,num_unq-1M,num_unq-2M,total_secs,total_secs-1M,total_secs-2M,...,num_50-2M,num_75,num_75-1M,num_75-2M,num_985,num_985-1M,num_985-2M,num_100,num_100-1M,num_100-2M
4441153,201612,False,False,True,184,,,31786,,,...,,4,,,23,,,96,,
4441160,201606,True,False,False,586,,,154839,,,...,,48,,,31,,,524,,
4441161,201608,True,False,False,1439,,,246362,,,...,,91,,,107,,,809,,
4441163,201608,False,False,False,203,,,54199,,,...,,11,,,8,,,186,,
4441164,201609,False,False,False,288,,,88995,,,...,,13,,,22,,,306,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4505071,201608,False,False,False,289,82.0,216.0,80181,16079.0,78065.0,...,16.0,31,6.0,13.0,34,3.0,17.0,288,52.0,308.0
4505072,201609,False,False,False,333,289.0,82.0,69545,80181.0,16079.0,...,16.0,42,31.0,6.0,43,34.0,3.0,234,288.0,52.0
4505073,201610,True,False,False,282,333.0,289.0,49442,69545.0,80181.0,...,47.0,12,42.0,31.0,15,43.0,34.0,206,234.0,288.0
4505074,201611,False,False,False,481,282.0,333.0,102839,49442.0,69545.0,...,42.0,55,12.0,42.0,43,15.0,43.0,410,206.0,234.0


In [None]:
treated_df['no_churn_information'].value_counts()

In [None]:
treated_df[treated_df['no_churn_information'] == True]

In [None]:
treated_df[treated_df['msno'] == '+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw='][['safra', 'is_churn', 'is_cancel', 'no_churn_information']]

In [None]:
treated_df.columns

In [None]:
input('TODO: DELETAR LINHAS COM is_churn = NaN e sem informações de membros')

In [None]:
# upload_dataframe_to_duck_db(treated_df)