In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory (relative to the notebook) that holds the primary CSV extracts.
path_to_files = "../berkabank/primary/"

# Tables to load; each entry is read from "<path_to_files><name>.csv".
files = ["accounts", "transactions"]

# Read every table and, in the same pass, parse each column whose name
# contains "date" into datetimes. All other columns are kept untouched.
data = {
    name: pd.read_csv(f"{path_to_files}{name}.csv").apply(
        lambda series: pd.to_datetime(series) if "date" in series.name else series
    )
    for name in files
}

In [3]:
# Display the column index of the loaded transactions table.
data['transactions'].columns

Index(['trans_id', 'account_id', 'transaction_date', 'transaction_type',
       'transaction_operation', 'transaction_amount',
       'account_balance_after_transaction', 'k_symbol'],
      dtype='object')

In [47]:
from dataclasses import dataclass
import pandas as pd
import numpy as np
from typing import List, Optional, Dict
from pydantic import BaseModel


@dataclass
class EodBalanceBuilder:
    """
    Build a daily end-of-day balance table from transactions and accounts.

    The result has exactly one row per (account, calendar day) between the
    first and last transaction date found in ``transactions``, restricted to
    days on or after the account's creation date.

    Steps performed by ``run``:
        1. Copy the needed columns and parse the date columns (inputs are
           never modified).
        2. Sign each transaction amount (outflows negative) and sum the
           signed amounts per account and day.
        3. Build the full account x day grid.
        4. Left join the daily flows onto the grid; days without
           transactions get a flow of 0.
        5. Attach the account creation date and drop days before it.
        6. Cumulatively sum the daily flow per account in date order.
        7. Rename the date column, sort by balance date and account id,
           and keep only the output columns.

    Attributes:
        transactions: DataFrame of transactions. Must contain the columns
            named by ``config["account_id"]``, ``config["transaction_date"]``,
            ``config["transaction_amount"]`` and ``config["transaction_type"]``.
        accounts: DataFrame of accounts. Must contain the columns named by
            ``config["account_id"]`` and ``config["account_creation_date"]``.
        config: Mapping of logical names to column names / values. Keys used:
            ``account_id``, ``transaction_date``, ``transaction_amount``,
            ``transaction_type``, ``account_creation_date`` (input columns);
            ``daily_amount_flow``, ``end_of_day_balance``, ``balance_date``
            (output columns); ``outflow`` (the value of the transaction type
            column that marks an amount as negative).

    Methods:
        run: Creates the end-of-day balance DataFrame.
    """

    transactions: pd.DataFrame
    accounts: pd.DataFrame
    config: Dict[str, Optional[str]]

    def run(self) -> pd.DataFrame:
        """
        Compute the end-of-day balance table.

        Returns:
            DataFrame with the columns (in this order) named by the config
            keys ``account_id``, ``balance_date``, ``end_of_day_balance``,
            ``daily_amount_flow`` and ``account_creation_date``, sorted by
            balance date and then account id. An empty DataFrame with those
            columns is returned when there are no transactions.

        Raises:
            KeyError: If a required config key or DataFrame column is missing.
        """
        cfg = self.config
        account_col = cfg["account_id"]
        date_col = cfg["transaction_date"]
        amount_col = cfg["transaction_amount"]
        type_col = cfg["transaction_type"]
        creation_col = cfg["account_creation_date"]
        flow_col = cfg["daily_amount_flow"]
        balance_col = cfg["end_of_day_balance"]
        balance_date_col = cfg["balance_date"]

        output_columns = [
            account_col,
            balance_date_col,
            balance_col,
            flow_col,
            creation_col,
        ]

        # Work on copies so the caller's DataFrames are left untouched, and
        # resolve every column through the config instead of hard-coded names.
        transactions = self.transactions[
            [account_col, date_col, amount_col, type_col]
        ].copy()
        transactions[date_col] = pd.to_datetime(transactions[date_col])

        accounts = self.accounts[[account_col, creation_col]].copy()
        accounts[creation_col] = pd.to_datetime(accounts[creation_col])

        # Without transactions there is no date range to build a grid from.
        if transactions.empty:
            return pd.DataFrame(columns=output_columns)

        # Give sign to the transaction amount based on the transaction type;
        # missing amounts count as 0.
        amount = transactions[amount_col].fillna(0)
        transactions[flow_col] = np.where(
            transactions[type_col] == cfg["outflow"], -amount, amount
        )

        # Collapse to one net flow per account and day. Without this step a
        # day with several transactions would yield several rows, each with
        # a partial (not end-of-day) running balance.
        daily_flow = transactions.groupby(
            [account_col, date_col], as_index=False, sort=False
        )[flow_col].sum()

        # Full account x day grid, built vectorised rather than with a
        # Python-level nested loop.
        calendar = pd.date_range(
            start=transactions[date_col].min(),
            end=transactions[date_col].max(),
            freq="D",
        )
        grid = pd.MultiIndex.from_product(
            [transactions[account_col].unique(), calendar],
            names=[account_col, date_col],
        ).to_frame(index=False)

        # Left join the daily flows; days without transactions have flow 0.
        eod_balance = grid.merge(
            daily_flow, how="left", on=[account_col, date_col]
        )
        eod_balance[flow_col] = eod_balance[flow_col].fillna(0)

        # Attach the creation date and drop days before the account existed.
        # Accounts missing from ``accounts`` get NaT and are dropped as well.
        eod_balance = eod_balance.merge(accounts, on=account_col, how="left")
        on_or_after_creation = eod_balance[date_col] >= eod_balance[creation_col]
        # .copy() so the assignments below act on an independent frame rather
        # than on a slice (avoids SettingWithCopyWarning).
        eod_balance = eod_balance.loc[on_or_after_creation].copy()

        # Running balance per account, accumulated in chronological order.
        eod_balance = eod_balance.sort_values([account_col, date_col])
        eod_balance[balance_col] = eod_balance.groupby(account_col)[
            flow_col
        ].cumsum()

        # Final shape: rename the date column, order rows, select columns.
        eod_balance = eod_balance.rename(columns={date_col: balance_date_col})
        eod_balance = eod_balance.sort_values([balance_date_col, account_col])
        return eod_balance[output_columns]


In [48]:
# Raw production inputs, read directly from the GCS bucket
# (transactions first, then accounts).
transactions, accounts = (
    pd.read_csv(uri)
    for uri in (
        'gs://berkabank/production/data/01_raw/transactions.csv',
        'gs://berkabank/production/data/01_raw/accounts.csv',
    )
)

In [49]:
# Column mapping consumed by EodBalanceBuilder: logical name -> column name.
config = dict(
    # Input columns read from the transactions / accounts tables.
    transaction_date="transaction_date",
    account_id="account_id",
    transaction_amount="transaction_amount",
    transaction_type="transaction_type",
    # Output columns written to the end-of-day balance table.
    daily_amount_flow="daily_amount_flow",
    end_of_day_balance="end_of_day_balance",
    account_creation_date="account_creation_date",
    balance_date="balance_date",
    # Value of the transaction type column whose amounts are subtracted.
    outflow="outflow",
)
        

In [50]:
PIPELINE_NAME = 'production'

# Reuse the `transactions`, `accounts` and `config` objects created in the
# preceding cells instead of downloading both CSVs from GCS a second time and
# repeating the column mapping literal (the two copies could silently drift).
eod_balance_builder = EodBalanceBuilder(
    transactions=transactions,
    accounts=accounts,
    config=config,
)

# One row per account and day with the running end-of-day balance.
eod_balance = eod_balance_builder.run()


In [51]:
# Persist the end-of-day balance table to the elaboration layer on GCS,
# without writing the DataFrame index as a column.
eod_balance.to_csv(
    path_or_buf='gs://berkabank/production/data/02_elaboration/eod_balance.csv',
    index=False,
)