In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from collections import defaultdict

In [2]:
# Fetch data from Cloud Storage
from google.cloud import storage

# Cloud Storage client handle; it is not passed to the pandas reads below.
client = storage.Client()

# Preprocessed end-of-day balances, with the balance date parsed to datetimes.
eod_balance = pd.read_csv(
    "gs://berkabank/production/data/03_primary/eod_balance_preprocessed.csv"
).assign(balance_date=lambda frame: pd.to_datetime(frame["balance_date"]))

# Raw account records.
accounts = pd.read_csv("gs://berkabank/production/data/01_raw/accounts.csv")

# Independent copies of both frames so later cells can reset their working
# frames without reading from the bucket again.
backup = {
    label: frame.copy()
    for label, frame in (("eod_balance", eod_balance), ("accounts", accounts))
}

In [15]:
# Reset both working frames from the copies stored in `backup`.
eod_balance, accounts = (
    backup[label].copy() for label in ("eod_balance", "accounts")
)

In [9]:
from dataclasses import dataclass


@dataclass
class EodBalanceAggregation:
    """Collect the end-of-day balance rows that precede each incident.

    An incident is the first row (in frame order) of an account whose
    ``target`` column equals True.  For every incident a window is derived:

    * ``t1 = incident_date - incident_duration_days``
    * ``t0 = incident_date - (incident_duration_days + off_set_period_days)``

    ``run()`` returns the balance rows with ``t0 <= balance_date <= t1`` for the
    incident accounts, together with the rows of non-incident accounts that
    were linked to an incident window (see ``fair_no_incidents_accounts``).

    Attributes:
        eod_balance: Balance rows; after construction this holds a merged,
            date-sorted frame that also carries the district column.  The
            frame passed in by the caller is not modified.
        accounts: Account rows; must contain the account id, district id and
            account creation date columns.
        incident_duration_days: Days between ``t1`` and the incident date.
        off_set_period_days: Length in days of the ``[t0, t1]`` window.
        column_mapping: Maps logical column names (``"account_id"``,
            ``"balance_date"``, ``"target"``, ``"t0"``, ...) to the actual
            column names used in the frames.
        incidents: Set in ``__post_init__``; one row per incident account with
            the incident date, ``t0`` and ``t1``.
    """

    eod_balance: pd.DataFrame
    accounts: pd.DataFrame
    incident_duration_days: int
    off_set_period_days: int
    column_mapping: dict

    def __post_init__(self):
        """Attach district information, parse dates and derive the incidents."""
        cols = self.column_mapping
        # The merge returns a new frame, so parsing the date on the merged
        # result leaves the DataFrame passed in by the caller untouched.
        merged = self.eod_balance.merge(
            self.accounts.loc[:, [cols["account_id"], cols["district_id"]]],
            on=cols["account_id"],
        )
        merged[cols["balance_date"]] = pd.to_datetime(merged[cols["balance_date"]])
        self.eod_balance = merged
        # Create incidents
        self.incidents = self.create_incidents()

    def create_incidents(self):
        """Return one row per incident account with its date and [t0, t1] window.

        Side effect: ``self.eod_balance`` is replaced by a copy sorted by
        balance date, which the later collection steps operate on.

        Returns:
            DataFrame with the account id, district id, incident date, ``t0``
            and ``t1`` columns, restricted to incidents whose ``t0`` lies
            strictly after the earliest balance date in the data.
        """
        cols = self.column_mapping
        account_col = cols["account_id"]
        date_col = cols["balance_date"]
        incident_col = cols["incident_date"]

        # NOTE(review): keep="first" keeps the first flagged row in frame
        # order; that is the earliest incident only if the input rows are
        # chronological per account -- confirm against the preprocessing step.
        flagged = self.eod_balance.loc[self.eod_balance[cols["target"]] == True]
        incidents = (
            flagged.drop_duplicates(subset=account_col, keep="first")
            .loc[:, [account_col, date_col, cols["district_id"]]]
            .sort_values(date_col)
            # From here on the balance date of the flagged row is the incident date.
            .rename(columns={date_col: incident_col})
        )
        self.eod_balance = self.eod_balance.sort_values(date_col)

        # Window bounds relative to the incident date.
        incidents[cols["t0"]] = incidents[incident_col] - pd.DateOffset(
            days=self.incident_duration_days + self.off_set_period_days
        )
        incidents[cols["t1"]] = incidents[incident_col] - pd.DateOffset(
            days=self.incident_duration_days
        )

        # Drop incidents whose window would start at or before the first
        # balance date in the data, since the window would be incomplete.
        first_balance_date = self.eod_balance[date_col].min()
        return incidents.loc[incidents[cols["t0"]] > first_balance_date, :]

    def collect_eod_balance_incidents_accounts(self):
        """Return the balance rows of incident accounts inside their [t0, t1] window.

        The result is sorted by balance date, then account id.
        """
        cols = self.column_mapping
        # self.eod_balance already carries the district column (added in
        # __post_init__), so it can be joined to the incidents directly.
        eod_balance_incidents = pd.merge(
            self.eod_balance,
            self.incidents,
            on=[cols["account_id"], cols["district_id"]],
        )
        # between() is inclusive on both ends.
        in_window = eod_balance_incidents[cols["balance_date"]].between(
            eod_balance_incidents[cols["t0"]],
            eod_balance_incidents[cols["t1"]],
        )
        return eod_balance_incidents.loc[in_window].sort_values(
            [cols["balance_date"], cols["account_id"]]
        )

    def fair_no_incidents_accounts(self):
        """Return the non-incident accounts, each linked to an incident window.

        Accounts are left-joined to the incidents, sorted by district, and the
        missing incident columns are forward-filled from the preceding row, so
        a non-incident account takes the window of the closest incident row
        above it.  Rows that still contain a missing value in any column are
        dropped.  The incident date column is returned under the name
        ``linked_incident_date``.
        """
        cols = self.column_mapping
        account_col = cols["account_id"]
        district_col = cols["district_id"]
        incident_account_ids = self.incidents[account_col]

        accounts_incident_flag = self.accounts.assign(
            incident_flag=self.accounts[account_col].isin(incident_account_ids)
        )
        # NOTE(review): the forward fill runs over the whole district-sorted
        # frame, so the first rows of a district can inherit a window from the
        # previous district, and the default sort does not guarantee the order
        # of rows within a district -- confirm this linking is intended.
        account_extended_info_incident = (
            accounts_incident_flag.merge(
                self.incidents,
                on=[account_col, district_col],
                how="left",
            )
            .sort_values(district_col)
            .ffill()
            .dropna()
        )
        # Build the mask from the frame being filtered; a mask taken from
        # another frame only lines up when both share the same index labels.
        has_no_incident = ~account_extended_info_incident[account_col].isin(
            incident_account_ids
        )
        no_incident_accounts = account_extended_info_incident.loc[has_no_incident, :]
        # Declare linked incident date
        return no_incident_accounts.rename(
            columns={cols["incident_date"]: "linked_incident_date"}
        )

    def collect_eof_balance_no_incidents_accounts(self):
        """Return the balance rows of linked non-incident accounts inside [t0, t1]."""
        cols = self.column_mapping
        no_incident_accounts = self.fair_no_incidents_accounts()
        eod_balance_linked = self.eod_balance.merge(
            no_incident_accounts,
            on=[
                cols["account_id"],
                cols["district_id"],
                cols["account_creation_date"],
            ],
        )
        # between() is inclusive on both ends.
        in_window = eod_balance_linked[cols["balance_date"]].between(
            eod_balance_linked[cols["t0"]],
            eod_balance_linked[cols["t1"]],
        )
        return eod_balance_linked.loc[in_window]

    def collect_fair_eod_balance(self):
        """Combine incident and linked non-incident rows into one frame.

        Returns:
            The concatenated rows (linked accounts first, then incident
            accounts), sorted by balance date and reduced to the balance
            columns plus the district id.
        """
        cols = self.column_mapping
        eod_balance_incidents = self.collect_eod_balance_incidents_accounts()
        eod_balance_period_linked = self.collect_eof_balance_no_incidents_accounts()

        output_fields = (
            "account_id",
            "balance_date",
            "end_of_day_balance",
            "daily_amount_flow",
            "account_creation_date",
            "n_transactions",
            "days_since_account_creation",
            "is_primary",
            "low_balance_flag",
            "streak_id",
            "low_balance_streak",
            "target",
            "district_id",
        )
        return (
            pd.concat([eod_balance_period_linked, eod_balance_incidents], axis=0)
            .sort_values(cols["balance_date"])
            .loc[:, [cols[field] for field in output_fields]]
        )

    def run(self):
        """Run the full aggregation and return the combined frame."""
        return self.collect_fair_eod_balance()


# Configure the aggregation: incident_duration_days=20, off_set_period_days=365.
# Every logical column name maps to a frame column of the same name.
aggregation = EodBalanceAggregation(
    eod_balance=eod_balance,
    accounts=accounts,
    incident_duration_days=20,
    off_set_period_days=365,
    column_mapping={
        name: name
        for name in (
            "account_id",
            "balance_date",
            "end_of_day_balance",
            "daily_amount_flow",
            "account_creation_date",
            "n_transactions",
            "days_since_account_creation",
            "is_primary",
            "low_balance_flag",
            "streak_id",
            "low_balance_streak",
            "target",
            "district_id",
            "incident_date",
            "t0",
            "t1",
        )
    },
)

fair_eod_balance_period = aggregation.run()

999


In [10]:
# Show the aggregated frame as the cell's output (rendered as a truncated preview).
fair_eod_balance_period

Unnamed: 0,account_id,balance_date,end_of_day_balance,daily_amount_flow,account_creation_date,n_transactions,days_since_account_creation,is_primary,low_balance_flag,streak_id,low_balance_streak,target,district_id
28,1926,1993-01-06,700.0,700.0,1993-01-06,0.0,0,False,True,11946,1,False,37
38,1926,1993-01-07,700.0,0.0,1993-01-06,0.0,1,False,True,11946,2,False,37
53,1926,1993-01-08,700.0,0.0,1993-01-06,0.0,2,False,True,11946,3,False,37
61,1926,1993-01-09,700.0,0.0,1993-01-06,0.0,3,False,True,11946,4,False,37
85,1628,1993-01-10,700.0,700.0,1993-01-10,0.0,0,False,True,9926,1,False,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620227,620,1996-12-11,20000.0,0.0,1996-02-18,9.0,297,False,False,3846,0,False,26
1620162,880,1996-12-11,700.0,0.0,1996-12-08,0.0,3,False,True,5265,4,False,26
1620046,822,1996-12-11,21285.8,0.0,1996-10-04,5.0,68,False,False,4977,0,False,40
865394,2223,1996-12-11,17375.1,0.0,1996-01-05,46.0,341,False,True,13668,64,False,44


In [16]:
# Step-by-step version of the aggregation with literal column names. Its result,
# `exp_fair_eod_balance_period`, is the expected value that the class-based
# output is compared against in the next cell.
# NOTE(review): this cell reassigns `eod_balance` (district_id is merged in), so
# running it twice would merge district_id a second time -- presumably it must be
# preceded by the cell that restores the frames from `backup`; confirm.
incident_duration_days = 20
# Convert to datetime if not already
eod_balance["balance_date"] = pd.to_datetime(eod_balance["balance_date"])
# Collect district_id information for each account id
eod_balance = eod_balance.merge(
    accounts.loc[:, ["account_id", "district_id"]], on="account_id"
)
# Collect incidents: rows flagged by `target` (primary account that stays for 20
# consecutive days with eod_balance under amount).
incidents = (
    eod_balance.loc[eod_balance["target"] == True]
    # Keep only the first flagged row of each account (first in frame order)
    .drop_duplicates(subset="account_id", keep="first")
    .loc[:, ["account_id", "balance_date", "district_id"]]
    .sort_values("balance_date")
)
# Rename the balance date as incident date
incidents = incidents.rename({"balance_date": "incident_date"}, axis=1)
# Sort dataframes by date
incidents.sort_values("incident_date", inplace=True)
eod_balance.sort_values("balance_date", inplace=True)

# Calculate, for each incident date, t0 (365 + 20 days before the incident) and
# t1 (20 days before the incident)
incidents["t0"] = incidents["incident_date"] - pd.DateOffset(
    days=incident_duration_days + 365
)
incidents["t1"] = incidents["incident_date"] - pd.DateOffset(
    days=incident_duration_days
)


# Filter out incidents whose t0 is not after the first balance date of the
# dataset, and which are therefore not usable.
incidents = incidents.loc[incidents["t0"] > eod_balance["balance_date"].min(), :]
# Extend the balance rows with the incident information (t0 and t1)
eod_balance_incidents = pd.merge(
    eod_balance, incidents, on=["account_id", "district_id"]
)
# Keep the rows between t0 and t1 (inclusive) for each account_id in incidents
eod_balance_period = eod_balance_incidents.loc[
    eod_balance_incidents["balance_date"].between(
        eod_balance_incidents["t0"], eod_balance_incidents["t1"]
    )
]
eod_balance_period = eod_balance_period.sort_values(["balance_date", "account_id"])
# Flag every account that has an incident
accounts_incident_flag = accounts.assign(
    incident_flag=accounts.account_id.isin(incidents.account_id)
)
# Extend t0 and t1 on accounts_incident_flag to connect accounts of the same
# district with a different outcome: after sorting by district, missing incident
# columns are forward-filled from the preceding row.
# NOTE(review): the forward fill is not limited to one district, so the first
# rows of a district can take values from the previous district -- confirm.
account_extended_info_incident = (
    accounts_incident_flag.merge(
        incidents, on=["account_id", "district_id"], how="left"
    )
    .sort_values("district_id")
    .ffill()
    .dropna()
)
# Keep the accounts without an incident.
# NOTE(review): the mask comes from accounts_incident_flag, not from the frame
# being filtered; it is aligned on index labels, which assumes both frames share
# them -- confirm.
no_incident_accounts = account_extended_info_incident.loc[
    ~accounts_incident_flag.account_id.isin(incidents.account_id), :
]

# Declare the filled-in incident date as the linked incident date
no_incident_accounts = no_incident_accounts.rename(
    {"incident_date": "linked_incident_date"}, axis=1
)
# Attach the linked window to the balance rows of the non-incident accounts
eod_balance_linked = eod_balance.merge(
    no_incident_accounts, on=["account_id", "district_id", "account_creation_date"]
)
# Keep the rows whose balance date is between t0 and t1 (inclusive)
eod_balance_period_alias = eod_balance_linked.loc[
    eod_balance_linked["balance_date"].between(
        eod_balance_linked["t0"],
        eod_balance_linked["t1"],
    )
]

# Stack linked and incident rows, sort by date and keep the output columns
exp_fair_eod_balance_period = (
    pd.concat([eod_balance_period_alias, eod_balance_period], axis=0)
    .sort_values("balance_date")
    .loc[
        :,
        [
            "account_id",
            "balance_date",
            "end_of_day_balance",
            "daily_amount_flow",
            "account_creation_date",
            "n_transactions",
            "days_since_account_creation",
            "is_primary",
            "low_balance_flag",
            "streak_id",
            "low_balance_streak",
            "target",
            "district_id",
        ],
    ]
)

In [17]:
# Check that the class-based result matches the step-by-step result exactly.
pd.testing.assert_frame_equal(
    left=fair_eod_balance_period,
    right=exp_fair_eod_balance_period,
)

In [20]:
# Write the aggregated frame to the bucket as CSV, without the index column.
fair_eod_balance_period.to_csv(
    "gs://berkabank/production/data/04_processing/eod_balance_aggregation.csv",
    index=False,
)