In [None]:
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## Columns to load from the source file (everything else is skipped on import)
cols_to_import = [
    'serial_num_field',
    'activation_date',
    'activation_amount',
    'last_trans_date',
    'cur_balance',
]

In [None]:
## Load the selected columns of the .csv file into a DataFrame named `data`
file_path = '/content/drive/My Drive/Colab Notebooks/3102469.cert_liabilities.20250102.1379839..csv'
data = pd.read_csv(
    filepath_or_buffer=file_path,
    usecols=cols_to_import,  # parse only the columns listed in cols_to_import
)

In [None]:
# Display the (rows, columns) shape of the loaded frame.
data.shape

(27609695, 5)

In [None]:
## Drop every row whose current balance is exactly $0
nonzero_condition = data['cur_balance'].ne(0)  # boolean mask: True where balance != 0
nonzero_data = data[nonzero_condition]
nonzero_data.shape


(5568321, 5)

In [None]:
# Preview the first rows that survived the non-zero balance filter.
nonzero_data.head()

Unnamed: 0,serial_num_field,activation_date,activation_amount,last_trans_date,cur_balance
0,31066000-79637498,2024-07-10,25.0,2024-07-10,25.0
1,31066000-79637666,2024-07-10,25.0,2024-07-10,25.0
2,31066000-79994655,2024-07-10,25.0,2024-07-10,25.0
7,603571036-134000020,2013-12-24,100.0,2013-12-27,6.33
12,603571036-134000025,2014-01-01,50.0,2014-01-01,50.0


In [None]:
## Create a running total column using the "cur_balance" column
# nonzero_data is a filtered selection of `data`, so adding a column to it in
# place makes pandas emit SettingWithCopyWarning. .assign() returns a new frame
# carrying the extra column, which avoids the warning and keeps this cell safe
# to re-run (the cumulative sum is always recomputed from cur_balance).
nonzero_data = nonzero_data.assign(
    running_total=nonzero_data['cur_balance'].cumsum()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonzero_data['running_total'] = nonzero_data['cur_balance'].cumsum()


In [None]:
# List the column labels to confirm running_total is now present.
nonzero_data.columns

Index(['serial_num_field', 'activation_date', 'activation_amount',
       'last_trans_date', 'cur_balance', 'running_total'],
      dtype='object')

In [None]:
## Keep only the essential columns, in this order
cols_to_keep = [
    'serial_num_field',
    'activation_date',
    'activation_amount',
    'last_trans_date',
    'cur_balance',
    'running_total',
]
trimmed_data = nonzero_data.loc[:, cols_to_keep]

In [None]:
## Total the running_total and cur_balance columns and print both figures
# NOTE(review): running_total is a cumulative sum of cur_balance, so summing the
# column adds up every partial sum rather than giving the balance total --
# confirm this figure is really the one intended.
Running_total_sum = trimmed_data['running_total'].sum()
Current_bal_sum = trimmed_data['cur_balance'].sum()

# sep="" glues the label and the value together with no space in between.
print("The Running Total Sum is: $", Running_total_sum, sep="")
print("The Current Balance Sum is: $", Current_bal_sum, sep="")

The Running Total Sum is: $464722001310986.44
The Current Balance Sum is: $164420625.87000015


In [None]:
## Create function to breakout dataset into subpopulations within KPMG Threshold
def breakout_df(df=None, hit_threshold=1000):
  """Save and return the leading rows of ``df`` up to a running-total threshold.

  Counts the rows whose ``running_total`` is at or below ``hit_threshold`` and
  keeps that many rows, plus one more, from the top of ``df``. The selection is
  written to ``BHN_breakout_hit_$<hit_threshold>.csv`` in the current working
  directory (without the index) and is also returned.

  Args:
    df: DataFrame containing a ``running_total`` column. When omitted, the
      notebook-level ``trimmed_data`` frame is used, looked up at call time so
      that re-running earlier cells is picked up.
    hit_threshold: Value compared against ``running_total``.

  Returns:
    DataFrame holding the first ``count + 1`` rows of ``df``.
  """
  if df is None:
    df = trimmed_data

  # Number of rows still at or under the threshold.
  row_count = int((df['running_total'] <= hit_threshold).sum())

  # One extra row is kept beyond the counted ones.
  # NOTE(review): taking head() of this count assumes running_total never
  # decreases down the frame (i.e. no negative balances) -- confirm.
  rows_needed = row_count + 1

  df_save_name = 'BHN_breakout_hit_$' + str(hit_threshold) + '.csv'
  breakout_rows = df.head(rows_needed)
  breakout_rows.to_csv(df_save_name, index=False)
  return breakout_rows

In [None]:
## KPMG thresholds to break the data out at, one breakout file per value
kpmg_thresholds = [
    39851498.91,
    81724843.47,
    123598188.02,
]

# Run the breakout for each threshold in turn; the returned frame is not kept,
# only the .csv written by breakout_df is used.
for hit in kpmg_thresholds:
  breakout_df(df=trimmed_data, hit_threshold=hit)