In [25]:
import pandas as pd

In [26]:
def split_years(df):
    df['year'] = df['resources.description'].str.extract(r'(\d{4})')
    df['resources.description'] = df['resources.description'].str.replace(r'\d{4}', '', regex=True)
    df['year'] = df['year'].fillna(0).astype(int)

    df['year_2'] = df['resources.description'].str.extract(r'(\d{2})')
    df['resources.description'] = df['resources.description'].str.replace(r'\d{2}', '', regex=True)
    df['year_2'] = pd.to_datetime(df['year_2'], format='%y').dt.strftime('%Y')
    df['year_2'] = df['year_2'].fillna(0).astype(int)

    df['year'] = df['year'] + df['year_2']

    df = df.drop(['year_2'], axis=1)

    return df


In [27]:
def clean_subscriptions_data(subset, schedule):
    subset['schedule'] = schedule
    subset['resources.description'] = subset['resources.description'].replace({'\\(|\\)': '', '  ': ' '}, regex=True)
    subset['resources.description'] = subset['resources.description'].str.split(':', expand=False)
    subset['section_month'] = subset['resources.description'].str[1]
    subset['section_month'] = subset['section_month'].str.split(' ', expand=False)
    subset['section_month'] = subset['section_month'].apply(lambda x: [item for item in x if item])
    subset['section'] = subset['section_month'].str[0]
    subset['payment_name'] = subset['section_month'].str[1]
    subset = subset.drop(['section_month'], axis=1)

    return subset


In [28]:
def clean_activities_data(subset, schedule):
    subset['schedule'] = schedule
    subset['resources.description'] = subset['resources.description'].replace({'\\(|\\)': '', '  ': ' '}, regex=True)
    subset['resources.description'] = subset['resources.description'].str.split(':', expand=False)
    subset['payment_name'] = subset['resources.description'].str[1]
    subset['section'] = subset['payment_name'].str.split().str[0]
    subset['payment_name'] = subset['payment_name'].str.strip()

    return subset

In [29]:
def clean_data(file_path):
    df = pd.read_csv(file_path)

    columns_to_keep = ['resources.description', 'gross_amount', 'gocardless_fees', 'app_fees', 'net_amount', 'payouts.arrival_date', 'payments.metadata.Member', 'payments.metadata.References']
    df = df[columns_to_keep]
    
    # splits the years out into there own column
    df = split_years(df)
    df['total_fees'] = df['gocardless_fees'] + df['app_fees']

    payment_schedules = ['Subscriptions', 'Activities']

    dfs = []

    # for loops through teh schedules and treats each differently
    for schedule in payment_schedules:
        subset = df[df['resources.description'].str.contains(schedule)].copy()
        if schedule == 'Subscriptions':
            subset = clean_subscriptions_data(subset=subset, schedule=schedule)
        elif schedule == 'Activities':
            subset = clean_activities_data(subset=subset, schedule=schedule)
        dfs.append(subset)

    df = pd.concat(dfs, ignore_index=True)
    
    # Splits payment metadata out
    df[['payment_code', 'schedule_code', 'section_code']] = df['payments.metadata.References'].str.split('-', expand=True)
    df[['payment_code', 'schedule_code', 'section_code']] = df[['payment_code', 'schedule_code', 'section_code']].apply(lambda x: x.str.strip())

    # split out names
    df['payments.metadata.Member'] = df['payments.metadata.Member'].str.split('(', expand=False)
    df['member'] = df['payments.metadata.Member'].str[0]
    
    # df.rename(columns={'payout_arrival_date': 'payouts.arrival_date'}, inplace=True)
    
    # General tidy up
    df = df.drop(['resources.description','payments.metadata.References', 'gocardless_fees', 'app_fees', 'payments.metadata.Member'], axis=1)
    column_order = ['section', 'schedule', 'year', 'payment_name', 'gross_amount', 'total_fees', 'net_amount', 'member',
       'payouts.arrival_date', 'section_code', 'schedule_code', 'payment_code']
    
    df = df[column_order]

    
    return df

file_path='inputs_3/payout_transactions_reconciliation-export-EX00036AZABPPE.csv'
df = clean_data(file_path=file_path)
display(df)


Unnamed: 0,section,schedule,year,payment_name,gross_amount,total_fees,net_amount,member,payouts.arrival_date,section_code,schedule_code,payment_code
0,Scouts,Subscriptions,2024,January,12.0,0.52,11.48,Finley Jones,2024-02-12 00:00:00,19515,57594,714273
1,Scouts,Subscriptions,2024,January,12.0,0.52,11.48,Hudson Danks,2024-02-12 00:00:00,19515,57594,714273
2,Scouts,Subscriptions,2024,January,12.0,0.52,11.48,William Bain,2024-02-12 00:00:00,19515,57594,714273
3,Scouts,Subscriptions,2024,January,12.0,0.52,11.48,Molly Everall,2024-02-12 00:00:00,19515,57594,714273
4,Scouts,Subscriptions,2024,January,12.0,0.52,11.48,Dylan Mitchell,2024-02-12 00:00:00,19515,57594,714273
...,...,...,...,...,...,...,...,...,...,...,...,...
136,Scouts,Activities,2024,Scouts Log Cabin,58.0,1.84,56.16,Zoe Fellows,2024-02-12 00:00:00,19515,57593,730026
137,Scouts,Activities,2024,Scouts Log Cabin,58.0,1.84,56.16,William Bain,2024-02-12 00:00:00,19515,57593,730026
138,Scouts,Activities,2024,Scouts Log Cabin,58.0,1.84,56.16,Hudson Danks,2024-02-12 00:00:00,19515,57593,730026
139,Scouts,Activities,2024,Scouts Log Cabin,58.0,1.84,56.16,Henrietta Powell,2024-02-12 00:00:00,19515,57593,730026


In [30]:
df = df.drop('payments.metadata.References', axis=1)
df

KeyError: "['payments.metadata.References'] not found in axis"