<a href="https://colab.research.google.com/github/DrAdamDev/ETL-pipeline-for-UK-Employment-data/blob/main/ETL_Accounts_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
from google.colab import files
import pandas as pd

In [None]:
# Alternative input path, currently disabled: mount Google Drive instead of uploading.
#drive.mount('content')
# Ask Colab for a file upload. NOTE(review): presumably the accounts CSV that the
# next cell reads by its bare filename is expected to be among the uploaded files.
files.upload()

In [123]:
# Name of the CSV export that the pipeline reads (relative to the working directory).
Accounts_filename = 'Accounts 22 - 23.csv'

# Account names treated as the owner's own accounts. A row counts as an internal
# transaction only when BOTH its source and its target name appear in this list.
# NOTE(review): 'NaN' is the literal string here; values that pandas parsed as
# missing are not matched by `isin` against this string -- confirm the intent.
internal_accounts = [
    'DR ADAM TUTORING LTD',
    'Adam Francis',
    'Blublocs Ltd',
    'NaN',
    'DR A TUTORING',
]

# Regular-expression fragments that identify business counterparties. They are
# OR-joined with '|' and matched against 'Target name' with `str.contains`, so
# characters that must match literally ('*', '.') are backslash-escaped. Raw
# strings keep those backslashes without relying on invalid escape sequences
# such as '\*', which newer Python versions warn about.
business_targets = [
    'STRIPE',
    'Blublocs Ltd',
    'Fiverreu',
    'Facebk',
    r'Www\.lalal\.ai',
    'Imjas Kaloth',
    r'Opn\*My Gogoprint',
    r'Google \*Services',
    'Intechno Software',
    'DR ADAM TUTORING LTD',
    'Tutorboss',
    'Taxscouts',
]

# Columns of the raw export that are not needed in the final table.
dropped_columns = [
    'Status',
    'Reference',
    'Created on',
    'Source fee amount',
    'Source fee currency',
    'Target amount (after fees)',
    'Exchange rate',
    'Source currency',
    'Target currency',
]

# Raw column name -> name used in the final table.
column_name_mapping = {
    "Finished on": "Transaction date",
    "Source amount (after fees)": "Amount (USD)",
}

# ---------------------------------------------------------------------------
# Load
# ---------------------------------------------------------------------------
# Read the raw export once; everything below is derived from this frame, so
# re-running the cell always starts from the same input.
accounts_raw = pd.read_csv(Accounts_filename)

# ---------------------------------------------------------------------------
# Clean
# ---------------------------------------------------------------------------
# Remove columns that are entirely empty, then the redundant columns.
# errors='ignore': a column listed in dropped_columns may already have been
# removed by the all-null drop, which would otherwise raise a KeyError.
accounts_df = (
    accounts_raw
    .dropna(axis=1, how='all')
    .drop(columns=dropped_columns, errors='ignore')
)

# Drop internal transactions: rows whose source AND target are both own accounts.
is_internal = (
    accounts_df['Source name'].isin(internal_accounts)
    & accounts_df['Target name'].isin(internal_accounts)
)
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the previous frame (SettingWithCopyWarning).
accounts_df = accounts_df[~is_internal].copy()

# Split the ID column at its first '-' into a type part and an ID part.
accounts_df[['Transaction type', 'Transaction ID']] = (
    accounts_df['ID'].str.split('-', n=1, expand=True)
)

# Flag business transactions: the target name matches any business_targets
# pattern. na=False makes a missing target name count as "No" instead of
# leaving a NaN in the flag column.
business_pattern = '|'.join(business_targets)
is_business = accounts_df['Target name'].str.contains(business_pattern, na=False)
accounts_df['Business transaction'] = is_business.map({True: 'Yes', False: 'No'})

# Remaining row/column clean-up, in the same order as before:
#   1. drop card cashback rows
#   2. apply the final column names
#   3. keep only the calendar date of the transaction timestamp
#   4. drop the original ID column (now split into type + ID)
#   5. drop rows whose Direction is NEUTRAL
#   6. index the table by Transaction ID
accounts_df = (
    accounts_df
    .loc[lambda df: df['Transaction type'] != 'CARD_CASHBACK']
    .rename(columns=column_name_mapping)
    .assign(**{'Transaction date': lambda df: pd.to_datetime(df['Transaction date']).dt.date})
    .drop(columns='ID')
    .loc[lambda df: df['Direction'] != 'NEUTRAL']
    .set_index('Transaction ID')
)

# Drop rows with a duplicated Transaction ID. keep=False removes EVERY row that
# shares an ID with another row (no copy is kept).
# NOTE(review): confirm that discarding all copies, rather than keeping one, is intended.
accounts_df = accounts_df[~accounts_df.index.duplicated(keep=False)].copy()

# Outgoing amounts become negative so that a plain sum yields the net figure.
is_outgoing = accounts_df['Direction'] == 'OUT'
accounts_df.loc[is_outgoing, 'Amount (USD)'] *= -1

# ---------------------------------------------------------------------------
# Summary row (net profit)
# ---------------------------------------------------------------------------
total_in_out = accounts_df.groupby('Direction')['Amount (USD)'].sum()
# .get with a default: a file without any IN (or OUT) rows must not raise KeyError.
total_in = total_in_out.get('IN', 0.0)
total_out = total_in_out.get('OUT', 0.0)
net_profit = round(total_in + total_out, 2)

# Derive the covered period from the data instead of hard-coding it, so the
# label stays correct when a different export is processed.
transaction_dates = accounts_df['Transaction date'].dropna()
if transaction_dates.empty:
    period_label = 'N/A'
else:
    period_label = f"{transaction_dates.min()} -> {transaction_dates.max()}"

summary_row = pd.DataFrame(
    {
        'Direction': 'Net Profit',
        'Transaction date': period_label,
        'Source name': 'STRIPE',
        'Amount (USD)': net_profit,
        'Target name': 'DR ADAM TUTORING LTD',
        'Transaction type': 'ALL TRANSFERS',
        'Business transaction': 'Yes',
    },
    index=pd.Index(['Summary'], name='Transaction ID'),
)
accounts_df = pd.concat([accounts_df, summary_row])

# ---------------------------------------------------------------------------
# Category column
# ---------------------------------------------------------------------------
# Exact target name -> category. Keys are literal names (not regex patterns).
category_mapping = {
    'DR ADAM TUTORING LTD': 'Income',
    'Blublocs Ltd': 'Income',
    'Fiverreu': 'Services',
    'Tutorboss': 'Training',
    'Www.lalal.ai': 'Services',
    'Taxscouts': 'Services',
    'Imjas Kaloth': 'Marketing',
    'STRIPE': 'REFUND',
    'Opn*My Gogoprint': 'Marketing',
    'Google *Services': 'Advertising',
    'Intechno Software': 'IT Services',
}

target_names = accounts_df['Target name']
# Target names starting with 'Facebk' carry a varying suffix, so they are
# matched by prefix rather than through the exact-name mapping. na=False keeps
# the mask strictly boolean when a target name is missing.
is_facebook = target_names.str.startswith('Facebk', na=False)
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which does not reliably update the parent frame.
accounts_df['Category'] = (
    target_names
    .map(category_mapping)
    .mask(is_facebook, 'Advertising')
    .fillna('Personal')
)


In [124]:
# Preview the transformed accounts table (at most its first 90 rows).
accounts_df.head(90)

Unnamed: 0_level_0,Direction,Transaction date,Source name,Amount (USD),Target name,Transaction type,Business transaction,Category
Transaction ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
593574615,IN,2023-01-26,STRIPE,291.00,DR ADAM TUTORING LTD,TRANSFER,Yes,Income
569292131,OUT,2023-01-22,DR ADAM TUTORING LTD,-15.05,Jaya Grocer-Gurneypar,CARD_TRANSACTION,No,Personal
562028748,OUT,2023-01-15,DR ADAM TUTORING LTD,-84.40,Fiverreu,CARD_TRANSACTION,Yes,Services
557899033,OUT,2023-01-12,DR ADAM TUTORING LTD,-68.82,Southern Bank Berhad,CARD_TRANSACTION,No,Personal
557896728,OUT,2023-01-12,DR ADAM TUTORING LTD,-35.87,Lotus's Penang Egate,CARD_TRANSACTION,No,Personal
...,...,...,...,...,...,...,...,...
378174060,OUT,2022-07-30,Blublocs Ltd,-50.00,Intechno Software,CARD_TRANSACTION,Yes,IT Services
462771956,IN,2022-07-28,STRIPE,524.04,Blublocs Ltd,TRANSFER,Yes,Income
374799445,OUT,2022-07-27,Blublocs Ltd,-0.45,Facebk Lf5fnj7zp2,CARD_TRANSACTION,Yes,Advertising
373857584,OUT,2022-07-26,Blublocs Ltd,-29.34,Facebk Ayjqmj7zp2,CARD_TRANSACTION,Yes,Advertising


In [126]:
# Name of the Excel workbook to produce.
output_filename = 'Accounts 22 - 23.xlsx'

# Write the table to the workbook; index=True keeps the Transaction ID index
# as the first column of the sheet.
accounts_df.to_excel(output_filename, index=True)

# Send the workbook to the browser as a download. `files` comes from the
# notebook's import cell, so it is not re-imported here.
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>