In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
PROCESSED_PATH = '../data/processed/'

In [3]:
df = pd.read_csv(f'{PROCESSED_PATH}/transactions.csv')
df.head()

Unnamed: 0,Date,Description,Withdrawals ($),Deposits ($),Balance ($)
0,"Jan 01, 2025",,OpeningBalance,,22.01
1,"Jan 02, 2025",Withdrawal 25991738 Free Interac E Transfer,8.00,,14.01
2,"Jan 02, 2025",MB Transfer from 232001691155,,60.0,74.01
3,"Jan 02, 2025",Point of sale purchase Apos Driver Services Va...,15.00,,59.01
4,"Jan 02, 2025",Point of sale purchase Apos Driver Services Va...,10.00,,49.01


#### Data Exploration

In [4]:
df.shape

(618, 5)

In [5]:
df.columns

Index(['Date', 'Description', 'Withdrawals ($)', 'Deposits ($)',
       'Balance ($)'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Date             618 non-null    object
 1   Description      605 non-null    object
 2   Withdrawals ($)  509 non-null    object
 3   Deposits ($)     109 non-null    object
 4   Balance ($)      618 non-null    object
dtypes: object(5)
memory usage: 24.3+ KB


In [7]:
df.isnull().sum()

Date                 0
Description         13
Withdrawals ($)    109
Deposits ($)       509
Balance ($)          0
dtype: int64

In [8]:
renames = {
    'Date':'date',
    'Description':'description',
    'Withdrawals ($)':'withdrawals',
    'Deposits ($)':'deposits',
    'Balance ($)':'balance'
}
df = df.rename(columns=renames)
df.columns

Index(['date', 'description', 'withdrawals', 'deposits', 'balance'], dtype='object')

In [9]:
df.loc[df['withdrawals'] == 'OpeningBalance', 'description'] = 'Opening Balance'
df.loc[df['withdrawals'] == 'OpeningBalance', 'withdrawals'] = np.nan

df.loc[df['withdrawals'] == 'ClosingBalance', 'description'] = 'Closing Balance'
df.loc[df['withdrawals'] == 'ClosingBalance', 'withdrawals'] = np.nan

In [10]:
df['withdrawals'] = df['withdrawals'].str.replace(',','')
df['withdrawals'] = df['withdrawals'].str.replace('.','')
df['withdrawals'] = df['withdrawals'].str.replace('$','')

df['deposits'] = df['deposits'].str.replace(',','')
df['deposits'] = df['deposits'].str.replace('.','')
df['deposits'] = df['deposits'].str.replace('$','')

df['balance'] = df['balance'].str.replace(',','')
df['balance'] = df['balance'].str.replace('.','')
df['balance'] = df['balance'].str.replace('$','')

df['withdrawals'] = df['withdrawals'].astype(np.float32)
df['deposits'] = df['deposits'].astype(np.float32)
df['balance'] = df['balance'].astype(np.float32)
df['date'] = pd.to_datetime(df['date'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         618 non-null    datetime64[ns]
 1   description  618 non-null    object        
 2   withdrawals  484 non-null    float32       
 3   deposits     109 non-null    float32       
 4   balance      618 non-null    float32       
dtypes: datetime64[ns](1), float32(3), object(1)
memory usage: 17.0+ KB


In [11]:
df.head()

Unnamed: 0,date,description,withdrawals,deposits,balance
0,2025-01-01,Opening Balance,,,2201.0
1,2025-01-02,Withdrawal 25991738 Free Interac E Transfer,800.0,,1401.0
2,2025-01-02,MB Transfer from 232001691155,,6000.0,7401.0
3,2025-01-02,Point of sale purchase Apos Driver Services Va...,1500.0,,5901.0
4,2025-01-02,Point of sale purchase Apos Driver Services Va...,1000.0,,4901.0


In [12]:
df.to_csv(f"{PROCESSED_PATH}/clean_df.csv", index=False)

#### Feature Engineering

In [13]:
df_clean = pd.read_csv(f"{PROCESSED_PATH}/clean_df.csv")
df.head()

Unnamed: 0,date,description,withdrawals,deposits,balance
0,2025-01-01,Opening Balance,,,2201.0
1,2025-01-02,Withdrawal 25991738 Free Interac E Transfer,800.0,,1401.0
2,2025-01-02,MB Transfer from 232001691155,,6000.0,7401.0
3,2025-01-02,Point of sale purchase Apos Driver Services Va...,1500.0,,5901.0
4,2025-01-02,Point of sale purchase Apos Driver Services Va...,1000.0,,4901.0


In [14]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         618 non-null    object 
 1   description  618 non-null    object 
 2   withdrawals  484 non-null    float64
 3   deposits     109 non-null    float64
 4   balance      618 non-null    float64
dtypes: float64(3), object(2)
memory usage: 24.3+ KB
