In [7]:
import pandas as pd

# Load the provided data file
data = pd.read_csv('user1.csv')


In [8]:
# Data cleaning

# Deal with NaN values in our 'of-interest' fields
data['subClass_title'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

In [9]:
# Preprocessing the data

# Convert 'postDate' to datetime and sort the data
data['postDate'] = pd.to_datetime(data['postDate'])
data = data.sort_values(by='postDate')


In [10]:
from collections import defaultdict
import numpy as np

# Function to calculate the intervals in days between consecutive transactions
def calculate_intervals(dates):
    return [int((dates[i+1] - dates[i]).days) for i in range(len(dates)-1)]

# Identifying regular transactions
regular_transactions = defaultdict(list)

# Analyze each merchant and category for regular intervals (e.g., monthly)
for (merchant, category), group in data.groupby(['description', 'subClass_title']):
    if len(group) > 1:
        dates = group['postDate'].dt.date.unique()
        intervals = calculate_intervals(sorted(dates))
        
        # Check if intervals are consistent (e.g., around 30 days, allowing some tolerance)
        if intervals and np.std(intervals) <= 5: # Using standard deviation as a measure of consistency
            regular_transactions[(merchant, category)] = intervals


In [11]:
# Classifying transactions as 'bills' and calculating next expected date
bill_reminders = []

for (description, subClass_title), intervals in regular_transactions.items():
    # Calculate the average interval (assuming monthly bills)
    avg_interval = np.mean(intervals)
    
    # Identify the most recent transaction for this bill
    recent_transaction = data[(data['description'] == description) & (data['subClass_title'] == subClass_title)].iloc[-1]
    
    # Calculate next expected bill date
    next_bill_date = recent_transaction['postDate'] + pd.Timedelta(days=avg_interval)
    
    bill_reminders.append({
        'Description': description,
        'Category': subClass_title,
        'Most Recent Transaction': recent_transaction['postDate'],
        'Next Expected Date': next_bill_date,
        'Average Interval (days)': avg_interval
    })

# Convert to DataFrame for better visualization
bill_reminders_df = pd.DataFrame(bill_reminders)

# Display the first few rows of the reminders
bill_reminders_df.head()


Unnamed: 0,Description,Category,Most Recent Transaction,Next Expected Date,Average Interval (days)
0,BPAY MANLY COUNCIL,Regulatory Services,2021-11-25 00:00:00+00:00,2022-02-26 12:00:00+00:00,93.5
1,Funds Transfer transfer,Auxiliary Finance and Investment Services,2023-05-22 00:00:00+00:00,2025-01-26 00:00:00+00:00,615.0
2,Mortgage Interest Payment 746833,,2023-11-16 00:00:00+00:00,2023-12-16 08:48:00+00:00,30.366667
3,Payroll WFRMS 15439393,Unknown,2023-10-27 00:00:00+00:00,2023-11-26 08:16:33.103448276+00:00,30.344828
4,TFR Acc14000 TO 12389,Legal and Accounting Services,2023-11-11 00:00:00+00:00,2023-12-11 08:48:00+00:00,30.366667
