In [2]:
import pandas as pd
import re
from datetime import datetime
pd.set_option('display.max_colwidth', 200)

In [8]:
from masakali_data.exchange_rates import get_exchange_rates

rates = get_exchange_rates(2024)

filename = "january_24.csv"

df = pd.read_csv(filename).astype({"amount_idr": "float64"})
df = df.drop(columns=["rate"])

df["date"] = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
rates["date"] = pd.to_datetime(rates["date"]).dt.strftime("%Y-%m-%d")
df = df.merge(rates, on="date", how="left")


usd_mask = df[df.amount_usd.notnull()]
idr_mask = df[df.amount_idr.notnull()]

print(usd_mask.head(1))


df.loc[usd_mask.index, "amount_idr"] = (usd_mask["amount_usd"] * usd_mask["rate"]).round()
df.loc[idr_mask.index, "amount_usd"] = (idr_mask["amount_idr"] / idr_mask["rate"]).round(2)

print(df.head(10))


df.to_csv(filename, index=False)

         date  amount_usd  amount_idr    description category  is_split  \
0  2024-01-01       19.19    295187.0  $19.19 Google  Amarose     False   

          rate  
0  15382.31503  
         date  amount_usd  amount_idr  \
0  2024-01-01       19.19    295187.0   
1  2024-01-01      845.13  13000056.0   
2  2024-01-02       32.00    494290.0   
3  2024-01-02        5.00     77233.0   
4  2024-01-02        9.00    139019.0   
5  2024-01-03     1453.50  22475028.0   
6  2024-01-03      600.00   9277618.0   
7  2024-01-03     2000.00  30925392.0   
8  2024-01-03       45.00    695821.0   
9  2024-01-03       25.00    386567.0   

                                         description  \
0                                      $19.19 Google   
1           13mm to Ira for salaries and op expenses   
2                                            $32 gas   
3                                          $5 drinks   
4                                            $9 food   
5          22,475mm Ira for

In [3]:
def convert_mm_to_int(mm_string):
    # Remove 'mm' suffix
    mm_string = mm_string.replace('mm', '')
    
    # Split the string at the last period
    parts = mm_string.split('.', 1)

    # Handle different formats
    if len(parts) == 2:  # If there's a decimal part
        whole, fraction = parts
        fraction = fraction.replace('.', '')  # Remove other periods from the whole part
        number = float(f"{whole}.{fraction}")
    else:
        number = float(parts[0].replace('.', ''))

    # # Convert to the desired integer scale
    return int(number * 1000000)

In [14]:
def process_data(data, year):
    # Split the data into lines
    lines = data.split('\n')

    # Prepare lists to store the extracted data
    dates = []
    amounts_usd = []
    amounts_idr = []
    descriptions = []
    categories = []
    is_splits = []

    # Regular expression for capturing amounts and descriptions
    amount_regex = r'(\$(\d+(?:\.\d+)?)|(\d+(?:\.\d+)*)mm)'
    desc_regex = r'(.*) \((.*)\)'

    # Function to add data to lists
    def add_data(date, amount_usd, amount_idr, description, category, is_split):
        try:
            # Parse and format the date
            parsed_date = datetime.strptime(date.strip(), '%m/%d').replace(year=year).strftime('%Y-%m-%d')
            dates.append(parsed_date)
            amounts_usd.append(float(amount_usd) if amount_usd else None)
            amounts_idr.append(amount_idr if amount_idr else None)
            descriptions.append(description)
            categories.append(category)
            is_splits.append(is_split)
        except:
            print(f"Error processing line: {date}, {amount_usd}, {amount_idr}, {description}, {category}, {is_split}")

    # Iterate over each line
    for line in lines:
        if line:
            # Check if the line is a date
            if re.match(r'\d{1,2}/\d{1,2}', line):
                print(line)
                current_date = line
            else:
                # Find all amounts in the line
                amounts = re.findall(amount_regex, line)
                total_usd = sum([float(amount[1]) for amount in amounts if amount[1]])
                total_idr = sum(convert_mm_to_int(amount[2]) for amount in amounts if amount[2])

                # Extract description and category
                match = re.search(desc_regex, line)
                if match:
                    description, category = match.groups()
                    is_split = 'split' in category.lower()

                    if is_split:
                        # Split the payment into two categories
                        split_categories = category.split('split between ')
                        split_categories = split_categories[1].split(' and ')
                        num_categories = len(split_categories)
                        split_amount_usd = total_usd / num_categories if total_usd else None
                        split_amount_idr = total_idr / num_categories if total_idr else None

                        for cat in split_categories:
                            add_data(current_date, split_amount_usd, split_amount_idr, description, cat.strip(), True)
                    else:
                        # Normal entry
                        add_data(current_date, total_usd, total_idr, description, category, False)

    # Create DataFrame
    df = pd.DataFrame({
        'date': dates,
        'amount_usd': amounts_usd,
        'amount_idr': amounts_idr,
        'description': descriptions,
        'category': categories,
        'is_split': is_splits
    })

    return df


filename = "january_24"

with open(filename + '.txt', 'r') as file:
    data = file.read()
    df = process_data(data, 2024)
    display(df)

    df.to_csv(filename + '.csv', index=False)

1/1
1/2
1/3
1/4
1/5
1/6 
1/7
1/8 
1/9
1/10 
1/11
1/12
1/13
1/14
1/15
1/16
1/17
1/18
1/19
1/20
1/21
1/22
1/23
1/24
1/25
1/26
1/27
1/28
1/29
1/30 
1/31


Unnamed: 0,date,amount_usd,amount_idr,description,category,is_split
0,2023-01-01,19.190,,$19.19 Google,Amarose,False
1,2023-01-01,,13000000.0,13mm to Ira for salaries and op expenses,Amarose,False
2,2023-01-02,32.000,,$32 gas,law firm,False
3,2023-01-02,5.000,,$5 drinks,personal,False
4,2023-01-02,9.000,,$9 food,personal,False
...,...,...,...,...,...,...
132,2023-01-30,,,20k water,personal,False
133,2023-01-30,,,200k massage,personal,False
134,2023-01-30,6.475,,$12.95 Express VPN,law firm,True
135,2023-01-30,6.475,,$12.95 Express VPN,Amarose,True
