### Importing Libraries and Source Functions

In [None]:
import pandas as pd
from typing import Union, List, Dict
import numpy as np
import openpyxl
import sys
import os

from src.python import check_and_pull_git_lfs, read_large_file, pivot_dfs_smart, set_columns_to_datetime, \
    filter_rows_by_value

sys.path.append(os.path.abspath(os.path.join('..', '..', 'src', 'python')))
from Functions import *


### Loading the Dataset from CSV

In [None]:
# Run the project helper that checks for / pulls the git-lfs tracked files
# before anything is read from disk.
check_and_pull_git_lfs()

# Balance statement extracts, one CSV per file year, read newest first.
(
    balance2025,
    balance2024,
    balance2023,
    balance2022,
    balance2021,
    balance2020,
) = (
    read_large_file(csv_path)
    for csv_path in (
        '../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2025_n.csv',
        '../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2024_n.csv',
        '../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2023.csv',
        '../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2022.csv',
        '../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2021.csv',
        '../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2020.csv',
    )
)

# Profit-and-loss statement extracts, same year ordering as above.
(
    pnl_2025,
    pnl_2024,
    pnl_2023,
    pnl_2022,
    pnl_2021,
    pnl_2020,
) = (
    read_large_file(csv_path)
    for csv_path in (
        '../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2025_n.csv',
        '../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2024_n.csv',
        '../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2023.csv',
        '../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2022.csv',
        '../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2021.csv',
        '../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2020.csv',
    )
)


### Making backups of the original datasets

In [None]:
# All further processing uses these copies; the objects returned by
# read_large_file stay bound to balance20xx / pnl_20xx.

# Balance statements (newest file first). The list and the B20xx names refer
# to the very same copied objects.
balance_statements = [
    loaded.copy()
    for loaded in (balance2025, balance2024, balance2023, balance2022, balance2021, balance2020)
]
B2025, B2024, B2023, B2022, B2021, B2020 = balance_statements

# PnL statements, same ordering and the same list/name aliasing.
pnl_statements = [
    loaded.copy()
    for loaded in (pnl_2025, pnl_2024, pnl_2023, pnl_2022, pnl_2021, pnl_2020)
]
P2025, P2024, P2023, P2022, P2021, P2020 = pnl_statements

### Removing unnecessary columns

In [None]:
# The same set of columns is dropped from both statement types; each call
# still receives its own list object.
_columns_to_drop = (
    'obj_pav',
    'form_pav',
    'stat_pav',
    'template_name',
    'standard_name',
    'formavimo_data',
    'ja_pavadinimas',
    'form_pavadinimas',
    'stat_pavadinimas',
    'line_type_id',
)

# NOTE(review): the result is re-assigned even though inplace=True is passed;
# confirm remove_columns returns the list of frames in that mode.

# Balance statements
bad_cols_bal = list(_columns_to_drop)
balance_statements = remove_columns(balance_statements, bad_cols_bal, verbose=True, inplace=True)

# PnL statements
bad_cols_pnl = list(_columns_to_drop)
pnl_statements = remove_columns(pnl_statements, bad_cols_pnl, verbose=True, inplace=True)

### Renaming columns to be the same across all datasets

In [None]:
# Old -> new column names, kept as pairs so each rename is readable at a
# glance; the parallel lists passed to rename_columns_if_exist are derived
# from these mappings (dicts preserve insertion order).
#
# NOTE(review): later cells filter B2025 / P2025 etc. directly on
# 'beginning_date' and 'turning_date', so this step presumably renames the
# frames in place rather than returning new ones — confirm.

# General columns, shared by balance and PnL statements
_general_renames = {
    'obj_kodas': 'ja_kodas',
    'stat_statusas': 'stat_kodas',
    'laikotarpis_nuo': 'beginning_date',
    'laikotarpis_iki': 'turning_date',
}
column_list = list(_general_renames)
rename_list = list(_general_renames.values())

# Balance statements: financial columns
_balance_financial_renames = {
    'nuosavas_kapitalas': 'NUOSAVAS KAPITALAS',
    'mok_sumos_ir_isipareigojimai': 'MOKĖTINOS SUMOS IR KITI ĮSIPAREIGOJIMAI',
    'ilgalaikis_turtas': 'ILGALAIKIS TURTAS',
    'trumpalaikis_turtas': 'TRUMPALAIKIS TURTAS',
}
financial_cols_bal = list(_balance_financial_renames)
financial_rename_bal = list(_balance_financial_renames.values())

balance_statements = rename_columns_if_exist(balance_statements, column_list, rename_list)
balance_statements = rename_columns_if_exist(balance_statements, financial_cols_bal, financial_rename_bal)

# PnL statements: financial columns (general columns are the shared ones above)
_pnl_financial_renames = {
    'pelnas_pries_apmokestinima': 'PELNAS (NUOSTOLIAI) PRIEŠ APMOKESTINIMĄ',
    'grynasis_pelnas': 'GRYNASIS PELNAS (NUOSTOLIAI)',
    'pardavimo_pajamos': 'PARDAVIMO PAJAMOS',
}
financial_cols_pnl = list(_pnl_financial_renames)
financial_rename_pnl = list(_pnl_financial_renames.values())

pnl_statements = rename_columns_if_exist(pnl_statements, column_list, rename_list)
pnl_statements = rename_columns_if_exist(pnl_statements, financial_cols_pnl, financial_rename_pnl)

### Pivoting the data for new columns

In [None]:
# Both statement types are pivoted with identical options; the frames in each
# list are modified in place (inplace=True) and the return value is not kept.
_pivot_options = {'date_column': 'reg_date', 'inplace': True}

# Balance statements
pivot_dfs_smart(balance_statements, **_pivot_options)

# PnL statements
pivot_dfs_smart(pnl_statements, **_pivot_options)

### Converting date columns to datetime

In [None]:
# Every frame (balance and PnL alike) carries the same three date columns.
cols_to_datetime = [
    'reg_date',
    'beginning_date',
    'turning_date',
]

# The conversion is requested in place, so the return value is not kept.
_datetime_options = {'inplace': True}

# Balance statements
set_columns_to_datetime(balance_statements, cols_to_datetime, **_datetime_options)

# PnL statements
set_columns_to_datetime(pnl_statements, cols_to_datetime, **_datetime_options)

### Removing wrong financial year data

In [None]:
# Balance statements
#
# Each yearly file is filtered to the calendar year preceding the year in its
# name (B2025 -> periods within 2024, B2024 -> 2023, ...).
# NOTE(review): filter_rows_by_value presumably treats a tuple as an inclusive
# (start, end) range and a list as a set of exact allowed values — confirm
# against the helper's implementation.

# Inclusive of new companies starting businesses after the start of the financial year
B2025_all = filter_rows_by_value(
    data=B2025,
    column={
        'beginning_date': ('2024-01-01', '2024-12-31'),
        'turning_date': ('2024-01-01', '2024-12-31'),
    },
    verbose=True,
)

B2024_all = filter_rows_by_value(
    data=B2024,
    column={
        'beginning_date': ('2023-01-01', '2023-12-31'),
        'turning_date': ('2023-01-01', '2023-12-31'),
    },
    verbose=True,
)

B2023_all = filter_rows_by_value(
    data=B2023,
    column={
        'beginning_date': ('2022-01-01', '2022-12-31'),
        'turning_date': ('2022-01-01', '2022-12-31'),
    },
    verbose=True,
)

B2022_all = filter_rows_by_value(
    data=B2022,
    column={
        'beginning_date': ('2021-01-01', '2021-12-31'),
        'turning_date': ('2021-01-01', '2021-12-31'),
    },
    verbose=True,
)

B2021_all = filter_rows_by_value(
    data=B2021,
    column={
        'beginning_date': ('2020-01-01', '2020-12-31'),
        'turning_date': ('2020-01-01', '2020-12-31'),
    },
    verbose=True,
)

B2020_all = filter_rows_by_value(
    data=B2020,
    column={
        'beginning_date': ('2019-01-01', '2019-12-31'),
        'turning_date': ('2019-01-01', '2019-12-31'),
    },
    verbose=True,
)

# Saving in list for further processing
balance_statements_all = [B2025_all, B2024_all, B2023_all, B2022_all, B2021_all, B2020_all]


# Only full financial year data: the period must start on 1 January and end
# on 31 December of the year preceding the file year.

B2025_full = filter_rows_by_value(
    data=B2025,
    column={
        'beginning_date': ['2024-01-01'],
        'turning_date': ['2024-12-31'],
    },
    verbose=True,
)

B2024_full = filter_rows_by_value(
    data=B2024,
    column={
        'beginning_date': ['2023-01-01'],
        'turning_date': ['2023-12-31'],
    },
    verbose=True,
)

B2023_full = filter_rows_by_value(
    data=B2023,
    column={
        'beginning_date': ['2022-01-01'],
        'turning_date': ['2022-12-31'],
    },
    verbose=True,
)

# The 2022 file covers financial year 2021, consistent with B2022_all above.
# (It was previously filtered on the 2020 dates, duplicating B2021_full's window.)
B2022_full = filter_rows_by_value(
    data=B2022,
    column={
        'beginning_date': ['2021-01-01'],
        'turning_date': ['2021-12-31'],
    },
    verbose=True,
)

B2021_full = filter_rows_by_value(
    data=B2021,
    column={
        'beginning_date': ['2020-01-01'],
        'turning_date': ['2020-12-31'],
    },
    verbose=True,
)

B2020_full = filter_rows_by_value(
    data=B2020,
    column={
        'beginning_date': ['2019-01-01'],
        'turning_date': ['2019-12-31'],
    },
    verbose=True,
)

# Saving in list for further processing
balance_statements_full = [B2025_full, B2024_full, B2023_full, B2022_full, B2021_full, B2020_full]


In [None]:
# PnL statements
#
# Each yearly file is filtered to the calendar year preceding the year in its
# name (P2025 -> periods within 2024, P2024 -> 2023, ...).
# NOTE(review): filter_rows_by_value presumably treats a tuple as an inclusive
# (start, end) range and a list as a set of exact allowed values — confirm
# against the helper's implementation.

# Inclusive of new companies starting businesses after the start of the financial year
P2025_all = filter_rows_by_value(
    data=P2025,
    column={
        'beginning_date': ('2024-01-01', '2024-12-31'),
        'turning_date': ('2024-01-01', '2024-12-31'),
    },
    verbose=True,
)

P2024_all = filter_rows_by_value(
    data=P2024,
    column={
        'beginning_date': ('2023-01-01', '2023-12-31'),
        'turning_date': ('2023-01-01', '2023-12-31'),
    },
    verbose=True,
)

P2023_all = filter_rows_by_value(
    data=P2023,
    column={
        'beginning_date': ('2022-01-01', '2022-12-31'),
        'turning_date': ('2022-01-01', '2022-12-31'),
    },
    verbose=True,
)

P2022_all = filter_rows_by_value(
    data=P2022,
    column={
        'beginning_date': ('2021-01-01', '2021-12-31'),
        'turning_date': ('2021-01-01', '2021-12-31'),
    },
    verbose=True,
)

P2021_all = filter_rows_by_value(
    data=P2021,
    column={
        'beginning_date': ('2020-01-01', '2020-12-31'),
        'turning_date': ('2020-01-01', '2020-12-31'),
    },
    verbose=True,
)

P2020_all = filter_rows_by_value(
    data=P2020,
    column={
        'beginning_date': ('2019-01-01', '2019-12-31'),
        'turning_date': ('2019-01-01', '2019-12-31'),
    },
    verbose=True,
)

# Saving in list for further processing
pnl_statements_all = [P2025_all, P2024_all, P2023_all, P2022_all, P2021_all, P2020_all]

# Only full financial year data: the period must start on 1 January and end
# on 31 December of the year preceding the file year.

P2025_full = filter_rows_by_value(
    data=P2025,
    column={
        'beginning_date': ['2024-01-01'],
        'turning_date': ['2024-12-31'],
    },
    verbose=True,
)

P2024_full = filter_rows_by_value(
    data=P2024,
    column={
        'beginning_date': ['2023-01-01'],
        'turning_date': ['2023-12-31'],
    },
    verbose=True,
)

P2023_full = filter_rows_by_value(
    data=P2023,
    column={
        'beginning_date': ['2022-01-01'],
        'turning_date': ['2022-12-31'],
    },
    verbose=True,
)

# The 2022 file covers financial year 2021, consistent with P2022_all above.
# (It was previously filtered on the 2020 dates, duplicating P2021_full's window.)
P2022_full = filter_rows_by_value(
    data=P2022,
    column={
        'beginning_date': ['2021-01-01'],
        'turning_date': ['2021-12-31'],
    },
    verbose=True,
)

P2021_full = filter_rows_by_value(
    data=P2021,
    column={
        'beginning_date': ['2020-01-01'],
        'turning_date': ['2020-12-31'],
    },
    verbose=True,
)

P2020_full = filter_rows_by_value(
    data=P2020,
    column={
        'beginning_date': ['2019-01-01'],
        'turning_date': ['2019-12-31'],
    },
    verbose=True,
)

# Saving in list for further processing
pnl_statements_full = [P2025_full, P2024_full, P2023_full, P2022_full, P2021_full, P2020_full]

### Removing duplicates if they are present