## Importing the libraries:


In [1]:
import pandas as pd
from typing import Union, List, Dict
import numpy as np
import openpyxl
import os
import re
import xlsxwriter

## Importing the datasets:


In [2]:
# Importing dataset of balance and P&L function:
# Execution might take 10 minutes or more due to the large size of the datasets.

balance2025 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2025.xlsx')
balance2024 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2024.xlsx')
balance2023 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2023.xlsx')
balance2022 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2022.xlsx')
balance2021 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2021.xlsx')
balance2020 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2020.xlsx')

pnl2025 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2025.xlsx')
pnl2024 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2024.xlsx')
pnl2023 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2023.xlsx')
pnl2022 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2022.xlsx')
pnl2021 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2021.xlsx')
pnl2020 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2020.xlsx')

### Fallbacks for large dfs:

In [57]:

# Fallbacks for large dfs:
B2025 = balance2025.copy()
B2024 = balance2024.copy()
B2023 = balance2023.copy()
B2022 = balance2022.copy()
B2021 = balance2021.copy()
B2020 = balance2020.copy()

P2025 = pnl2025.copy()
P2024 = pnl2024.copy()
P2023 = pnl2023.copy()
P2022 = pnl2022.copy()
P2021 = pnl2021.copy()
P2020 = pnl2020.copy()

# Saving in lists for functions:
balance_list = [B2025, B2024, B2023, B2022, B2021, B2020]
pnl_list = [P2025, P2024, P2023, P2022, P2021, P2020]

## Cleaning the data with functions:
#### Removing unnecessary columns:

In [58]:
# Unnecessary column removal from list of dataframes:
def remove_mutual_unnecessary_columns(df_list):
    for df in df_list:
        remove_columns = ['ja_pavadinimas', 'obj_pav','form_pav','template_name',  'standard_name', 'form_pavadinimas','line_type_id', 'stat_pavadinimas', 'stat_pav']
        for col in remove_columns:
            if col in df.columns:
                df.drop(columns=col, inplace=True)
    return df_list

# Removing unnecessary rows from list of dataframes:
remove_mutual_unnecessary_columns(balance_list)
remove_mutual_unnecessary_columns(pnl_list)

# Renaming collumns to be the same across dataframes:
def rename_columns(df_list):
    for df in df_list:
        # Rename 'obj_kodas' to 'ja_kodas' if it exists
        if 'obj_kodas' in df.columns:
            df.rename(columns={'obj_kodas': 'ja_kodas'}, inplace=True)

        # Rename other columns if they exist
        column_mapping = {
            'nuosavas_kapitalas': 'NUOSAVAS KAPITALAS',
            'mok_sumos_ir_isipareigojimai': 'MOKĖTINOS SUMOS IR KITI ĮSIPAREIGOJIMAI',
            'trumpalaikis_turtas': 'TRUMPALAIKIS TURTAS',
            'ilgalaikis_turtas': 'ILGALAIKIS TURTAS',
            'pelnas_pries_apmokestinima': 'PELNAS (NUOSTOLIAI) PRIEŠ APMOKESTINIMĄ',
        'grynasis_pelnas': 'GRYNASIS PELNAS (NUOSTOLIAI)',
        'pardavimo_pajamos': 'PARDAVIMO PAJAMOS'
        }

        for old_col, new_col in column_mapping.items():
            if old_col in df.columns:
                df.rename(columns={old_col: new_col}, inplace=True)

    return df_list

rename_columns(balance_list)
rename_columns(pnl_list)




[         ja_kodas  form_kodas  stat_kodas template_id standard_id  \
 0       110003978         310           0      FS0329      IST024   
 1       110003978         310           0      FS0329      IST024   
 2       110003978         310           0      FS0329      IST024   
 3       110004884         310           0      FS0718      IST209   
 4       110004884         310           0      FS0718      IST209   
 ...           ...         ...         ...         ...         ...   
 423454  307123738         310           0      FS0329      IST024   
 423455  307193537         960           0      FS0522      IST024   
 423456  307193537         960           0      FS0522      IST024   
 423457  307438075         960           0      FS0522      IST118   
 423458  307438075         960           0      FS0522      IST118   
 
                                       line_name  reiksme beginning_date  \
 0                             PARDAVIMO PAJAMOS    97545     2024-01-01   
 1    

#### Extracting columns line_name, reiksme and ja_kodas:
This function is uneeded but if you need to view just the extracted columns, you can use it:

In [59]:
# Extracting columns line_name, reiksme and ja_kodas from dfs with all of this data:
def extract_line_name_reiksme_ja_kodas(df_list):
    """
    Extract columns 'line_name', 'reiksme', and 'ja_kodas' from DataFrames
    that contain all three columns.

    Parameters:
    -----------
    df_list : list of pandas.DataFrame
        List of DataFrames to process (will be modified in-place)

    Returns:
    --------
    list of pandas.DataFrame
        List of DataFrames containing only the three specified columns
    """
    extracted_dfs = []

    required_columns = ['line_name', 'reiksme', 'ja_kodas']

    for i, df in enumerate(df_list):
        # Check if all required columns exist in the current DataFrame
        if all(col in df.columns for col in required_columns):
            # Extract only the required columns
            extracted_df = df[required_columns].copy()
            extracted_dfs.append(extracted_df)


        else:
            missing_cols = [col for col in required_columns if col not in df.columns]
            print(f"DataFrame {i}: Missing columns {missing_cols} - skipped")

    print(f"\nExtracted {len(extracted_dfs)} out of {len(df_list)} DataFrames")
    return extracted_dfs

B_extracted = extract_line_name_reiksme_ja_kodas(balance_list)
P_extracted = extract_line_name_reiksme_ja_kodas(pnl_list)

# Renaming the extracted dfs to be more descriptive:
B_extracted_2025 = B_extracted[0]
B_extracted_2024 = B_extracted[1]

P_extracted_2025 = P_extracted[0]
P_extracted_2024 = P_extracted[1]

# New lists with extracted and renamed dfs:
B_extracted_renamed = [B_extracted_2025, B_extracted_2024]
P_extracted_renamed = [P_extracted_2025, P_extracted_2024]

DataFrame 2: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 3: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 4: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 5: Missing columns ['line_name', 'reiksme'] - skipped

Extracted 2 out of 6 DataFrames
DataFrame 2: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 3: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 4: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 5: Missing columns ['line_name', 'reiksme'] - skipped

Extracted 2 out of 6 DataFrames


#### Shifting the column data of extraced dfs so that every row is unique with new columns:

In [60]:
# Shifting the column data of extraced dfs so that every row is unique:
def pivot_dfs(df_list, aggfunc='first'):
    """
    Apply pivot transformation to multiple DataFrames with many columns.
    Only uses 'line_name', 'reiksme', 'ja_kodas' for pivoting, ignores other columns.

    Parameters:
    -----------
    df_list : list of pandas.DataFrame
        List of DataFrames to pivot (can have many additional columns)
    aggfunc : str or function, default 'first'
        Aggregation function for duplicates

    Returns:
    --------
    list of pandas.DataFrame
        List of pivoted DataFrames
    """

    def pivot_line_names_to_columns(df, aggfunc='first'):
        """
        Pivot using only the three required columns, ignoring all others.
        """
        # Select only the columns needed for pivoting
        pivot_data = df[['ja_kodas', 'line_name', 'reiksme']].copy()

        # Pivot the table
        pivoted_df = pivot_data.pivot_table(
            index='ja_kodas',
            columns='line_name',
            values='reiksme',
            aggfunc=aggfunc
        ).reset_index()

        # Reset column names and clean up the DataFrame
        pivoted_df.columns.name = None

        return pivoted_df

    pivoted_dfs = []

    for i, df in enumerate(df_list):
        try:
            # Check if required columns exist
            required_columns = ['line_name', 'reiksme', 'ja_kodas']
            if not all(col in df.columns for col in required_columns):
                missing_cols = [col for col in required_columns if col not in df.columns]
                print(f"DataFrame {i}: Missing columns {missing_cols}. Available: {list(df.columns)}")
                pivoted_dfs.append(df)
                continue

            # Show info about the DataFrame
            print(f"DataFrame {i}: Original columns: {len(df.columns)}")
            print(f"DataFrame {i}: Using 3 columns for pivoting, ignoring {len(df.columns) - 3} other columns")
            print(f"DataFrame {i}: Unique line_name values: {df['line_name'].nunique()}")
            print(f"DataFrame {i}: Unique ja_kodas values: {df['ja_kodas'].nunique()}")

            # Perform pivot (only uses the 3 required columns)
            pivoted_df = pivot_line_names_to_columns(df, aggfunc)
            pivoted_dfs.append(pivoted_df)
            print(f"DataFrame {i}: Successfully pivoted. Original shape: {df.shape}, Pivoted shape: {pivoted_df.shape}")

        except Exception as e:
            print(f"DataFrame {i}: Error during pivoting - {e}")
            pivoted_dfs.append(df)

    return pivoted_dfs

pivot_dfs(B_extracted_renamed)
pivot_dfs(P_extracted_renamed)

# Renaming the pivoted dfs to be more descriptive:
B_pivoted_2025 = pivot_dfs(B_extracted_renamed)[0]
B_pivoted_2024 = pivot_dfs(B_extracted_renamed)[1]

P_pivoted_2025 = pivot_dfs(P_extracted_renamed)[0]
P_pivoted_2024 = pivot_dfs(P_extracted_renamed)[1]

# Saving in lists:
B_pivoted = pivot_dfs(B_extracted_renamed)
P_pivoted = pivot_dfs(P_extracted_renamed)

DataFrame 0: Original columns: 3
DataFrame 0: Using 3 columns for pivoting, ignoring 0 other columns
DataFrame 0: Unique line_name values: 9
DataFrame 0: Unique ja_kodas values: 146512
DataFrame 0: Successfully pivoted. Original shape: (615188, 3), Pivoted shape: (146512, 10)
DataFrame 1: Original columns: 3
DataFrame 1: Using 3 columns for pivoting, ignoring 0 other columns
DataFrame 1: Unique line_name values: 9
DataFrame 1: Unique ja_kodas values: 141315
DataFrame 1: Successfully pivoted. Original shape: (604271, 3), Pivoted shape: (141315, 10)
DataFrame 0: Original columns: 3
DataFrame 0: Using 3 columns for pivoting, ignoring 0 other columns
DataFrame 0: Unique line_name values: 8
DataFrame 0: Unique ja_kodas values: 146512
DataFrame 0: Successfully pivoted. Original shape: (423459, 3), Pivoted shape: (146512, 9)
DataFrame 1: Original columns: 3
DataFrame 1: Using 3 columns for pivoting, ignoring 0 other columns
DataFrame 1: Unique line_name values: 8
DataFrame 1: Unique ja_kodas 

In [64]:
# Pivoting the big dataframes:
def pivot_dfs_smart(df_list):
    """
    Apply pivot transformation with smart aggregation for different column types.
    """

    def pivot_line_names_to_columns_smart(df):
        """
        Pivot with intelligent aggregation based on column data types.
        """
        # Identify column types for smart aggregation
        numeric_columns = []
        string_columns = []

        for col in df.columns:
            if col in ['line_name', 'reiksme']:
                continue
            if pd.api.types.is_numeric_dtype(df[col]):
                numeric_columns.append(col)
            else:
                string_columns.append(col)

        # Create aggregation dictionary
        aggregation_dict = {'reiksme': 'first'}

        # For numeric columns, use 'first' or 'mean' depending on context
        for col in numeric_columns:
            aggregation_dict[col] = 'first'

        # For string columns, use 'first' (take the first occurrence)
        for col in string_columns:
            aggregation_dict[col] = 'first'

        # Identify index columns (all columns except line_name and reiksme)
        index_columns = [col for col in df.columns if col not in ['line_name', 'reiksme']]

        # Perform pivot
        pivoted_df = df.pivot_table(
            index=index_columns,
            columns='line_name',
            values='reiksme',
            aggfunc='first'
        ).reset_index()

        # Reset column names
        pivoted_df.columns.name = None

        return pivoted_df

    pivoted_dfs = []

    for i, df in enumerate(df_list):
        try:
            required_columns = ['line_name', 'reiksme', 'ja_kodas']
            if not all(col in df.columns for col in required_columns):
                missing_cols = [col for col in required_columns if col not in df.columns]
                print(f"DataFrame {i}: Missing columns {missing_cols}")
                pivoted_dfs.append(df)
                continue

            print(f"DataFrame {i}: Preserving {len(df.columns) - 2} columns in result")

            pivoted_df = pivot_line_names_to_columns_smart(df)
            pivoted_dfs.append(pivoted_df)
            print(f"DataFrame {i}: Successfully pivoted. Original shape: {df.shape}, Pivoted shape: {pivoted_df.shape}")

        except Exception as e:
            print(f"DataFrame {i}: Error during pivoting - {e}")
            pivoted_dfs.append(df)

    return pivoted_dfs

# Balance data:
finalB = pivot_dfs_smart(balance_list)
# PnL data:
finalP = pivot_dfs_smart(pnl_list)

# Renaming the final dfs to be more descriptive:
finalB2025 = finalB[0]
finalB2024 = finalB[1]

finalP2025 = finalP[0]
finalP2024 = finalP[1]



DataFrame 0: Preserving 9 columns in result
DataFrame 0: Successfully pivoted. Original shape: (615188, 11), Pivoted shape: (156318, 18)
DataFrame 1: Preserving 9 columns in result
DataFrame 1: Successfully pivoted. Original shape: (604271, 11), Pivoted shape: (153607, 18)
DataFrame 2: Missing columns ['line_name', 'reiksme']
DataFrame 3: Missing columns ['line_name', 'reiksme']
DataFrame 4: Missing columns ['line_name', 'reiksme']
DataFrame 5: Missing columns ['line_name', 'reiksme']
DataFrame 0: Preserving 9 columns in result
DataFrame 0: Successfully pivoted. Original shape: (423459, 11), Pivoted shape: (156318, 17)
DataFrame 1: Preserving 9 columns in result
DataFrame 1: Successfully pivoted. Original shape: (406755, 11), Pivoted shape: (153607, 17)
DataFrame 2: Missing columns ['line_name', 'reiksme']
DataFrame 3: Missing columns ['line_name', 'reiksme']
DataFrame 4: Missing columns ['line_name', 'reiksme']
DataFrame 5: Missing columns ['line_name', 'reiksme']


## Adding PnL data to balance data: