## Importing the libraries:


In [4]:
import pandas as pd
from typing import Union, List, Dict
import numpy as np
import openpyxl
import os
import re
import xlsxwriter

## Importing the datasets:


In [5]:
# Importing dataset of balance and P&L function:
# Execution might take 10 minutes or more due to the large size of the datasets.

balance2025 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2025.xlsx')
balance2024 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2024.xlsx')
balance2023 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2023.xlsx')
balance2022 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2022.xlsx')
balance2021 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2021.xlsx')
balance2020 = pd.read_excel('../../data/raw/firm-balance-statements/JAR_FA_RODIKLIAI_BLNS_2020.xlsx')

pnl2025 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2025.xlsx')
pnl2024 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2024.xlsx')
pnl2023 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2023.xlsx')
pnl2022 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2022.xlsx')
pnl2021 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2021.xlsx')
pnl2020 = pd.read_excel('../../data/raw/firm-PnL-statements/JAR_FA_RODIKLIAI_PLNA_2020.xlsx')

### Fallbacks for large dfs:

In [12]:

# Fallbacks for large dfs:
B2025 = balance2025.copy()
B2024 = balance2024.copy()
B2023 = balance2023.copy()
B2022 = balance2022.copy()
B2021 = balance2021.copy()
B2020 = balance2020.copy()

P2025 = pnl2025.copy()
P2024 = pnl2024.copy()
P2023 = pnl2023.copy()
P2022 = pnl2022.copy()
P2021 = pnl2021.copy()
P2020 = pnl2020.copy()

# Saving in lists for functions:
balance_list = [B2025, B2024, B2023, B2022, B2021, B2020]
pnl_list = [P2025, P2024, P2023, P2022, P2021, P2020]

## Cleaning the data with functions:
#### Removing unnecessary columns:

In [13]:
# Unnecessary column removal from list of dataframes:
def remove_mutual_unnecessary_columns(df_list):
    for df in df_list:
        remove_columns = ['ja_pavadinimas', 'obj_pav','form_pav','template_name',  'standard_name', 'form_pavadinimas','line_type_id', 'stat_pavadinimas', 'stat_pav']
        for col in remove_columns:
            if col in df.columns:
                df.drop(columns=col, inplace=True)
    return df_list

# Removing unnecessary rows from list of dataframes:
remove_mutual_unnecessary_columns(balance_list)
remove_mutual_unnecessary_columns(pnl_list)

# Renaming collumns to be the same across dataframes:
def rename_columns(df_list):
    for df in df_list:
        # Rename 'obj_kodas' to 'ja_kodas' if it exists
        if 'obj_kodas' in df.columns:
            df.rename(columns={'obj_kodas': 'ja_kodas'}, inplace=True)

        # Rename other columns if they exist
        column_mapping = {
            'nuosavas_kapitalas': 'NUOSAVAS KAPITALAS',
            'mok_sumos_ir_isipareigojimai': 'MOKĖTINOS SUMOS IR KITI ĮSIPAREIGOJIMAI',
            'trumpalaikis_turtas': 'TRUMPALAIKIS TURTAS',
            'ilgalaikis_turtas': 'ILGALAIKIS TURTAS',
            'pelnas_pries_apmokestinima': 'PELNAS (NUOSTOLIAI) PRIEŠ APMOKESTINIMĄ',
        'grynasis_pelnas': 'GRYNASIS PELNAS (NUOSTOLIAI)',
        'pardavimo_pajamos': 'PARDAVIMO PAJAMOS'
        }

        for old_col, new_col in column_mapping.items():
            if old_col in df.columns:
                df.rename(columns={old_col: new_col}, inplace=True)

    return df_list

rename_columns(balance_list)
rename_columns(pnl_list)




[         ja_kodas  form_kodas  stat_kodas template_id standard_id  \
 0       110003978         310           0      FS0329      IST024   
 1       110003978         310           0      FS0329      IST024   
 2       110003978         310           0      FS0329      IST024   
 3       110004884         310           0      FS0718      IST209   
 4       110004884         310           0      FS0718      IST209   
 ...           ...         ...         ...         ...         ...   
 423454  307123738         310           0      FS0329      IST024   
 423455  307193537         960           0      FS0522      IST024   
 423456  307193537         960           0      FS0522      IST024   
 423457  307438075         960           0      FS0522      IST118   
 423458  307438075         960           0      FS0522      IST118   
 
                                       line_name  reiksme beginning_date  \
 0                             PARDAVIMO PAJAMOS    97545     2024-01-01   
 1    

#### Extracting columns line_name, reiksme and ja_kodas:
This function is uneeded but if you need to view just the extracted columns, you can use it:

In [14]:
# Extracting columns line_name, reiksme and ja_kodas from dfs with all of this data:
def extract_line_name_reiksme_ja_kodas(df_list):
    """
    Extract columns 'line_name', 'reiksme', and 'ja_kodas' from DataFrames
    that contain all three columns.

    Parameters:
    -----------
    df_list : list of pandas.DataFrame
        List of DataFrames to process (will be modified in-place)

    Returns:
    --------
    list of pandas.DataFrame
        List of DataFrames containing only the three specified columns
    """
    extracted_dfs = []

    required_columns = ['line_name', 'reiksme', 'ja_kodas']

    for i, df in enumerate(df_list):
        # Check if all required columns exist in the current DataFrame
        if all(col in df.columns for col in required_columns):
            # Extract only the required columns
            extracted_df = df[required_columns].copy()
            extracted_dfs.append(extracted_df)


        else:
            missing_cols = [col for col in required_columns if col not in df.columns]
            print(f"DataFrame {i}: Missing columns {missing_cols} - skipped")

    print(f"\nExtracted {len(extracted_dfs)} out of {len(df_list)} DataFrames")
    return extracted_dfs

B_extracted = extract_line_name_reiksme_ja_kodas(balance_list)
P_extracted = extract_line_name_reiksme_ja_kodas(pnl_list)

# Renaming the extracted dfs to be more descriptive:
B_extracted_2025 = B_extracted[0]
B_extracted_2024 = B_extracted[1]

P_extracted_2025 = P_extracted[0]
P_extracted_2024 = P_extracted[1]

# New lists with extracted and renamed dfs:
B_extracted_renamed = [B_extracted_2025, B_extracted_2024]
P_extracted_renamed = [P_extracted_2025, P_extracted_2024]

DataFrame 2: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 3: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 4: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 5: Missing columns ['line_name', 'reiksme'] - skipped

Extracted 2 out of 6 DataFrames
DataFrame 2: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 3: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 4: Missing columns ['line_name', 'reiksme'] - skipped
DataFrame 5: Missing columns ['line_name', 'reiksme'] - skipped

Extracted 2 out of 6 DataFrames


#### Shifting the column data of extraced dfs so that every row is unique with new columns:

In [15]:
# Shifting the column data of extraced dfs so that every row is unique:
def pivot_dfs(df_list, aggfunc='first'):
    """
    Apply pivot transformation to multiple DataFrames with many columns.
    Only uses 'line_name', 'reiksme', 'ja_kodas' for pivoting, ignores other columns.

    Parameters:
    -----------
    df_list : list of pandas.DataFrame
        List of DataFrames to pivot (can have many additional columns)
    aggfunc : str or function, default 'first'
        Aggregation function for duplicates

    Returns:
    --------
    list of pandas.DataFrame
        List of pivoted DataFrames
    """

    def pivot_line_names_to_columns(df, aggfunc='first'):
        """
        Pivot using only the three required columns, ignoring all others.
        """
        # Select only the columns needed for pivoting
        pivot_data = df[['ja_kodas', 'line_name', 'reiksme']].copy()

        # Pivot the table
        pivoted_df = pivot_data.pivot_table(
            index='ja_kodas',
            columns='line_name',
            values='reiksme',
            aggfunc=aggfunc
        ).reset_index()

        # Reset column names and clean up the DataFrame
        pivoted_df.columns.name = None

        return pivoted_df

    pivoted_dfs = []

    for i, df in enumerate(df_list):
        try:
            # Check if required columns exist
            required_columns = ['line_name', 'reiksme', 'ja_kodas']
            if not all(col in df.columns for col in required_columns):
                missing_cols = [col for col in required_columns if col not in df.columns]
                print(f"DataFrame {i}: Missing columns {missing_cols}. Available: {list(df.columns)}")
                pivoted_dfs.append(df)
                continue

            # Show info about the DataFrame
            print(f"DataFrame {i}: Original columns: {len(df.columns)}")
            print(f"DataFrame {i}: Using 3 columns for pivoting, ignoring {len(df.columns) - 3} other columns")
            print(f"DataFrame {i}: Unique line_name values: {df['line_name'].nunique()}")
            print(f"DataFrame {i}: Unique ja_kodas values: {df['ja_kodas'].nunique()}")

            # Perform pivot (only uses the 3 required columns)
            pivoted_df = pivot_line_names_to_columns(df, aggfunc)
            pivoted_dfs.append(pivoted_df)
            print(f"DataFrame {i}: Successfully pivoted. Original shape: {df.shape}, Pivoted shape: {pivoted_df.shape}")

        except Exception as e:
            print(f"DataFrame {i}: Error during pivoting - {e}")
            pivoted_dfs.append(df)

    return pivoted_dfs

pivot_dfs(B_extracted_renamed)
pivot_dfs(P_extracted_renamed)

# Renaming the pivoted dfs to be more descriptive:
B_pivoted_2025 = pivot_dfs(B_extracted_renamed)[0]
B_pivoted_2024 = pivot_dfs(B_extracted_renamed)[1]

P_pivoted_2025 = pivot_dfs(P_extracted_renamed)[0]
P_pivoted_2024 = pivot_dfs(P_extracted_renamed)[1]

# Saving in lists:
B_pivoted = pivot_dfs(B_extracted_renamed)
P_pivoted = pivot_dfs(P_extracted_renamed)

DataFrame 0: Original columns: 3
DataFrame 0: Using 3 columns for pivoting, ignoring 0 other columns
DataFrame 0: Unique line_name values: 9
DataFrame 0: Unique ja_kodas values: 146512
DataFrame 0: Successfully pivoted. Original shape: (615188, 3), Pivoted shape: (146512, 10)
DataFrame 1: Original columns: 3
DataFrame 1: Using 3 columns for pivoting, ignoring 0 other columns
DataFrame 1: Unique line_name values: 9
DataFrame 1: Unique ja_kodas values: 141315
DataFrame 1: Successfully pivoted. Original shape: (604271, 3), Pivoted shape: (141315, 10)
DataFrame 0: Original columns: 3
DataFrame 0: Using 3 columns for pivoting, ignoring 0 other columns
DataFrame 0: Unique line_name values: 8
DataFrame 0: Unique ja_kodas values: 146512
DataFrame 0: Successfully pivoted. Original shape: (423459, 3), Pivoted shape: (146512, 9)
DataFrame 1: Original columns: 3
DataFrame 1: Using 3 columns for pivoting, ignoring 0 other columns
DataFrame 1: Unique line_name values: 8
DataFrame 1: Unique ja_kodas 

In [16]:
# Pivoting the big dataframes:
def pivot_dfs_smart(df_list):
    """
    Apply pivot transformation with smart aggregation for different column types.
    """

    def pivot_line_names_to_columns_smart(df):
        """
        Pivot with intelligent aggregation based on column data types.
        """
        # Identify column types for smart aggregation
        numeric_columns = []
        string_columns = []

        for col in df.columns:
            if col in ['line_name', 'reiksme']:
                continue
            if pd.api.types.is_numeric_dtype(df[col]):
                numeric_columns.append(col)
            else:
                string_columns.append(col)

        # Create aggregation dictionary
        aggregation_dict = {'reiksme': 'first'}

        # For numeric columns, use 'first' or 'mean' depending on context
        for col in numeric_columns:
            aggregation_dict[col] = 'first'

        # For string columns, use 'first' (take the first occurrence)
        for col in string_columns:
            aggregation_dict[col] = 'first'

        # Identify index columns (all columns except line_name and reiksme)
        index_columns = [col for col in df.columns if col not in ['line_name', 'reiksme']]

        # Perform pivot
        pivoted_df = df.pivot_table(
            index=index_columns,
            columns='line_name',
            values='reiksme',
            aggfunc='first'
        ).reset_index()

        # Reset column names
        pivoted_df.columns.name = None

        return pivoted_df

    pivoted_dfs = []

    for i, df in enumerate(df_list):
        try:
            required_columns = ['line_name', 'reiksme', 'ja_kodas']
            if not all(col in df.columns for col in required_columns):
                missing_cols = [col for col in required_columns if col not in df.columns]
                print(f"DataFrame {i}: Missing columns {missing_cols}")
                pivoted_dfs.append(df)
                continue

            print(f"DataFrame {i}: Preserving {len(df.columns) - 2} columns in result")

            pivoted_df = pivot_line_names_to_columns_smart(df)
            pivoted_dfs.append(pivoted_df)
            print(f"DataFrame {i}: Successfully pivoted. Original shape: {df.shape}, Pivoted shape: {pivoted_df.shape}")

        except Exception as e:
            print(f"DataFrame {i}: Error during pivoting - {e}")
            pivoted_dfs.append(df)

    return pivoted_dfs

# Balance data:
finalB = pivot_dfs_smart(balance_list)
# PnL data:
finalP = pivot_dfs_smart(pnl_list)

# Renaming the final dfs to be more descriptive:
finalB2025 = finalB[0]
finalB2024 = finalB[1]

finalP2025 = finalP[0]
finalP2024 = finalP[1]



DataFrame 0: Preserving 9 columns in result
DataFrame 0: Successfully pivoted. Original shape: (615188, 11), Pivoted shape: (156318, 18)
DataFrame 1: Preserving 9 columns in result
DataFrame 1: Successfully pivoted. Original shape: (604271, 11), Pivoted shape: (153607, 18)
DataFrame 2: Missing columns ['line_name', 'reiksme']
DataFrame 3: Missing columns ['line_name', 'reiksme']
DataFrame 4: Missing columns ['line_name', 'reiksme']
DataFrame 5: Missing columns ['line_name', 'reiksme']
DataFrame 0: Preserving 9 columns in result
DataFrame 0: Successfully pivoted. Original shape: (423459, 11), Pivoted shape: (156318, 17)
DataFrame 1: Preserving 9 columns in result
DataFrame 1: Successfully pivoted. Original shape: (406755, 11), Pivoted shape: (153607, 17)
DataFrame 2: Missing columns ['line_name', 'reiksme']
DataFrame 3: Missing columns ['line_name', 'reiksme']
DataFrame 4: Missing columns ['line_name', 'reiksme']
DataFrame 5: Missing columns ['line_name', 'reiksme']


## Adding PnL data to balance data:

In [17]:
# Adding PnL data to balance data:

def merge_pnl_and_balances(df_list1, df_list2, how='inner'):
    """
    Merge two lists of DataFrames on ja_kodas column with comprehensive diagnostics and validation.
    Each DataFrame from list1 is merged with corresponding DataFrame from list2.

    Parameters:
    -----------
    df_list1, df_list2 : list of pandas.DataFrame
        Lists of DataFrames to merge
    how : str, default 'inner'
        Type of merge: 'inner', 'left', 'right', 'outer'

    Returns:
    --------
    list of pandas.DataFrame
        List of merged DataFrames
    """
    if len(df_list1) != len(df_list2):
        print(f"Warning: List lengths differ - list1: {len(df_list1)}, list2: {len(df_list2)}")
        # Use the minimum length to avoid index errors
        min_length = min(len(df_list1), len(df_list2))
        df_list1 = df_list1[:min_length]
        df_list2 = df_list2[:min_length]

    merged_dfs = []

    for i, (df1, df2) in enumerate(zip(df_list1, df_list2)):
        try:
            # Check if ja_kodas exists in both DataFrames
            if 'ja_kodas' not in df1.columns:
                print(f"Pair {i}: ja_kodas not found in first DataFrame, skipping")
                continue
            if 'ja_kodas' not in df2.columns:
                print(f"Pair {i}: ja_kodas not found in second DataFrame, skipping")
                continue

            # Create copies to avoid modifying originals
            df1_clean = df1.copy()
            df2_clean = df2.copy()

            # Validate ja_kodas data types and convert if necessary
            if df1_clean['ja_kodas'].dtype != df2_clean['ja_kodas'].dtype:
                print(f"Pair {i}: ja_kodas data types differ - converting both to string")
                df1_clean['ja_kodas'] = df1_clean['ja_kodas'].astype(str)
                df2_clean['ja_kodas'] = df2_clean['ja_kodas'].astype(str)

            # Check for duplicate ja_kodas within each DataFrame
            df1_duplicates = df1_clean.duplicated(subset=['ja_kodas']).sum()
            df2_duplicates = df2_clean.duplicated(subset=['ja_kodas']).sum()

            if df1_duplicates > 0:
                print(f"Pair {i}: WARNING - {df1_duplicates} duplicate ja_kodas found in first DataFrame")
                # Keep first occurrence of duplicates
                df1_clean = df1_clean.drop_duplicates(subset=['ja_kodas'], keep='first')

            if df2_duplicates > 0:
                print(f"Pair {i}: WARNING - {df2_duplicates} duplicate ja_kodas found in second DataFrame")
                # Keep first occurrence of duplicates
                df2_clean = df2_clean.drop_duplicates(subset=['ja_kodas'], keep='first')

            # Check for NaN values in ja_kodas
            df1_nan = df1_clean['ja_kodas'].isna().sum()
            df2_nan = df2_clean['ja_kodas'].isna().sum()

            if df1_nan > 0:
                print(f"Pair {i}: WARNING - {df1_nan} NaN values in ja_kodas (first DataFrame), removing")
                df1_clean = df1_clean.dropna(subset=['ja_kodas'])

            if df2_nan > 0:
                print(f"Pair {i}: WARNING - {df2_nan} NaN values in ja_kodas (second DataFrame), removing")
                df2_clean = df2_clean.dropna(subset=['ja_kodas'])

            # Pre-merge diagnostics
            df1_unique = df1_clean['ja_kodas'].nunique()
            df2_unique = df2_clean['ja_kodas'].nunique()
            common_ja_kodas = set(df1_clean['ja_kodas']) & set(df2_clean['ja_kodas'])
            common_count = len(common_ja_kodas)

            print(f"\n--- Pair {i} Merge Diagnostics ---")
            print(f"DF1: {len(df1_clean)} rows, {df1_unique} unique ja_kodas")
            print(f"DF2: {len(df2_clean)} rows, {df2_unique} unique ja_kodas")
            print(f"Common ja_kodas: {common_count}")
            print(f"Merge type: {how}")

            # Calculate expected result sizes
            if how == 'inner':
                expected_rows = common_count
            elif how == 'left':
                expected_rows = len(df1_clean)
            elif how == 'right':
                expected_rows = len(df2_clean)
            else:  # outer
                expected_rows = len(df1_clean) + len(df2_clean) - common_count

            print(f"Expected result rows: {expected_rows}")

            # Perform the merge with indicator to track sources
            merged_df = pd.merge(
                df1_clean,
                df2_clean,
                on='ja_kodas',
                how=how,
                suffixes=('_pnl', '_balance'),
                indicator=True  # Add merge indicator column
            )

            # Post-merge diagnostics
            actual_rows = len(merged_df)
            merge_stats = merged_df['_merge'].value_counts()

            print(f"Actual result rows: {actual_rows}")
            print(f"Merge composition: {merge_stats.to_dict()}")

            if actual_rows != expected_rows:
                print(f"WARNING: Expected {expected_rows} rows but got {actual_rows} rows")

            # Remove the indicator column
            merged_df = merged_df.drop('_merge', axis=1)

            # Check for overlapping column names (besides ja_kodas)
            overlapping_cols = set(df1_clean.columns) & set(df2_clean.columns) - {'ja_kodas'}
            if overlapping_cols:
                print(f"Overlapping columns (received suffixes): {list(overlapping_cols)}")

            print(f"Pair {i}: Successfully merged. Shapes: {df1.shape} + {df2.shape} -> {merged_df.shape}")

            merged_dfs.append(merged_df)

        except Exception as e:
            print(f"Pair {i}: Error during merge - {e}")
            print(f"Pair {i}: DF1 columns: {list(df1.columns) if 'df1' in locals() else 'N/A'}")
            print(f"Pair {i}: DF2 columns: {list(df2.columns) if 'df2' in locals() else 'N/A'}")
            # Keep both original DataFrames if merge fails
            merged_dfs.extend([df1, df2])

    # Final summary
    print(f"\n=== MERGE SUMMARY ===")
    print(f"Successfully processed: {len(merged_dfs)} DataFrames")
    print(f"Total input pairs: {min(len(df_list1), len(df_list2))}")

    return merged_dfs
# Joining all the dfs:
joined_BP_list = merge_pnl_and_balances(finalB, finalP, how='inner')

# Renaming the joined dfs to be more descriptive:
joined_BP2025 = joined_BP_list[0]
joined_BP2024 = joined_BP_list[1]
joined_BP2023 = joined_BP_list[2]
joined_BP2022 = joined_BP_list[3]
joined_BP2021 = joined_BP_list[4]
joined_BP2020 = joined_BP_list[5]

# Joining all the dfs and keeping all unmatched rows:
joined_BP_all_list = merge_pnl_and_balances(finalB, finalP, how='outer')

# Renaming the joined dfs to be more descriptive:
joined_BP_all2025 = joined_BP_all_list[0]
joined_BP_all2024 = joined_BP_all_list[1]
joined_BP_all2023 = joined_BP_all_list[2]
joined_BP_all2022 = joined_BP_all_list[3]
joined_BP_all2021 = joined_BP_all_list[4]
joined_BP_all2020 = joined_BP_all_list[5]



--- Pair 0 Merge Diagnostics ---
DF1: 146512 rows, 146512 unique ja_kodas
DF2: 146512 rows, 146512 unique ja_kodas
Common ja_kodas: 146512
Merge type: inner
Expected result rows: 146512
Actual result rows: 146512
Merge composition: {'both': 146512, 'left_only': 0, 'right_only': 0}
Overlapping columns (received suffixes): ['template_id', 'stat_kodas', 'formavimo_data', 'standard_id', 'turning_date', 'beginning_date', 'reg_date', 'form_kodas']
Pair 0: Successfully merged. Shapes: (156318, 18) + (156318, 17) -> (146512, 34)

--- Pair 1 Merge Diagnostics ---
DF1: 141315 rows, 141315 unique ja_kodas
DF2: 141315 rows, 141315 unique ja_kodas
Common ja_kodas: 141315
Merge type: inner
Expected result rows: 141315
Actual result rows: 141315
Merge composition: {'both': 141315, 'left_only': 0, 'right_only': 0}
Overlapping columns (received suffixes): ['template_id', 'stat_kodas', 'formavimo_data', 'standard_id', 'turning_date', 'beginning_date', 'reg_date', 'form_kodas']
Pair 1: Successfully merg

#### File is getting long, so lets save the final dataframes to csv files and move to other notebooks where we will do some data analysis, create new variables:

In [None]:
### Notebook is gettig long, so lets save the final dataframes to csv files and move to other notebooks where we will do some data analysis, create new variables:

ImportError: cannot import name 'save_df_list_to_csv' from 'src.python.Functions' (/home/aidmantas/repos/Lithuanian-Innovation-Agency-Risk-Model/src/python/Functions.py)