# Preparing data for Machine Learning

In this notebook, we will prepare the Informative Financial Tables for machine learning by cleaning and transforming the data.

### Loading necessary libraries

In [1]:
import pandas as pd
import numpy as np

# Import statements for Functions.py
import sys
import os

sys.path.append(os.path.abspath(os.path.join('..', '..', 'src', 'python')))
from Functions import *



### Data importing

### Unneeded columns removal

In [None]:
# Columns carried over from `df` into the modelling dataset.
keep_columns = [
    # Company characteristics (for grouping only - remove as features later)
    'ja_kodas',
    'form_kodas',
    'stat_kodas',
    'Ekonominės veiklos pavadinimas',
    'Apskritis',
    'Savivaldybė',

    # Operational metrics
    'Vidutinis atlyginimas (EUR)',
    'Darbuotojų skaičius',
    'VĖLUOJANČIOS ATASKAITOS',

    # Financial position
    'ILGALAIKIS TURTAS',
    'TRUMPALAIKIS TURTAS',
    'MOKĖTINOS SUMOS IR KITI ĮSIPAREIGOJIMAI',
    'NUOSAVAS KAPITALAS',

    # Performance ratios (good predictors)
    'GRYNOJO PELNO MARŽA (%)',
    'ROA (%)',
    'TURTO APYVARTUMAS (santykis)',
    'SVERTINIS MOKUMO PAKAITALAS (santykis)'
]

# Current-period columns from which the prediction targets are derived.
# The values themselves are NOT next year's figures; they become targets only
# after being shifted one row back within each company (step 4 below).
target_columns = [
    'PARDAVIMO PAJAMOS',            # revenue -> next_year_revenue
    'GRYNASIS PELNAS (NUOSTOLIAI)'  # profit  -> next_year_profit
]

# 1. Columns kept for processing; 'beginning_date' is required to order each
#    company's rows chronologically.
all_columns = keep_columns + ['beginning_date']

# 2. Order rows by company and date. shift(-1) returns the value from the next
#    ROW of the group, so without this sort the "next year" targets would
#    depend on whatever order `df` happened to be loaded in.
#    The stable sort keeps the original relative order of exact ties, and the
#    original index labels are preserved.
df_sorted = df.sort_values(['ja_kodas', 'beginning_date'], kind='stable')

# 3. Creating features dataset (rows now follow the sorted order)
features_df = df_sorted[all_columns].copy()

# 4. Creating targets: each row receives the following period's value for the
#    same company; the last row of every company gets NaN.
#    NOTE(review): assumes one row per company per reporting year with no
#    gaps - otherwise "next row" is not "next year". TODO confirm.
revenue_column, profit_column = target_columns
by_company = df_sorted.groupby('ja_kodas')
features_df['next_year_revenue'] = by_company[revenue_column].shift(-1)
features_df['next_year_profit'] = by_company[profit_column].shift(-1)

# 5. Removing the company identifier before training.
#    NOTE(review): the other grouping columns and 'beginning_date' are still
#    present in features_for_training - confirm they are handled downstream.
features_for_training = features_df.drop(columns=['ja_kodas'])