# Importing Libraries

In [None]:
import pandas as pd
import sys
import os

In [None]:
# Put the parent of the current working directory on the import path so the
# project-local import that follows can be resolved from this notebook.
project_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_path not in sys.path:
    sys.path.append(project_path)

import utils.get_data_utils as gdu

# Data indicators and filters metadata

In [None]:
# Years covered by the analysis, as strings: "2010" through "2023" inclusive
# (the upper bound of range() is exclusive).
time_period = list(map(str, range(2010, 2024)))

# Countries covered by the analysis: the 27 EU member states, given as
# two-letter codes (Greece is listed as "EL").
country_codes = [
    "BE", "EL", "LT", "PT",  # Belgium, Greece, Lithuania, Portugal
    "BG", "ES", "LU", "RO",  # Bulgaria, Spain, Luxembourg, Romania
    "CZ", "FR", "HU", "SI",  # Czechia, France, Hungary, Slovenia
    "DK", "HR", "MT", "SK",  # Denmark, Croatia, Malta, Slovakia
    "DE", "IT", "NL", "FI",  # Germany, Italy, Netherlands, Finland
    "EE", "CY", "AT", "SE",  # Estonia, Cyprus, Austria, Sweden
    "IE", "LV", "PL",        # Ireland, Latvia, Poland
]


# Indicators used in the analysis. Each key is a dataset code and each value
# holds the dimension filters for that dataset plus a short "label" name.
indicators = {
    # House price index
    "prc_hpi_a": {
        "unit": "RCH_A_AVG",  # Annual average rate of change
        "purchase": "TOTAL",
        "label": "house_price_index",
    },
    # Standardised house price-to-income ratio
    "tipsho60": {
        "unit": "PTIR_LT_AVG",  # Price-to-income ratio relative to long-term average
        "label": "price_income_ratio",
    },
    # Building permits
    "sts_cobp_a": {
        "unit": "I21",  # Index, 2021=100
        "indic_bt": "BPRM_SQM",  # Building permits - m2 of useful floor area
        "cpa2_1": "CPA_F41001",  # Residential buildings
        "label": "building_permits",
    },
    # Immigration by age group, sex and citizenship
    "migr_imm1ctz": {
        "citizen": "TOTAL",
        "agedef": "REACH",
        "age": "TOTAL",
        "sex": "T",
        "label": "immigration",
    },
    # Construction producer prices or costs, new residential buildings
    "sts_copi_a": {
        "unit": "PCH_SM",  # Percentage change compared to same period in previous year
        "indic_bt": "PRC_PRR",  # Producer prices
        "label": "construction_prices",
    },
    # Crime, violence or vandalism in the area
    "ilc_mddw03": {
        "hhtyp": "TOTAL",
        "incgrp": "TOTAL",
        "label": "crime_rate",
    },
    # Annual net earnings
    "earn_nt_net": {
        "ecase": "P1_NCH_AW100",  # Single person without children earning 100% of the average earning
        "currency": "EUR",
        "estruct": "NET",  # Net earning
        "label": "net_earnings",
    },
    # At-risk-of-poverty rate after deducting housing costs by degree of urbanisation
    "ilc_li48": {
        "deg_urb": "TOTAL",
        "label": "poverty_rate",
    },
    # Population density
    "tps00003": {
        "label": "pop_density",
    },
    # Crude marriage rate
    "tps00206": {
        "label": "marriage_rate",
    },
    # HICP - inflation rate. This is the only entry that sets "freq" itself.
    # NOTE(review): "I15" is presumably an index (2015=100) rather than a rate,
    # and the frequency is monthly while the time filter applied below lists
    # plain years - confirm the fetch helper handles both as intended.
    "prc_hicp_midx": {
        "unit": "I15",
        "coicop": "CP00",
        "freq": "M",
        "label": "inflation_rate",
    },
    # Population change - crude rates of total change, natural change and
    # net migration plus adjustment
    "tps00019": {
        "indic_de": "GROWRT",  # Crude rate of total population change
        "label": "pop_change",
    },
    # Unemployment rate by sex
    "tesem120": {
        "sex": "T",
        "label": "unemployment_rate",
    },
    # Arrears on mortgage or rent payments
    "ilc_mdes06": {
        "hhtyp": "TOTAL",
        "incgrp": "TOTAL",
        "label": "late_payment_rate",
    },
}


# Add the filters common to every indicator: the country list, the year list
# and - unless the entry already declares its own - annual frequency ("A").
# The same two list objects are shared by all entries (no copies are made).
for indicator in indicators:
    indicators[indicator].update(geo=country_codes, time=time_period)
    indicators[indicator].setdefault('freq', 'A')

print(indicators)

# Downloading and saving raw data

In [None]:
# Fetch all configured indicators through the project's data helper module.
# NOTE(review): the result is presumably a pandas DataFrame (it is written
# out with .to_csv further down) - confirm in utils.get_data_utils.
df = gdu.get_eurostat_data(indicators)
print("Data retrieved from Eurostat")

In [None]:
# Write the downloaded data to the raw-data CSV, leaving out the row index.
raw_data_path = "../data/raw/eu_analysis_data.csv"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)
df.to_csv(raw_data_path, index=False)
# Report the exact path that was written.
print(f"Data saved to {raw_data_path}")

# Data Cleaning

In [None]:
# Preview the first rows of the downloaded data.
df.head()

In [None]:
# Print a summary of the columns: names, dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()