In [1]:
import pandas as pd
import os

# ---------------------------------------------------
# 1. Load Excel File and Inspect Data
# ---------------------------------------------------

# Specify the path to your Excel file
file_path = 'C:/Users/danie/Nextcloud/Coding/Masterthesis/data/raw/Socio_Economic_Accounts.xlsx'

# Load the Excel file and specifically load the 'DATA' sheet
excel_data = pd.ExcelFile(file_path)

# Check available sheet names to confirm
print(f"Available sheets: {excel_data.sheet_names}")

# Load the 'DATA' sheet
df = pd.read_excel(file_path, sheet_name='DATA')

# ---------------------------------------------------
# 2. Clean Column Names (if necessary)
# ---------------------------------------------------

# Ensure all column names are strings
df.columns = df.columns.map(str)

# Strip any leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# ---------------------------------------------------
# 3. Extract Available Data Information (Country, Variable, and Year Range)
# ---------------------------------------------------

# Check if the expected columns exist, adjust the names if they differ from expectations
if all(col in df.columns for col in ['country', 'variable']):
    # Extract unique countries
    unique_countries = df['country'].unique()
    print(f"Unique countries: {unique_countries}")

    # Initialize a dictionary to store information for each country
    country_data = {}

    # Identify valid year columns (those that are entirely numeric and represent valid years)
    year_columns = [col for col in df.columns if col.isdigit()]

    if not year_columns:
        print("Error: No valid year columns found.")
    else:
        print(f"Identified year columns: {year_columns}")

        # Loop through each country to gather available data
        for country in unique_countries:
            country_specific_data = df[df['country'] == country]

            # Extract unique variables for this country
            unique_variables = country_specific_data['variable'].unique()

            # Prepare a sub-dictionary for this country's data
            country_data[country] = {}

            for variable in unique_variables:
                variable_data = country_specific_data[country_specific_data['variable'] == variable]

                # Get year columns with data available for this variable
                available_years = variable_data[year_columns].dropna(axis=1, how='all').columns.tolist()

                if available_years:
                    year_min = min(available_years)
                    year_max = max(available_years)

                    # Store the variable and its year range
                    country_data[country][variable] = {'year_min': year_min, 'year_max': year_max}
                else:
                    print(f"No year data available for country {country}, variable {variable}")

    # Print the collected data in a structured way
    for country, variables in country_data.items():
        print(f"\nCountry: {country}")
        for variable, year_info in variables.items():
            print(f"  Variable: {variable} (Available from {year_info['year_min']} to {year_info['year_max']})")
else:
    print("Error: One or more required columns (country, variable) not found.")

# ---------------------------------------------------
# 4. Export Detailed Data Information to CSV
# ---------------------------------------------------

# Prepare a list to convert to a DataFrame for exporting
export_data = []

for country, variables in country_data.items():
    for variable, year_info in variables.items():
        export_data.append({
            'Country': country,
            'Variable': variable,
            'Year Start': year_info['year_min'],
            'Year End': year_info['year_max']
        })

# Convert the list to a DataFrame
df_export = pd.DataFrame(export_data)

# Define an output directory for CSV files
output_dir = 'C:/Users/danie/Nextcloud/Coding/Masterthesis/data/processed/SEA_cleaned_data'
os.makedirs(output_dir, exist_ok=True)

# Export the detailed data information to CSV
csv_output_path = os.path.join(output_dir, 'SEA_data_availability.csv')
df_export.to_csv(csv_output_path, index=False)

print(f"Data availability has been exported to: {csv_output_path}")


Available sheets: ['Notes', 'DATA']
Unique countries: ['AUS' 'AUT' 'BEL' 'BGR' 'BRA' 'CAN' 'CHE' 'CHN' 'CYP' 'CZE' 'DEU' 'DNK'
 'ESP' 'EST' 'FIN' 'FRA' 'GBR' 'GRC' 'HRV' 'HUN' 'IDN' 'IND' 'IRL' 'ITA'
 'JPN' 'KOR' 'LTU' 'LUX' 'LVA' 'MEX' 'MLT' 'NLD' 'NOR' 'POL' 'PRT' 'ROU'
 'RUS' 'SVK' 'SVN' 'SWE' 'TUR' 'TWN' 'USA']
Identified year columns: ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014']
No year data available for country CHN, variable EMPE
No year data available for country CHN, variable H_EMPE

Country: AUS
  Variable: CAP (Available from 2000 to 2014)
  Variable: COMP (Available from 2000 to 2014)
  Variable: EMP (Available from 2000 to 2014)
  Variable: EMPE (Available from 2000 to 2014)
  Variable: GO (Available from 2000 to 2014)
  Variable: GO_PI (Available from 2000 to 2014)
  Variable: GO_QI (Available from 2000 to 2014)
  Variable: H_EMPE (Available from 2000 to 2014)
  Variable: II (Available from 2000 t

In [7]:
# FIGARO countries (from both EU and non-EU lists)
figaro_countries = {
    "BE", "BG", "CZ", "DK", "DE", "EE", "IE", "EL", "ES", "FR", "HR", "IT", "CY", "LV", "LT", "LU", "HU", "MT", 
    "NL", "AT", "PL", "PT", "RO", "SI", "SK", "FI", "SE", "AR", "AU", "BR", "CA", "CH", "CN", "ID", "IN", "JP", 
    "KR", "MX", "NO", "RU", "SA", "TR", "UK", "US", "ZA", "FIGW1", "W2"
}

# SEA countries (based on the acronym list from the image)
sea_countries = {
    "AUS", "AUT", "BEL", "BGR", "BRA", "CAN", "CHE", "CHN", "CYP", "CZE", "DEU", "DNK", "ESP", "EST", "FIN", "FRA",
    "GBR", "GRC", "HRV", "HUN", "IDN", "IND", "IRL", "ITA", "JPN", "KOR", "LTU", "LUX", "LVA", "MEX", "MLT", "NLD", 
    "NOR", "POL", "PRT", "ROU", "RUS", "SVK", "SVN", "SWE", "TUR", "TWN", "USA"
}

# Convert FIGARO country codes to match SEA format (3-letter codes)
figaro_countries_sea_format = {  # Mapping 2-letter codes to 3-letter codes for SEA format
    "BE": "BEL", "BG": "BGR", "CZ": "CZE", "DK": "DNK", "DE": "DEU", "EE": "EST", "IE": "IRL", "EL": "GRC", "ES": "ESP",
    "FR": "FRA", "HR": "HRV", "IT": "ITA", "CY": "CYP", "LV": "LVA", "LT": "LTU", "LU": "LUX", "HU": "HUN", "MT": "MLT",
    "NL": "NLD", "AT": "AUT", "PL": "POL", "PT": "PRT", "RO": "ROU", "SI": "SVN", "SK": "SVK", "FI": "FIN", "SE": "SWE",
    "AR": "ARG", "AU": "AUS", "BR": "BRA", "CA": "CAN", "CH": "CHE", "CN": "CHN", "ID": "IDN", "IN": "IND", "JP": "JPN",
    "KR": "KOR", "MX": "MEX", "NO": "NOR", "RU": "RUS", "SA": "SAU", "TR": "TUR", "UK": "GBR", "US": "USA", "ZA": "ZAF",
    # FIGW1 and W2 are not applicable for comparison, they refer to Rest of the world and Domestic, exclude those
}

# Convert the FIGARO country codes to the SEA format
figaro_countries_sea_format = set(figaro_countries_sea_format.values())

# Perform the comparison
only_in_sea = sea_countries - figaro_countries_sea_format
only_in_figaro = figaro_countries_sea_format - sea_countries
common_countries = sea_countries & figaro_countries_sea_format

# Output the comparison results
print(f"Countries only in SEA:\n{only_in_sea}")
print(f"\nCountries only in FIGARO:\n{only_in_figaro}")
print(f"\nCommon Countries:\n{common_countries}")


Countries only in SEA:
{'TWN'}

Countries only in FIGARO:
{'SAU', 'ZAF', 'ARG'}

Common Countries:
{'SVK', 'POL', 'IND', 'SVN', 'BRA', 'JPN', 'PRT', 'FIN', 'IRL', 'LVA', 'ROU', 'CHN', 'BGR', 'ESP', 'IDN', 'CYP', 'BEL', 'NLD', 'DNK', 'GBR', 'LTU', 'CAN', 'FRA', 'SWE', 'AUS', 'ITA', 'MLT', 'HUN', 'HRV', 'LUX', 'NOR', 'CHE', 'GRC', 'MEX', 'EST', 'USA', 'KOR', 'TUR', 'CZE', 'DEU', 'AUT', 'RUS'}
