In [1]:
import numpy as np
import pandas as pd
from utils import dictionaries

In [2]:
# Source workbook (Eurostat enterprise statistics, going by the file name).
# The path is relative, so it resolves against the kernel's working directory.
filename = "../data/Eurostat Enterprise Stats Latest (inc.2023) (1).xlsx"

In [3]:
# Collect the name of every worksheet in the workbook.
# The context manager closes the underlying file handle as soon as the names
# have been read, instead of leaving the workbook open for the kernel's lifetime.
with pd.ExcelFile(path_or_buffer=filename) as workbook:
    sheets = workbook.sheet_names

In [4]:
def sheet_finder(sheet: str, filepath: str, n=8) -> bool:
    """Tell whether a worksheet should be loaded by the notebook.

    A worksheet qualifies when all of the following hold:

    * its name contains ``"Sheet"``;
    * among the first ``n`` rows returned by ``pd.read_excel``, the column
      labelled ``"Unnamed: 2"`` contains the exact value
      ``"Enterprises - number"``;
    * that same column does not contain the exact value ``"Total"``.

    Args:
        sheet: Name of the worksheet to inspect.
        filepath: Path of the Excel workbook.
        n: Number of rows to read from the top of the worksheet.

    Returns:
        ``True`` when the worksheet qualifies, ``False`` otherwise. A worksheet
        that has no ``"Unnamed: 2"`` column is reported as ``False``.
    """
    # The name test is free, so do it before paying for a workbook read.
    if "Sheet" not in sheet:
        return False

    preview = pd.read_excel(
        io=filepath,
        sheet_name=sheet,
        nrows=n
    )

    # Explicit check instead of a bare ``except``: only the "column is missing"
    # case maps to False; genuine read errors are no longer silently hidden.
    if "Unnamed: 2" not in preview.columns:
        return False

    labels = set(preview["Unnamed: 2"].dropna().tolist())
    return ("Enterprises - number" in labels) and ("Total" not in labels)

In [5]:
def sheet_reader(
    sheet: str,
    filepath: str,
    segment_row: int = 6,
    header_row: int = 8,
    data_rows: tuple = (11, 46),
) -> pd.DataFrame:
    """Load one worksheet and reshape it into a tidy per-country table.

    The positions below are positional (``iloc``) offsets into the frame that
    ``pd.read_excel`` returns for the worksheet; the defaults reproduce the
    layout this notebook was written against.
    NOTE(review): confirm the offsets still match if the workbook is refreshed.

    Args:
        sheet: Name of the worksheet to read.
        filepath: Path of the Excel workbook.
        segment_row: Row whose ``"Unnamed: 2"`` cell is copied into the
            ``segment`` column of the result.
        header_row: Row whose non-null cells (minus the literal ``"TIME"``)
            become the column names that follow ``"country"``.
        data_rows: ``(start, stop)`` positional slice of the rows kept as data.

    Returns:
        A DataFrame with a ``country`` column, one column per header label and
        a constant ``segment`` column. Cells equal to ``":"`` are replaced by
        ``NaN``.

    Raises:
        ValueError: If the number of fully populated columns in the data rows
            differs from the number of derived column names.
    """
    raw = pd.read_excel(
        io=filepath,
        sheet_name=sheet
    )
    segment = raw.iloc[segment_row]["Unnamed: 2"]
    header = raw.iloc[header_row].dropna()
    columns = ["country"] + list(header[header != "TIME"].values)

    first_row, stop_row = data_rows
    # Only columns that are populated on every data row survive.
    table = raw.iloc[first_row:stop_row].dropna(axis=1)
    table.columns = columns
    table["segment"] = segment

    # Whole-frame replacement instead of a Python-level row-by-row ``apply``;
    # ``infer_objects`` restores numeric dtypes once the ":" markers are gone.
    return table.replace(":", np.nan).infer_objects()

In [6]:
# One flag per worksheet, in the same order as `sheets`.
sheet_flags = [
    sheet_finder(sheet=sheet_name, filepath=filename)
    for sheet_name in sheets
]

In [7]:
# Read only the worksheets that were flagged by `sheet_finder`.
dfs = [
    sheet_reader(sheet=sheet_name, filepath=filename)
    for sheet_name, is_selected in zip(sheets, sheet_flags)
    if is_selected
]

In [8]:
# Stack the per-sheet tables row-wise; the original row labels are kept as-is.
eurostat_data = pd.concat(objs=dfs, axis="index")

In [9]:
# Translate each segment label through the `eurostat_segments` lookup;
# labels missing from the lookup come back as None.
eurostat_data["s_level_segment"] = eurostat_data["segment"].map(
    dictionaries["eurostat_segments"].get
)

In [10]:
# Collapse to one row per (country, s_level_segment), summing the year columns.
eurostat_data = (
    eurostat_data
    .groupby(["country", "s_level_segment"])[["2021", "2022", "2023"]]
    .agg("sum")
    .reset_index()
)

In [11]:
# (combined segment, target segment) pairs: each combined segment is fanned
# out into the individual segments listed for it, in the order given.
segment_pairs = [
    (combined, target)
    for combined, targets in {
        "s5-s6": ("s5", "s6"),
        "s7+": ("s7", "s8", "s9", "s10"),
    }.items()
    for target in targets
]

In [12]:
def map_eu_segments(
        df_to_map: pd.DataFrame,
        segment_pairs: list,
        cols_to_scale=("2021", "2022", "2023"),
        segment_ratios=None,
    ) -> pd.DataFrame:
    """Split combined segments into their target segments by scaling values.

    Rows whose ``s_level_segment`` is not the first element of any pair are
    passed through unchanged. For every ``(from_segment, to_segment)`` pair,
    the rows labelled ``from_segment`` are copied, their ``cols_to_scale``
    values are multiplied by ``segment_ratios[to_segment]`` and the copy is
    relabelled ``to_segment``. ``df_to_map`` itself is not modified.

    Args:
        df_to_map: Frame with an ``s_level_segment`` column and the columns
            named in ``cols_to_scale``.
        segment_pairs: Iterable of ``(from_segment, to_segment)`` tuples.
        cols_to_scale: Names of the numeric columns to multiply. Defaults to
            the three year columns used throughout the notebook.
        segment_ratios: Mapping from target segment to its multiplier. When
            omitted, ``dictionaries["segment_ratios"]`` is used.

    Returns:
        The untouched rows followed by one scaled piece per pair, in pair
        order. Row labels are carried over, so they may repeat.

    Raises:
        KeyError: If a target segment has no entry in ``segment_ratios``.
    """
    if segment_ratios is None:
        # Looked up lazily so callers that pass their own ratios do not need
        # the project-level lookup table at all.
        segment_ratios = dictionaries["segment_ratios"]
    value_columns = list(cols_to_scale)

    combined_segments = [from_segment for from_segment, _ in segment_pairs]
    pieces = [df_to_map[~df_to_map["s_level_segment"].isin(combined_segments)]]

    for from_segment, to_segment in segment_pairs:
        # Deep copy so scaling never writes back into the caller's frame.
        piece = df_to_map[df_to_map["s_level_segment"] == from_segment].copy(deep=True)
        piece[value_columns] = piece[value_columns] * segment_ratios[to_segment]
        piece["s_level_segment"] = to_segment
        pieces.append(piece)

    return pd.concat(pieces)

In [13]:
# Load the "Non-Eurostat Hardcoded" worksheet from the landscaping workbook.
non_eu = pd.read_excel(
    "../data/Data Landscaping Master (Draft) - PQ WIP.xlsx",
    sheet_name="Non-Eurostat Hardcoded",
    skiprows=6,  # the first six rows of the sheet are skipped before parsing
)

In [14]:
# Swap the raw "Size class" label for its `eurostat_segments` translation
# (None when the label is not in the lookup).
non_eu = (
    non_eu
    .assign(
        s_level_segment=non_eu["Size class"].map(
            dictionaries["eurostat_segments"].get
        )
    )
    .drop(columns=["Size class"])
)

In [15]:
# Collapse to one row per (country, s_level_segment), summing the year columns.
non_eu = (
    non_eu
    .groupby(["country", "s_level_segment"])[["2021", "2022", "2023"]]
    .agg("sum")
    .reset_index()
)

In [16]:
# Apply the same segment split to both sources (non-Eurostat first, then Eurostat).
mapped_non_eu, mapped_eu = (
    map_eu_segments(df_to_map=source_frame, segment_pairs=segment_pairs)
    for source_frame in (non_eu, eurostat_data)
)

In [18]:
# Combine both sources, leaving out the Eurostat rows for Albania.
# NOTE(review): presumably Albania is supplied by the non-Eurostat sheet instead — confirm.
eu_stat_processed = pd.concat(
    [mapped_eu.loc[mapped_eu["country"].ne("Albania")], mapped_non_eu]
)

In [19]:
# Persist the combined table; the row index is not written to the file.
eu_stat_processed.to_csv(
    path_or_buf="../data/eu_stat_processed.csv",
    index=False,
)