In [3]:
import os
import re
import numpy as np
import pandas as pd

BASE_DIR = r"C:\Users\osaci\Desktop\Proiect Stiinta Datelor Flow Cosmin\Proiect Echipa"

def p(fname: str) -> str:
    return os.path.join(BASE_DIR, fname)

ECE_COUNTRIES = ["BG", "CZ", "HU", "PL", "RO", "SK", "SI", "HR", "EE", "LV", "LT"]
ECE_COUNTRIES


['BG', 'CZ', 'HU', 'PL', 'RO', 'SK', 'SI', 'HR', 'EE', 'LV', 'LT']

In [4]:
def load_eurostat(path: str) -> pd.DataFrame:
    """
    Încarcă un Eurostat .tsv.gz:
      - separator tab
      - ':' ca NA
      - păstrează ca string (curățăm ulterior)
    """
    return pd.read_csv(
        path,
        sep="\t",
        na_values=":",
        dtype=str,
        encoding="utf-8"
    )


In [6]:
def melt_years(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]

    year_pattern = re.compile(r"^\d{4}$")
    id_vars = [c for c in df.columns if not year_pattern.match(c)]
    value_vars = [c for c in df.columns if year_pattern.match(c)]

    print("Ani detectați:", value_vars[:10])

    df_long = df.melt(
        id_vars=id_vars,
        value_vars=value_vars,
        var_name="year",
        value_name="value"
    )

    return df_long


In [7]:
def parse_value(x):
    if pd.isna(x):
        return np.nan
    x = str(x).strip()
    if x == ":":
        return np.nan
    x_clean = re.sub(r"[^0-9\-,\.]", "", x)
    x_clean = x_clean.replace(",", ".")
    try:
        return float(x_clean)
    except:
        return np.nan


In [8]:
file_path = p("estat_nrg_cb_gas.tsv.gz")  # acesta este fișierul recomandat

raw = load_eurostat(file_path)
print("Formă inițială:", raw.shape)
raw.head()


Formă inițială: (20500, 36)


Unnamed: 0,"freq,nrg_bal,siec,unit,geo\TIME_PERIOD",1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,"A,DL,C0350,TJ_GCV,AL",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,:
1,"A,DL,C0350,TJ_GCV,AT",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,17.503,6.62,8.199,6.435,9.899,13.141,8.046,10.223,19.655,:
2,"A,DL,C0350,TJ_GCV,BA",:,:,:,:,:,:,:,:,:,...,0.0,0.0,0.0,0.0,0.0,86.8,39.0,104.0,104.0,:
3,"A,DL,C0350,TJ_GCV,BE",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,:
4,"A,DL,C0350,TJ_GCV,BG",0.000,200.000,48.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,:


In [9]:
dims = split_dimension_column(raw)
print("După split dimensiuni:", dims.shape)
dims.head()


După split dimensiuni: (20500, 40)


Unnamed: 0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2020,2021,2022,2023,2024,freq,nrg_bal,siec,unit,geo
0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,:,A,DL,C0350,TJ_GCV,AL
1,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,13.141,8.046,10.223,19.655,:,A,DL,C0350,TJ_GCV,AT
2,:,:,:,:,:,:,:,:,:,:,...,86.8,39.0,104.0,104.0,:,A,DL,C0350,TJ_GCV,BA
3,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,:,A,DL,C0350,TJ_GCV,BE
4,0.000,200.000,48.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,:,A,DL,C0350,TJ_GCV,BG


In [11]:
# Cell 8 – melt în format long

long_df = melt_years(dims)   # fără id_vars aici

long_df["year"] = long_df["year"].astype(int)
long_df["value"] = long_df["value"].apply(parse_value)

print("După melt:", long_df.shape)
long_df.head()


Ani detectați: ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999']
După melt: (717500, 7)


Unnamed: 0,freq,nrg_bal,siec,unit,geo,year,value
0,A,DL,C0350,TJ_GCV,AL,1990,0.0
1,A,DL,C0350,TJ_GCV,AT,1990,0.0
2,A,DL,C0350,TJ_GCV,BA,1990,
3,A,DL,C0350,TJ_GCV,BE,1990,0.0
4,A,DL,C0350,TJ_GCV,BG,1990,0.0


In [12]:
# Cell 9 – filtrare ECE
long_ece = long_df[long_df["geo"].isin(ECE_COUNTRIES)]

print("Total după filtru ECE:", len(long_ece))
long_ece.head()


Total după filtru ECE: 183960


Unnamed: 0,freq,nrg_bal,siec,unit,geo,year,value
4,A,DL,C0350,TJ_GCV,BG,1990,0.0
6,A,DL,C0350,TJ_GCV,CZ,1990,0.0
10,A,DL,C0350,TJ_GCV,EE,1990,0.0
17,A,DL,C0350,TJ_GCV,HR,1990,1035.0
18,A,DL,C0350,TJ_GCV,HU,1990,0.0


In [13]:
out_path = p("clean_nrg_cb_gas.csv")
long_ece.to_csv(out_path, index=False)
out_path


'C:\\Users\\osaci\\Desktop\\Proiect Stiinta Datelor Flow Cosmin\\Proiect Echipa\\clean_nrg_cb_gas.csv'

In [14]:
def clean_eurostat_file(input_file: str):
    """
    Încarcă, curăță și exportă un fișier Eurostat de tipul estat_nrg_cb_*.tsv.gz.
    """
    print(f"\n=== Procesez {input_file} ===")
    
    # 1. load
    raw = load_eurostat(p(input_file))
    
    # 2. split dimensions
    dims = split_dimension_column(raw)
    
    # 3. melt
    long_df = melt_years(dims)
    long_df["year"] = long_df["year"].astype(int)
    long_df["value"] = long_df["value"].apply(parse_value)
    
    # 4. filtrare ECE
    long_ece = long_df[long_df["geo"].isin(ECE_COUNTRIES)]
    
    # 5. export
    output_name = "clean_" + input_file.replace(".tsv.gz", ".csv")
    out_path = p(output_name)
    long_ece.to_csv(out_path, index=False)
    
    print(f"Exportat: {output_name} ({len(long_ece)} rânduri)")
    return out_path


In [15]:
files = [
    "estat_nrg_cb_bm.tsv.gz",
    "estat_nrg_cb_e.tsv.gz",
    "estat_nrg_cb_gas.tsv.gz",
    "estat_nrg_cb_h.tsv.gz",
    "estat_nrg_cb_oil.tsv.gz",
    "estat_nrg_cb_rw.tsv.gz",
    "estat_nrg_cb_sff.tsv.gz"
]

for f in files:
    clean_eurostat_file(f)



=== Procesez estat_nrg_cb_bm.tsv.gz ===
Ani detectați: ['2019', '2020', '2021', '2022', '2023']
Exportat: clean_estat_nrg_cb_bm.csv (7970 rânduri)

=== Procesez estat_nrg_cb_e.tsv.gz ===
Ani detectați: ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999']
Exportat: clean_estat_nrg_cb_e.csv (23030 rânduri)

=== Procesez estat_nrg_cb_gas.tsv.gz ===
Ani detectați: ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999']
Exportat: clean_estat_nrg_cb_gas.csv (183960 rânduri)

=== Procesez estat_nrg_cb_h.tsv.gz ===
Ani detectați: ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999']
Exportat: clean_estat_nrg_cb_h.csv (19495 rânduri)

=== Procesez estat_nrg_cb_oil.tsv.gz ===
Ani detectați: ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999']
Exportat: clean_estat_nrg_cb_oil.csv (2476705 rânduri)

=== Procesez estat_nrg_cb_rw.tsv.gz ===
Ani detectați: ['1990', '1991', '1992', '1993', '199

In [16]:
clean_files = {
    "gas": "clean_estat_nrg_cb_gas.csv",
    "oil": "clean_estat_nrg_cb_oil.csv",
    "solid_fuels": "clean_estat_nrg_cb_h.csv",    # hard coal etc.
    "solid_fossil": "clean_estat_nrg_cb_sff.csv", # other solid fossil
    "biomass": "clean_estat_nrg_cb_bm.csv",       # biofuels / biomass
    "renewables_other": "clean_estat_nrg_cb_rw.csv", # renewables & waste
    "electricity": "clean_estat_nrg_cb_e.csv",    # electricity-related
}

all_list = []
for fuel_group, fname in clean_files.items():
    path = p(fname)
    df = pd.read_csv(path)
    df["fuel_group"] = fuel_group
    all_list.append(df)

energy_all = pd.concat(all_list, ignore_index=True)
energy_all.head()


Unnamed: 0,freq,nrg_bal,siec,unit,geo,year,value,fuel_group
0,A,DL,C0350,TJ_GCV,BG,1990,0.0,gas
1,A,DL,C0350,TJ_GCV,CZ,1990,0.0,gas
2,A,DL,C0350,TJ_GCV,EE,1990,0.0,gas
3,A,DL,C0350,TJ_GCV,HR,1990,1035.0,gas
4,A,DL,C0350,TJ_GCV,HU,1990,0.0,gas


In [17]:
# agregăm pe țară–an–tip de combustibil
agg = (
    energy_all
    .groupby(["geo", "year", "fuel_group"], as_index=False)["value"]
    .sum()
)

# total energie pe țară–an (pentru share-uri, dacă vrei)
tot = (
    agg.groupby(["geo", "year"], as_index=False)["value"]
    .sum()
    .rename(columns={"value": "total_energy"})
)

agg = agg.merge(tot, on=["geo", "year"], how="left")
agg["share"] = agg["value"] / agg["total_energy"]

agg.head()


Unnamed: 0,geo,year,fuel_group,value,total_energy,share
0,BG,1990,electricity,268303.0,4908946.0,0.054656
1,BG,1990,gas,1939661.0,4908946.0,0.395128
2,BG,1990,oil,661852.0,4908946.0,0.134826
3,BG,1990,renewables_other,50781.0,4908946.0,0.010345
4,BG,1990,solid_fossil,586475.0,4908946.0,0.119471


In [18]:
mix_path = p("energy_mix_ece_country_year.csv")
agg.to_csv(mix_path, index=False)
mix_path


'C:\\Users\\osaci\\Desktop\\Proiect Stiinta Datelor Flow Cosmin\\Proiect Echipa\\energy_mix_ece_country_year.csv'

In [19]:
mix = agg.copy()
mix["year"] = mix["year"].astype(int)

latest_year = mix["year"].max()
latest_year


np.int64(2024)

In [20]:
mix_latest = mix[mix["year"] == latest_year].copy()
mix_latest.head()


Unnamed: 0,geo,year,fuel_group,value,total_energy,share
209,BG,2024,electricity,157171.044,914459.786,0.171873
210,BG,2024,gas,217057.794,914459.786,0.237362
211,BG,2024,oil,175774.718,914459.786,0.192217
212,BG,2024,renewables_other,124245.596,914459.786,0.135868
213,BG,2024,solid_fossil,99100.618,914459.786,0.108371


In [21]:
mix_wide = (
    mix_latest
    .pivot_table(
        index="geo",
        columns="fuel_group",
        values="share"
    )
    .reset_index()
)

mix_wide.columns.name = None
mix_wide.head()


Unnamed: 0,geo,electricity,gas,oil,renewables_other,solid_fossil,solid_fuels
0,BG,0.171873,0.237362,0.192217,0.135868,0.108371,0.15431
1,CZ,0.148339,0.30416,0.10067,0.194145,0.089303,0.163383
2,EE,0.101644,0.078192,0.074489,0.444008,0.062643,0.239023
3,HR,0.139812,0.424395,0.147719,0.204152,0.00357,0.080353
4,HU,0.136655,0.444891,0.140305,0.148551,0.017008,0.112589


In [22]:
mix_wide.to_csv(p("energy_mix_latest_pivot.csv"), index=False)
