In [1]:
# %%  ─────────────────────────  INITIALISIERUNG  ─────────────────────────
import pandas as pd
from pathlib import Path

excel_path = Path(
    r"C:\Users\claud\iCloudDrive\Dokumente\02_CLI\Studium\ZHAW\Masterarbeit\vocdata\data\bfs_data_abschlussquote.xlsx"
)
xls = pd.ExcelFile(excel_path)

DATA_SHEETS = [s for s in xls.sheet_names if s.endswith("_Data")]
print("DATA_SHEETS =", DATA_SHEETS)




DATA_SHEETS = ['T1_SekII_1st_25_Merkm_Data', 'T2_SekII_1st_25_Kant_Data', 'T3_Matura_Merkm_Data', 'T4_Matura_Kant_Data']


In [2]:
import pandas as pd
from sqlalchemy import create_engine, text
from pathlib import Path

# ------------------------------------------------------------------
# 1. Pfade zu allen harmonisierten Excel-Dateien
# ------------------------------------------------------------------
sources = [
    Path(r"C:\Users\claud\iCloudDrive\Dokumente\02_CLI\Studium\ZHAW\Masterarbeit\vocdata\data\bfs_data_lva.xlsx"),                 # LVA-Daten
    Path(r"C:\Users\claud\iCloudDrive\Dokumente\02_CLI\Studium\ZHAW\Masterarbeit\vocdata\data\bfs_data_abschlussquote.xlsx")   # Abschlussquoten
]

# ------------------------------------------------------------------
# 2. DB-Verbindung
# ------------------------------------------------------------------
engine = create_engine("mysql+pymysql://root:voc_root@localhost:3306/vocdata", echo=False)

# ------------------------------------------------------------------
# 3. Dimensionen aus MySQL in Lookup-Dictionaries laden
# ------------------------------------------------------------------
dim_tables = [
    "abschlussniveau", "lernform", "geschlecht", "mig_status",
    "anschlussart", "qv_status", "lva_zeitraum",
    "wiedereinst_dauer", "isced", "beruf"
]

lookups = {}
with engine.begin() as con:
    for dim in dim_tables:
        df = pd.read_sql(f"SELECT * FROM dim_{dim}", con)
        code_col = f"{dim}_code" if f"{dim}_code" in df.columns else f"{dim}_bez"
        lookups[dim] = (
            df[[code_col, f"{dim}_id"]]
            .set_index(code_col)
            .to_dict()[f"{dim}_id"]
        )

# ------------------------------------------------------------------
# 4. Faktentabelle anlegen (falls noch nicht vorhanden)
# ------------------------------------------------------------------
with engine.begin() as con:
    con.execute(text("""
        CREATE TABLE IF NOT EXISTS fact_lva_stats (
            fact_id BIGINT AUTO_INCREMENT PRIMARY KEY,
            abschlussniveau_id  INT,
            lernform_id         INT,
            geschlecht_id       INT,
            mig_status_id       INT,
            anschlussart_id     INT,
            qv_status_id        INT,
            lva_zeitraum_id     INT,
            wiedereinst_dauer_id INT,
            isced_id            INT,
            beruf_id            INT,
            anzahl_lernende_mit_lva        INT,
            anzahl_lernende_wiedereinstieg INT,
            anzahl_lva_vertraege           INT,
            total_lehrvertraege            INT,
            total_lernende                 INT
        );
    """))

# ------------------------------------------------------------------
# 5. Helfer: eine Zeile in IDs umwandeln
# ------------------------------------------------------------------
def map_row_to_ids(row):
    return {
        "abschlussniveau_id":  lookups["abschlussniveau"].get(str(row.get("abschlussniveau", "")).strip().upper()),
        "lernform_id":         lookups["lernform"].get(str(row.get("lernform", "")).strip().upper()),
        "geschlecht_id":       lookups["geschlecht"].get(str(row.get("geschlecht", "")).strip().upper()),
        "mig_status_id":       lookups["mig_status"].get(str(row.get("mig_status", "")).strip().upper()),
        "anschlussart_id":     lookups["anschlussart"].get(str(row.get("anschlussart_lva", "")).strip().upper()),
        "qv_status_id":        lookups["qv_status"].get(str(row.get("qv_status", "")).strip().upper()),
        "lva_zeitraum_id":     lookups["lva_zeitraum"].get(str(row.get("lva_zeitraum", "")).strip().upper()),
        "wiedereinst_dauer_id":lookups["wiedereinst_dauer"].get(str(row.get("wiedereinstieg_dauer", "")).strip().upper()),
        "isced_id":            lookups["isced"].get(str(row.get("ausbildungsfeld_isced_code", "")).strip().upper()),
        "beruf_id":            lookups["beruf"].get(str(row.get("beruf_bez", "")).strip().upper())
    }

# ------------------------------------------------------------------
# 6. Alle *_Data-Sheets einlesen und in fact_lva_stats schreiben
# ------------------------------------------------------------------
insert_rows = []

for src in sources:
    xls = pd.ExcelFile(src)
    for sh in xls.sheet_names:
        if sh.lower().endswith("_data"):
            # Header automatisch finden (erste Zeile mit >=3 Werten)
            head = pd.read_excel(xls, sheet_name=sh, nrows=15, header=None)
            header_row = next(i for i, r in head.iterrows() if r.notna().sum() >= 3)
            df = pd.read_excel(xls, sheet_name=sh, header=header_row)

            for _, row in df.iterrows():
                ids = map_row_to_ids(row)
                facts = {
                    "anzahl_lernende_mit_lva":        row.get("anzahl_lernende_mit_lva"),
                    "anzahl_lernende_wiedereinstieg": row.get("anzahl_lernende_wiedereinstieg"),
                    "anzahl_lva_vertraege":           row.get("anzahl_lva_vertraege"),
                    "total_lehrvertraege":            row.get("total_lehrvertraege"),
                    "total_lernende":                 row.get("total_lernende")
                }
                insert_rows.append({**ids, **facts})

# DataFrame → MySQL
fact_df = pd.DataFrame(insert_rows)
with engine.begin() as con:
    fact_df.to_sql("fact_lva_stats", con, if_exists="append", index=False)

print("✔ fact_lva_stats geladen:", len(insert_rows), "Zeilen")


✔ fact_lva_stats geladen: 1268 Zeilen


In [3]:
import pandas as pd

SRC_FILE = "../data/bfs_data_abschlussquote.xlsx"  # relativer Pfad zum Repo-Wurzelordner
xls = pd.ExcelFile(SRC_FILE)
DATA_SHEETS = [s for s in xls.sheet_names if s.endswith("_Data")]

print("DATA_SHEETS =", DATA_SHEETS)
for name in xls.sheet_names:
    print("  •", name)


DATA_SHEETS = ['T1_SekII_1st_25_Merkm_Data', 'T2_SekII_1st_25_Kant_Data', 'T3_Matura_Merkm_Data', 'T4_Matura_Kant_Data']
  • T1_SekII_1st_25_Merkm_Data
  • T1_SekII_1st_25_Merkm_Dict
  • T2_SekII_1st_25_Kant_Data
  • T2_SekII_1st_25_Kant_Dict
  • T3_Matura_Merkm_Data
  • T3_Matura_Merk_Dict
  • T4_Matura_Kant_Data
  • T4_Matura_Kant_Dict


In [4]:
  
DATA_SHEETS = [s for s in xls.sheet_names if s.endswith("_Data")]

# Sicherheits-Initialisierung (falls SRC_FILE noch nicht existiert)
# ------------------------------------------------------------------
try:
    SRC_FILE
except NameError:
    SRC_FILE = excel_path  # excel_path ist in deinem Kopf-Code bereits definiert

sheet = "T1_SekII_1st_25_Merkm_Data"   # Name des ersten *_Data-Blattes

df = pd.read_excel(SRC_FILE, sheet_name=sheet)

print(f"== Vorschau Blatt: {sheet} ==")
display(df.head(5))          # erste fünf Zeilen ansehen
print("\nSpalten-Info:")
print(df.info())             # Datentypen & Null-Zellen


== Vorschau Blatt: T1_SekII_1st_25_Merkm_Data ==


Unnamed: 0,jahr,merkmal,kategorie,total_anz_25J,total_anz_sekII_erstabschluss_25J,Lehre_anz_sekII_erstabschluss_25J,allg_bildg_anz_sekII_erstabschluss_25J,total_%_sekII_erstabschluss_25J,Lehre_%_sekII_erstabschluss_25J,allg_bildg_%_sekII_erstabschluss_25J
0,2022,Total,,80423,72437,48819,23618,90.1,60.7,29.4
1,2022,Geschlecht,Männer,41437,36684,27375,9309,88.529,66.064,22.465
2,2022,Geschlecht,Frauen,39018,35753,21449,14304,91.632,54.972,36.66
3,2022,Migrationskategorie,in der Schweiz geborene Schweizer/innen,60557,55798,37187,18611,92.142,61.409,30.733
4,2022,Migrationskategorie,in der Schweiz geborene Ausländer/innen,7745,6587,5265,1322,85.049,67.984,17.065



Spalten-Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 10 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   jahr                                    13 non-null     int64  
 1   merkmal                                 13 non-null     object 
 2   kategorie                               12 non-null     object 
 3   total_anz_25J                           13 non-null     int64  
 4   total_anz_sekII_erstabschluss_25J       13 non-null     int64  
 5   Lehre_anz_sekII_erstabschluss_25J       13 non-null     int64  
 6   allg_bildg_anz_sekII_erstabschluss_25J  13 non-null     int64  
 7   total_%_sekII_erstabschluss_25J         13 non-null     float64
 8   Lehre_%_sekII_erstabschluss_25J         13 non-null     float64
 9   allg_bildg_%_sekII_erstabschluss_25J    13 non-null     float64
dtypes: float64(3), int64(5), object(2)
memory usage: 