In [43]:
import numpy as np
import pandas as pd
from pathlib import Path

# Try SciPy for stable Φ/Φ^{-1}; fall back to numpy/erf if not present.
try:
    from scipy.stats import norm
    def phi(z): return norm.cdf(z)
    def phi_inv(u): return norm.ppf(np.clip(u, 1e-12, 1-1e-12))
    HAS_SCIPY = True
except Exception:
    from math import erf, sqrt
    def phi(z): return 0.5 * (1.0 + erf(z / sqrt(2.0)))
    # Simple, decent inverse via erfinv if available
    try:
        from scipy.special import erfinv
        def phi_inv(u):
            u = np.clip(u, 1e-12, 1-1e-12)
            return sqrt(2.0)*erfinv(2*u - 1)
    except Exception:
        raise ImportError("This notebook prefers SciPy. Please `pip install scipy` for norm.ppf.")

rng = np.random.default_rng(12345)

# Paths (use your actual export locations)
CONT_PATH = "data/input/simple_params_1030/cs_2019_continuous_by_nace2.csv"
CORR_PATH = "data/input/simple_params_1030/cs_2019_corr_long_by_nace2.csv"

# How many firms to simulate per NACE2?
# We'll match the max 'N' reported across continuous vars for that NACE2 (below).
SIM_SCALE = 1.0  # 1.0 = match counts; e.g. 0.5 = half, 2.0 = double


In [44]:
cont = pd.read_csv(CONT_PATH)
corr_long = pd.read_csv(CORR_PATH)

# Clean column names we’ll use
cont.columns = [c.strip() for c in cont.columns]
corr_long.columns = [c.strip() for c in corr_long.columns]

# Percentile columns available in your export
pct_cols = ["p0001","p01","p05","p10","p25","p50","p75","p90","p95","p99","p999"]
base_cols = ["nace2","var","N","min","max","median","mean","sd","skewness","excess_kurtosis"]
assert all(pc in cont.columns for pc in pct_cols), "Some percentile columns missing."

# Build a quick lookup of group sizes per NACE2
N_by_nace = cont.groupby("nace2")["N"].max().astype(int)
nace_list = list(N_by_nace.index)

# Continuous variables present (union over all groups)
all_vars = cont["var"].unique().tolist()


In [45]:
# For each (nace2, var) we’ll store U-grid and X-grid in dicts
inv_grids = {}  # key: (nace2, var) → dict with 'u' and 'x' numpy arrays

# Probability grid that matches the columns we have, plus endpoints
u_grid = np.array([0.0,   0.001, 0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 0.999, 1.0])

for g in nace_list:
    sub = cont[cont["nace2"] == g]
    for v in sub["var"].unique():
        row = sub[sub["var"] == v].iloc[0]
        # Build X-grid in the same order as u_grid
        x_vals = [
            row["min"],
            row["p0001"], row["p01"], row["p05"], row["p10"], row["p25"],
            row["p50"], row["p75"], row["p90"], row["p95"], row["p99"], row["p999"],
            row["max"],
        ]
        x_grid = np.asarray(x_vals, dtype=float)

        # Ensure monotonicity (just in case of minor rounding noise)
        x_grid = np.maximum.accumulate(x_grid)

        inv_grids[(g, v)] = {"u": u_grid, "x": x_grid}


In [46]:
def build_psd_corr_for_group(g, var_list):
    # Take rows for this nace2
    g_corr = corr_long[corr_long["nace2"] == g].copy()
    # Ensure we only include requested var_list
    g_corr = g_corr[g_corr["var1"].isin(var_list) & g_corr["var2"].isin(var_list)]
    # Wide matrix
    corr = pd.pivot_table(g_corr, index="var1", columns="var2", values="cor")
    # Reindex to full var_list on both axes
    corr = corr.reindex(index=var_list, columns=var_list)
    # Fill diag with 1, off-diag missing with 0
    np.fill_diagonal(corr.values, 1.0)
    corr = corr.fillna(0.0)

    # Symmetrize
    M = corr.values
    M = 0.5 * (M + M.T)

    # Eigenvalue clipping to ensure PSD
    w, V = np.linalg.eigh(M)
    w_clipped = np.clip(w, 1e-8, None)
    M_psd = (V * w_clipped) @ V.T

    # Normalize back to correlation (set diag = 1 and rescale slight drift)
    d = np.sqrt(np.diag(M_psd))
    M_psd = M_psd / np.outer(d, d)
    np.fill_diagonal(M_psd, 1.0)

    return pd.DataFrame(M_psd, index=var_list, columns=var_list)


In [47]:
synthetic_blocks = []

for g in nace_list:
    # Variables with a marginal (grid) in this group:
    vars_marg = cont.loc[cont["nace2"] == g, "var"].unique().tolist()

    # Variables present in correlation for this group:
    g_vars_corr = pd.unique(
        corr_long.loc[corr_long["nace2"] == g, ["var1","var2"]].values.ravel("K")
    ).tolist()

    # Intersection = variables we'll simulate for this group
    var_list = [v for v in all_vars if v in vars_marg and v in g_vars_corr]
    if len(var_list) == 0:
        continue

    # Correlation matrix (PSD)
    C = build_psd_corr_for_group(g, var_list).values

    # Sample size for the group
    N_sim = int(max(1, round(N_by_nace[g] * SIM_SCALE)))

    # Draw correlated normals via eigen or cholesky
    # Cholesky can occasionally fail if near-singular; use eigh instead
    w, V = np.linalg.eigh(C)
    w = np.clip(w, 0, None)
    A = V @ np.diag(np.sqrt(w)) @ V.T

    Z = rng.standard_normal(size=(N_sim, len(var_list)))
    Z = Z @ A.T

    # Map to uniforms
    U = phi(Z)

    # Map uniforms to each variable via its (nace2,var) inverse CDF grid
    data = {}
    for j, v in enumerate(var_list):
        grid = inv_grids[(g, v)]
        # linear interpolation on the grid
        x_sim = np.interp(U[:, j], grid["u"], grid["x"])
        data[v] = x_sim

    block = pd.DataFrame(data)
    block.insert(0, "nace2", g)
    block.insert(1, "row_id", np.arange(N_sim, dtype=int))
    synthetic_blocks.append(block)

synthetic = pd.concat(synthetic_blocks, ignore_index=True)
synthetic.head()


Unnamed: 0,nace2,row_id,age,tanass_clean,eszk,sales_clean,pretax,persexp_clean,satok,export_value,ereduzem,emp,liabilities
0,10,0,0.006364,66772.59476,29923.832264,18529.912941,-37.10427,94.336264,-9188.041831,1180157.0,438.422491,0.0,191021.758624
1,10,1,25.299281,10151.716078,78304.88678,158273.631374,12682.838454,86948.076892,8450.160706,3265402.0,22435.187059,40.529525,37910.896832
2,10,2,25.063239,151.088562,113422.626325,786718.913072,4273.047811,60718.671378,737127.712029,45450770.0,6657.828324,21.728193,10879.315308
3,10,3,1.469206,107.77881,27571.399007,69855.839155,-187.071126,6329.708135,18738.391276,599694.1,-74.706535,8.954354,11129.220485
4,10,4,10.68995,5465.810743,250913.38658,879128.63282,8007.565266,66960.188081,152867.314478,7389195.0,15639.38314,24.826914,98656.285291


print("Synthetic shape:", synthetic.shape)
print("Per NACE2 counts:")
print(synthetic["nace2"].value_counts().sort_index())

# Save
Path("data/synthetic").mkdir(parents=True, exist_ok=True)
# synthetic.to_parquet("data/synthetic/sim_cs2019_by_nace2_gausscop.parquet", index=False)
# synthetic.to_csv("data/synthetic/sim_cs2019_by_nace2_gausscop.csv", index=False)

# Optional: a tiny sanity peek at means vs targets for one group/var
g0 = synthetic["nace2"].unique()[0]
v0 = synthetic.columns[synthetic.columns.get_loc("row_id")+1]  # first simulated var
print(f"\nExample sanity — group {g0}, var {v0}")
print("simulated mean:", synthetic.loc[synthetic["nace2"]==g0, v0].mean())
target_mean = cont.query("nace2 == @g0 and var == @v0")["mean"].iloc[0]
print("target mean   :", target_mean)


In [48]:
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

dummy_path = "data/input/simple_params_1030/cs_2019_dummy_shares_by_nace2.csv"
reg_path   = "data/input/simple_params_1030/cs_2019_category_shares_by_nace2_region.csv"
cnty_path  = "data/input/simple_params_1030/cs_2019_category_shares_by_nace2_county.csv"
own_path   = "data/input/simple_params_1030/cs_2019_category_shares_by_nace2_owner.csv"

dshare = pd.read_csv(dummy_path)
reg_sh = pd.read_csv(reg_path)   if Path(reg_path).exists()  else pd.DataFrame(columns=["nace2","region","N","share"])
cnty_sh= pd.read_csv(cnty_path)  if Path(cnty_path).exists() else pd.DataFrame(columns=["nace2","county","N","share"])
own_sh = pd.read_csv(own_path)   if Path(own_path).exists()  else pd.DataFrame(columns=["nace2","firm_owner","N","share"])

# Keep only what we need
dshare = dshare[["nace2","share_export","share_grant","share_exit"]].copy()

# Make sure shares are within [0,1]
for c in ["share_export","share_grant","share_exit"]:
    if c in dshare.columns:
        dshare[c] = dshare[c].clip(0, 1)


In [49]:
syn = synthetic.copy()  # from your previous cells
syn["has_export"] = np.nan
syn["has_grant"]  = np.nan
syn["exit"]       = np.nan

for g, gdf in syn.groupby("nace2"):
    idx = gdf.index
    row = dshare[dshare["nace2"] == g]
    if row.empty:
        # no shares for this group; skip
        continue
    p_export = float(row["share_export"].iloc[0]) if "share_export" in row else 0.0
    p_grant  = float(row["share_grant"].iloc[0])  if "share_grant"  in row else 0.0
    p_exit   = float(row["share_exit"].iloc[0])   if "share_exit"   in row else 0.0

    syn.loc[idx, "has_export"] = rng.binomial(1, p_export, size=len(idx))
    syn.loc[idx, "has_grant"]  = rng.binomial(1, p_grant,  size=len(idx))
    syn.loc[idx, "exit"]       = rng.binomial(1, p_exit,   size=len(idx))

# Cast to integers
syn["has_export"] = syn["has_export"].astype("Int64")
syn["has_grant"]  = syn["has_grant"].astype("Int64")
syn["exit"]       = syn["exit"].astype("Int64")


In [50]:
# Prepare containers
syn["region"]     = pd.NA
syn["county"]     = pd.NA
syn["firm_owner"] = pd.NA

# Region
if not reg_sh.empty:
    reg_sh = reg_sh.rename(columns={reg_sh.columns[0]: "region"} if reg_sh.columns[0] != "region" else {})
    # keep columns: nace2, region, share
    keep_cols = [c for c in ["nace2","region","share"] if c in reg_sh.columns]
    reg_sh = reg_sh[keep_cols].dropna(subset=["region"]).copy()
    # normalize shares per group (just in case)
    reg_sh["share"] = reg_sh["share"].clip(lower=0)
    reg_sh["share"] = reg_sh.groupby("nace2")["share"].transform(lambda s: s / s.sum() if s.sum() > 0 else s)

    for g, gdf in syn.groupby("nace2"):
        idx = gdf.index
        opts = reg_sh[reg_sh["nace2"] == g]
        if opts.empty:
            continue
        cats = opts["region"].astype(str).tolist()
        probs= opts["share"].astype(float).to_numpy()
        # guard for rounding: re-normalize
        probs = probs / probs.sum() if probs.sum() > 0 else np.ones_like(probs)/len(probs)
        draws = rng.choice(cats, size=len(idx), p=probs, replace=True)
        syn.loc[idx, "region"] = draws

# County
if not cnty_sh.empty:
    cnty_sh = cnty_sh.rename(columns={cnty_sh.columns[0]: "county"} if cnty_sh.columns[0] != "county" else {})
    keep_cols = [c for c in ["nace2","county","share"] if c in cnty_sh.columns]
    cnty_sh = cnty_sh[keep_cols].dropna(subset=["county"]).copy()
    cnty_sh["share"] = cnty_sh["share"].clip(lower=0)
    cnty_sh["share"] = cnty_sh.groupby("nace2")["share"].transform(lambda s: s / s.sum() if s.sum() > 0 else s)

    for g, gdf in syn.groupby("nace2"):
        idx = gdf.index
        opts = cnty_sh[cnty_sh["nace2"] == g]
        if opts.empty:
            continue
        cats = opts["county"].astype(str).tolist()
        probs= opts["share"].astype(float).to_numpy()
        probs = probs / probs.sum() if probs.sum() > 0 else np.ones_like(probs)/len(probs)
        draws = rng.choice(cats, size=len(idx), p=probs, replace=True)
        syn.loc[idx, "county"] = draws

# Firm owner
if not own_sh.empty:
    own_sh = own_sh.rename(columns={own_sh.columns[0]: "firm_owner"} if own_sh.columns[0] != "firm_owner" else {})
    keep_cols = [c for c in ["nace2","firm_owner","share"] if c in own_sh.columns]
    own_sh = own_sh[keep_cols].dropna(subset=["firm_owner"]).copy()
    own_sh["share"] = own_sh["share"].clip(lower=0)
    own_sh["share"] = own_sh.groupby("nace2")["share"].transform(lambda s: s / s.sum() if s.sum() > 0 else s)

    for g, gdf in syn.groupby("nace2"):
        idx = gdf.index
        opts = own_sh[own_sh["nace2"] == g]
        if opts.empty:
            continue
        cats = opts["firm_owner"].astype(str).tolist()
        probs= opts["share"].astype(float).to_numpy()
        probs = probs / probs.sum() if probs.sum() > 0 else np.ones_like(probs)/len(probs)
        draws = rng.choice(cats, size=len(idx), p=probs, replace=True)
        syn.loc[idx, "firm_owner"] = draws

for col in ["emp", "age"]:
    if col in syn.columns:
        vals = pd.to_numeric(syn[col], errors="coerce")          # ensure numeric
        vals = np.rint(vals)                                     # round to nearest int
        vals = np.clip(vals, 0, None)                            # no negatives
        syn[col] = pd.Series(vals).astype("Int64")  


In [51]:
lab = pd.read_excel("data/nace2_labels.xlsx")
lab["nace2"] = lab["nace2"].apply(str)
lab["nace2"] = lab["nace2"].str.extract(r"(\d+)", expand=False).fillna("").str.zfill(2).str[:2]

syn = syn.merge(lab, on="nace2", how="left")
syn["nace2_name_code"] = syn["name_hu"].fillna("NACE " + syn["nace2"]) + " (" + syn["nace2"] + ")"


In [52]:
# === Simulate sales growth from estimated regression (insert before final export) ===
import pandas as pd
import numpy as np
from pathlib import Path

# Files exported by your estimation step (edit if different)
coef_by_industry_path = "data/input/simple_params_1030/cs_2019_reg_growth_G2_coefs_by_nace2.csv"
coef_overall_path     = "data/input/simple_params_1030/cs_2019_reg_growth_G2_coefs.csv"

# Fallback residual SD if not provided
DEFAULT_SIGMA = 0.25
rng = np.random.default_rng(42)

# Ensure needed columns / types
df = syn.copy()
if "nace2" in df.columns:
    df["nace2"] = df["nace2"].astype(str)
else:
    raise ValueError("`nace2` column is required in df for per-industry growth simulation.")

if "sales_clean" not in df.columns:
    raise ValueError("`sales_clean` column is missing from df.")

# Core regressor used in growth model (adjust/add others if your model has them)
df["ln_sales"] = np.log(np.clip(df["sales_clean"].astype(float), 1e-9, None))

# Load coefficients: prefer by-industry, else overall
use_by_industry = Path(coef_by_industry_path).exists()
if use_by_industry:
    coefs_raw = pd.read_csv(coef_by_industry_path)
    if not {"nace2","term","estimate"}.issubset(coefs_raw.columns):
        raise ValueError("growth_lm_by_nace2.csv must have columns: nace2, term, estimate")
    # optional residual SD per industry
    sig_by = (
        coefs_raw.loc[coefs_raw["term"]=="__sigma__", ["nace2","estimate"]]
        .rename(columns={"estimate":"sigma"})
        .assign(nace2=lambda d: d["nace2"].astype(str))
    )
    coefs = coefs_raw.loc[coefs_raw["term"]!="__sigma__", ["nace2","term","estimate"]].copy()
else:
    if not Path(coef_overall_path).exists():
        raise FileNotFoundError("No growth coefficient file found. Expected one of:\n"
                                f"- {coef_by_industry_path}\n- {coef_overall_path}")
    coefs_all = pd.read_csv(coef_overall_path)
    if not {"term","estimate"}.issubset(coefs_all.columns):
        raise ValueError("growth_lm_overall.csv must have columns: term, estimate")
    sig_overall = coefs_all.loc[coefs_all["term"]=="__sigma__", "estimate"]
    sigma_overall = float(sig_overall.iloc[0]) if len(sig_overall)==1 else DEFAULT_SIGMA
    coefs = coefs_all.loc[coefs_all["term"]!="__sigma__", ["term","estimate"]].copy()

# Build predicted growth g_hat = X beta (match terms by column names; intercept "(Intercept)" if present)
g_hat = np.zeros(len(df), dtype=float)

if use_by_industry:
    # loop industries, apply their betas
    for gcode, gcoefs in coefs.groupby("nace2"):
        mask = (df["nace2"] == str(gcode)).values
        if not mask.any():
            continue
        part = np.zeros(mask.sum(), dtype=float)
        # intercept
        if (gcoefs["term"] == "(Intercept)").any():
            b0 = float(gcoefs.loc[gcoefs["term"]=="(Intercept)","estimate"].iloc[0])
            part += b0
        # other terms
        for _, r in gcoefs.iterrows():
            term = r["term"]
            if term == "(Intercept)":
                continue
            beta = float(r["estimate"])
            if term in df.columns:
                part += beta * df.loc[mask, term].astype(float).to_numpy()
            # silently ignore missing terms -> contribute 0
        g_hat[mask] = part

    # residual SD per industry if provided; else default
    if 'sig_by' in locals() and len(sig_by) > 0:
        sigma_map = dict(zip(sig_by["nace2"], sig_by["sigma"]))
        sigma_vec = np.array([sigma_map.get(str(c), DEFAULT_SIGMA) for c in df["nace2"]], dtype=float)
    else:
        sigma_vec = np.full(len(df), DEFAULT_SIGMA, dtype=float)

else:
    # overall model
    if (coefs["term"] == "(Intercept)").any():
        g_hat += float(coefs.loc[coefs["term"]=="(Intercept)","estimate"].iloc[0])
    for _, r in coefs.iterrows():
        term = r["term"]
        if term == "(Intercept)":
            continue
        beta = float(r["estimate"])
        if term in df.columns:
            g_hat += beta * df[term].astype(float).to_numpy()
    sigma_vec = np.full(len(df), sigma_overall, dtype=float)

# Simulate growth and implied next-period sales
eps = rng.normal(loc=0.0, scale=sigma_vec, size=len(df))
df["growth_sim"] = g_hat + eps                               # ln(S_{t+1}) - ln(S_t)
df["ln_sales_lead_sim"] = df["ln_sales"] + df["growth_sim"]  # ln S_{t+1}
df["sales_lead_sim"]    = np.exp(df["ln_sales_lead_sim"])    # S_{t+1}

# (Optional) keep integers for count-like vars
if "emp" in df.columns:
    df["emp"] = pd.to_numeric(df["emp"], errors="coerce").round().astype("Int64")
if "age" in df.columns:
    df["age"] = pd.to_numeric(df["age"], errors="coerce").round().astype("Int64")

# --- continue with your existing export right after this cell ---


In [53]:
syn = df
syn[["sales_clean","sales_lead_sim"]].head()

Unnamed: 0,sales_clean,sales_lead_sim
0,18529.912941,19641.117993
1,158273.631374,33006.796531
2,786718.913072,97877.144506
3,69855.839155,39084.585568
4,879128.63282,52071.88286


In [81]:
county_name_correspondances = pd.read_excel("data/county_names_codes.xlsx")
county_name_correspondances = county_name_correspondances.rename(columns={"CODE":"county","NAME":"county_name"})
county_name_correspondances.head()

Unnamed: 0,county_name,county
0,Budapest,1
1,Baranya,2
2,Bács-Kiskun,3
3,Békés,4
4,Borsod-Abaúj-Zemplén,5


In [82]:
syn["county"] = syn["county"].astype("float").astype("int")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syn["county"] = syn["county"].astype("float").astype("int")


In [83]:
syn["county"].dtype

dtype('int64')

In [None]:
syn = syn.merge(county_name_correspondances,how="left",on="county")

Unnamed: 0,nace2,row_id,age,tanass_clean,eszk,sales_clean,pretax,persexp_clean,satok,export_value,...,region,county,firm_owner,name_hu,nace2_name_code,ln_sales,growth_sim,ln_sales_lead_sim,sales_lead_sim,county_name
0,10,0,0,66772.594760,29923.832264,18529.912941,-37.104270,94.336264,-9188.041831,1.180157e+06,...,1.0,12,domestic,Élelmiszergyártás,Élelmiszergyártás (10),9.827142,0.058239,9.885381,19641.117993,Nógrád
1,10,1,25,10151.716078,78304.886780,158273.631374,12682.838454,86948.076892,8450.160706,3.265402e+06,...,2.0,11,domestic,Élelmiszergyártás,Élelmiszergyártás (10),11.972081,-1.567612,10.404469,33006.796531,Komárom-Esztergom
2,10,2,25,151.088562,113422.626325,786718.913072,4273.047811,60718.671378,737127.712029,4.545077e+07,...,1.0,1,domestic,Élelmiszergyártás,Élelmiszergyártás (10),13.575626,-2.084158,11.491468,97877.144506,Budapest
3,10,3,1,107.778810,27571.399007,69855.839155,-187.071126,6329.708135,18738.391276,5.996941e+05,...,1.0,4,domestic,Élelmiszergyártás,Élelmiszergyártás (10),11.154189,-0.580706,10.573483,39084.585568,Békés
4,10,4,11,5465.810743,250913.386580,879128.632820,8007.565266,66960.188081,152867.314478,7.389195e+06,...,6.0,2,foreign,Élelmiszergyártás,Élelmiszergyártás (10),13.686687,-2.826306,10.860380,52071.882860,Baranya
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414913,99,25,22,5637.313935,33748.108614,8349.250903,16096.562787,65.424295,16763.883115,9.407511e+04,...,7.0,8,domestic,Területen kívüli szervezet,Területen kívüli szervezet (99),9.029927,1.677681,10.707608,44694.596360,Győr-Moson-Sopron
414914,99,26,15,387.315369,10969.726738,2.570055,-23867.588024,300.247527,1971.378070,5.480631e+03,...,3.0,1,domestic,Területen kívüli szervezet,Területen kívüli szervezet (99),0.943927,8.921645,9.865572,19255.892366,Budapest
414915,99,27,12,0.000000,692.838910,0.000000,-7279.129478,0.000000,-9856.058545,1.014389e+04,...,1.0,5,domestic,Területen kívüli szervezet,Területen kívüli szervezet (99),-20.723266,26.767712,6.044446,421.764106,Borsod-Abaúj-Zemplén
414916,99,28,8,428.489215,14925.762420,1.638709,-10428.842289,0.000000,6677.473304,1.893923e+04,...,6.0,1,domestic,Területen kívüli szervezet,Területen kívüli szervezet (99),0.493909,9.144792,9.638701,15347.390228,Budapest


In [85]:
syn = syn[syn["nace2"] != "ALL"]
print("Synthetic with dummies + categoricals:", syn.shape)
print(syn.dtypes.head(12))

# Quick share checks (simulated vs target) for one group
g0 = syn["nace2"].unique()[0]
print("\nExample shares — group", g0)
sim_row = syn[syn["nace2"]==g0]
print("export", sim_row["has_export"].mean(), 
      "grant", sim_row["has_grant"].mean(), 
      "exit",  sim_row["exit"].mean())

# Save
Path("data/synthetic").mkdir(parents=True, exist_ok=True)
syn.to_parquet("data/synthetic/sim_cs2019_by_nace2_withcats.parquet", index=False)
# syn.to_csv("data/synthetic/sim_cs2019_by_nace2_withcats.csv", index=False)


Synthetic with dummies + categoricals: (414918, 25)
nace2             object
row_id             int64
age                Int64
tanass_clean     float64
eszk             float64
sales_clean      float64
pretax           float64
persexp_clean    float64
satok            float64
export_value     float64
ereduzem         float64
emp                Int64
dtype: object

Example shares — group 10
export 0.06861198738170347 grant 0.03680336487907466 exit 0.11277602523659307


In [55]:
syn["nace2"].value_counts()

nace2
ALL    414918
47      44539
68      36993
46      30452
43      21576
        ...  
98          6
34          3
76          2
57          1
67          1
Name: count, Length: 85, dtype: int64

In [56]:
syn["nace2_name_code"].value_counts()

nace2_name_code
NACE ALL (ALL)                                                         414918
Kiskereskedelem (kivéve: gépjármű, motorkerékpár) (47)                  44539
INGATLANÜGYLETEK (68)                                                   36993
Nagykereskedelem (kivéve: jármű, motorkerékpár) (46)                    30452
Speciális szaképítés (43)                                               21576
                                                                        ...  
Háztartás termék-előállítása, szolgáltatása saját fogyasztásra (98)         6
NACE 34 (34)                                                                3
NACE 76 (76)                                                                2
NACE 57 (57)                                                                1
NACE 67 (67)                                                                1
Name: count, Length: 85, dtype: int64

In [86]:
syn.columns

Index(['nace2', 'row_id', 'age', 'tanass_clean', 'eszk', 'sales_clean',
       'pretax', 'persexp_clean', 'satok', 'export_value', 'ereduzem', 'emp',
       'liabilities', 'has_export', 'has_grant', 'exit', 'region', 'county',
       'firm_owner', 'name_hu', 'nace2_name_code', 'ln_sales', 'growth_sim',
       'ln_sales_lead_sim', 'sales_lead_sim'],
      dtype='object')

Unnamed: 0,nace2,row_id,age,tanass_clean,eszk,sales_clean,pretax,persexp_clean,satok,export_value,...,exit,region,county,firm_owner,name_hu,nace2_name_code,ln_sales,growth_sim,ln_sales_lead_sim,sales_lead_sim
0,10,0,0,66772.594760,2.992383e+04,18529.912941,-37.104270,94.336264,-9188.041831,1.180157e+06,...,0,1.0,12.0,domestic,Élelmiszergyártás,Élelmiszergyártás (10),9.827142,0.058239,9.885381,19641.117993
1,10,1,25,10151.716078,7.830489e+04,158273.631374,12682.838454,86948.076892,8450.160706,3.265402e+06,...,1,2.0,11.0,domestic,Élelmiszergyártás,Élelmiszergyártás (10),11.972081,-1.567612,10.404469,33006.796531
2,10,2,25,151.088562,1.134226e+05,786718.913072,4273.047811,60718.671378,737127.712029,4.545077e+07,...,0,1.0,1.0,domestic,Élelmiszergyártás,Élelmiszergyártás (10),13.575626,-2.084158,11.491468,97877.144506
3,10,3,1,107.778810,2.757140e+04,69855.839155,-187.071126,6329.708135,18738.391276,5.996941e+05,...,0,1.0,4.0,domestic,Élelmiszergyártás,Élelmiszergyártás (10),11.154189,-0.580706,10.573483,39084.585568
4,10,4,11,5465.810743,2.509134e+05,879128.632820,8007.565266,66960.188081,152867.314478,7.389195e+06,...,0,6.0,2.0,foreign,Élelmiszergyártás,Élelmiszergyártás (10),13.686687,-2.826306,10.860380,52071.882860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829831,ALL,414913,10,27048.708160,3.218727e+06,106417.416774,2274.924108,1385.943617,84795.594718,1.166794e+06,...,0,5.0,1.0,domestic,,NACE ALL (ALL),11.575125,-0.856614,10.718510,45184.548969
829832,ALL,414914,26,379150.253431,1.536827e+05,22652.414229,-4840.886736,712.119993,22031.746310,1.153341e+04,...,0,1.0,6.0,domestic,,NACE ALL (ALL),10.028022,0.034348,10.062370,23443.995167
829833,ALL,414915,2,0.000000,2.877301e+03,3632.674157,16059.407579,544.090781,3772.882098,3.679572e+03,...,0,4.0,1.0,domestic,,NACE ALL (ALL),8.197724,0.488532,8.686257,5920.976909
829834,ALL,414916,23,17369.334923,1.695772e+05,11301.756419,12819.587776,0.000000,3428.827869,0.000000e+00,...,1,1.0,1.0,domestic,,NACE ALL (ALL),9.332713,0.200988,9.533702,13817.644585
