In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

In [None]:
# Training catalogue. Forward slashes keep these relative paths valid on every
# OS and avoid the invalid "\d" / "\S" / "\X" escape sequences that the
# previous backslash literals contained.
df = pd.read_csv("../data/SERSIC/df_train.csv")

# Augmented archive; only its "pos" array is read here.
# NOTE(review): presumably one (dx, dy) row per augmented sample — confirm.
data = np.load("../data/SERSIC/X_train_augmented_x30.npz")
y_train = data["pos"]

# Repeat every catalogue row 30 times, back to back, with a fresh 0..N-1 index.
# NOTE(review): assumes the archive keeps the 30 augmentations of a source row
# contiguous and in catalogue order — confirm against the augmentation script.
df_repetido = df.loc[df.index.repeat(30)].reset_index(drop=True)

# Per-row norm of the (dx, dy) columns of the original catalogue.
pos = torch.tensor(df[["dx","dy"]].values)
dist_host_pix = torch.norm(pos, dim=1)

# Same per-row norm for the augmented positions.
new_pos = torch.tensor(y_train)
new_dist_host_pix = torch.norm(new_pos, dim=1)

In [None]:
# Pasquet training catalogue. Forward slashes keep these relative paths valid
# on every OS and avoid the invalid "\d" / "\S" / "\X" escape sequences that
# the previous backslash literals contained.
df_pasquet = pd.read_csv("../data/SERSIC/df_pasquet_train.csv")

# Augmented archive; only its "pos" array is read here.
# NOTE(review): presumably one (dx, dy) row per augmented sample — confirm.
data_pasquet = np.load("../data/SERSIC/X_train_pasquet_augmented_x10.npz")
y_train_2 = data_pasquet["pos"]

# Repeat every catalogue row 10 times, back to back, with a fresh 0..N-1 index.
df_pasquet_repetido = df_pasquet.loc[df_pasquet.index.repeat(10)].reset_index(drop=True)

# Per-row norm of the augmented positions.
new_pos_2 = torch.tensor(y_train_2)
new_dist_host_pix_2 = torch.norm(new_pos_2, dim=1)

In [None]:
def get_balance_mask(df, seed=0, scale="linear", *, n_edges=12,
                     max_arcsec=300 * 0.25, radius_factor=3, keep_from=40):
    """Build a boolean row mask that down-samples the small-radius bins.

    ``df["rSerRadius"] * radius_factor`` is binned into ``n_edges - 1``
    left-closed bins spanning ``[0, max_arcsec)``. Every non-empty bin whose
    left edge is below ``keep_from`` keeps only a random fraction of its rows;
    the fraction grows from the lowest to the highest such bin. Bins whose left
    edge is at or above ``keep_from`` keep all of their rows. Rows that fall in
    no bin (NaN, negative, or ``>= max_arcsec``) are never selected.

    Unlike the previous implementation, ``df`` is not modified: no helper
    ``bin`` / ``bin_left`` columns are added to the caller's frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``rSerRadius`` column.
    seed : int
        ``random_state`` passed to ``DataFrame.sample`` for every bin.
    scale : str
        ``"linear"`` spaces the kept fractions linearly from 0.05 to 0.5;
        any other value spaces them logarithmically from 0.01 to 0.5.
    n_edges : int
        Number of bin edges (``n_edges - 1`` bins).
    max_arcsec : float
        Upper edge of the last bin.
    radius_factor : float
        Multiplier applied to ``rSerRadius`` before binning.
    keep_from : float
        Bins whose left edge is >= this value are kept in full.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like ``df``; True marks a kept row.
    """
    edges = np.linspace(0, max_arcsec, n_edges)

    # Bin on a local Series so the caller's DataFrame is left untouched.
    binned = pd.cut(df["rSerRadius"] * radius_factor, bins=edges, right=False)
    codes = binned.cat.codes.to_numpy()  # -1 marks rows outside every bin
    # Left edges exactly as pandas labels them (it rounds them for display).
    lefts = np.asarray(binned.cat.categories.left, dtype=float)
    in_range = codes >= 0

    mask = pd.Series(False, index=df.index)

    # Non-empty bins below the threshold, ordered by increasing left edge.
    low_codes = [c for c in np.unique(codes[in_range]) if lefts[c] < keep_from]
    n_bins = len(low_codes)
    if scale == "linear":
        fracs = np.linspace(0.05, 0.5, n_bins)
    else:
        fracs = np.logspace(np.log10(0.01), np.log10(0.5), n_bins)

    for code, frac in zip(low_codes, fracs):
        rows_in_bin = df.loc[codes == code]
        n_samples = int(len(rows_in_bin) * frac)
        sampled_idx = rows_in_bin.sample(
            n=n_samples, replace=False, random_state=seed
        ).index
        mask.loc[sampled_idx] = True

    # Bins at or above the threshold are kept in full.
    keep_all = in_range & (lefts[codes] >= keep_from)
    mask.loc[df.index[keep_all]] = True

    return mask

In [None]:
def gen_mask(df, mask, n_aug=30):
    """Expand a per-row selection of ``df`` to the augmented-sample layout.

    Each selected row with integer index label ``i`` marks the ``n_aug``
    consecutive positions ``i * n_aug .. i * n_aug + n_aug - 1`` in the result.

    The previous version built the result over ``len(df)`` positions, so it
    was ``n_aug`` times too short to index the augmented arrays; it also
    raised when no row was selected.

    Parameters
    ----------
    df : pandas.DataFrame
        Un-augmented frame; its index labels are used as row positions, so a
        default 0..N-1 integer index is expected.
    mask : boolean mask accepted by ``df[mask]``
        Selection over the rows of ``df``.
    n_aug : int
        Number of augmented samples per source row.

    Returns
    -------
    numpy.ndarray
        Boolean array of length ``len(df) * n_aug``.
    """
    selected_rows = np.asarray(df[mask].index)
    # Broadcasting yields an (n_selected, n_aug) grid of augmented positions;
    # it also covers the empty-selection case that np.hstack([]) rejected.
    idx_tiny = (selected_rows[:, None] * n_aug + np.arange(n_aug)).ravel()

    mask_tiny = np.isin(np.arange(len(df) * n_aug), idx_tiny)

    return mask_tiny

In [None]:
# Per-row balance masks over the original catalogue, built in the same order
# as before: log-spaced fractions first, then linearly spaced ones.
log_mask, linear_mask = (
    get_balance_mask(df, scale=spacing) for spacing in ("log", "linear")
)

In [None]:
# Expand each per-row mask to the augmented-sample layout.
log_mask2, linear_mask2 = (
    gen_mask(df, row_mask) for row_mask in (log_mask, linear_mask)
)

In [None]:
# Histogram edges shared by both panels.
n = 12
bins_arcsec = np.linspace(0, 300 * 0.25, n)

# Four colours sampled from the 'cool' colormap; this figure uses the first and last.
colors = plt.cm.cool(np.linspace(0, 1, 4))

fig, axs = plt.subplots(1, 2, figsize=(12, 5), sharey=True, dpi=300)

# Drawing options common to every histogram in the figure.
step_style = dict(bins=bins_arcsec, histtype="step", linewidth=3)

# Left panel: rSerRadius * 3 for both augmented catalogues.
for hist_values, hist_color, hist_label in [
    (df_pasquet_repetido["rSerRadius"]*3, colors[0], "Augmented x10 Pasquet"),
    (df_repetido["rSerRadius"]*3, colors[3], "Augmented x30"),
]:
    axs[0].hist(hist_values, color=hist_color, label=hist_label, **step_style)

axs[0].set_xlabel("Sérsic Radius [\"]")
axs[0].set_ylabel("# ejemplos")
axs[0].set_yscale("log")
axs[0].legend()
axs[0].set_title("Distribución de Sérsic Radius")

# Right panel: host distance norms scaled by 0.25 for both augmented sets.
for hist_values, hist_color, hist_label in [
    (new_dist_host_pix_2*0.25, colors[0], "Augmented x10 Pasquet"),
    (new_dist_host_pix*0.25, colors[3], "Augmented x30"),
]:
    axs[1].hist(hist_values, color=hist_color, label=hist_label, **step_style)

axs[1].set_xlabel("Host Dist [\"]")
axs[1].set_yscale("log")
axs[1].legend()
axs[1].set_title("Distribución de Distancia al Host")

plt.tight_layout()
plt.show()

In [None]:
# Histogram edges shared by both panels.
n = 12
bins_arcsec = np.linspace(0, 300 * 0.25, n)

# Four colours sampled from the 'cool' colormap, one per data set.
colors = plt.cm.cool(np.linspace(0, 1, 4))

fig, axs = plt.subplots(1, 2, figsize=(12, 5), sharey=True, dpi=300)

# Drawing options common to every histogram in the figure.
step_style = dict(bins=bins_arcsec, histtype="step", linewidth=3)

# Legend labels, in drawing order; colors[i] is paired with set_labels[i].
set_labels = ["Original", "Balanced v1", "Balanced v2", "Augmented x30"]

# Left panel: rSerRadius * 3 for the original, the two balanced selections
# and the x30-repeated catalogue.
radius_sets = [
    df["rSerRadius"]*3,
    df[linear_mask]["rSerRadius"]*3,
    df[log_mask]["rSerRadius"]*3,
    df_repetido["rSerRadius"]*3,
]
for hist_values, hist_color, hist_label in zip(radius_sets, colors, set_labels):
    axs[0].hist(hist_values, color=hist_color, label=hist_label, **step_style)

axs[0].set_xlabel("Sérsic Radius [\"]")
axs[0].set_ylabel("# ejemplos")
axs[0].set_yscale("log")
axs[0].legend()
axs[0].set_title("Distribución de Sérsic Radius")

# Right panel: host distance norms scaled by 0.25; the balanced curves
# select augmented samples through the expanded masks.
dist_sets = [
    (dist_host_pix*0.25),
    (new_dist_host_pix*0.25)[linear_mask2],
    (new_dist_host_pix*0.25)[log_mask2],
    new_dist_host_pix*0.25,
]
for hist_values, hist_color, hist_label in zip(dist_sets, colors, set_labels):
    axs[1].hist(hist_values, color=hist_color, label=hist_label, **step_style)

axs[1].set_xlabel("Host Dist [\"]")
axs[1].set_yscale("log")
axs[1].legend()
axs[1].set_title("Distribución de Distancia al Host")

plt.tight_layout()
plt.show()