### import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

### csv 불러오기

In [None]:
# Load the three competition tables from the working directory:
# training data, test rows to predict, and the submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ("./train.csv", "./test.csv", "./submission_sample.csv")
)

### train에서 선형보간 없애고 새로운 csv로 저장

In [None]:
def flag_linear_interps(df, y='nins', id_col='pv_id', t_col='time',
                        min_run=2, rtol=1e-7, atol=1e-9,
                        slope_eps=1e-8, level_eps=1e-6):
    """Flag rows whose value lies on a straight line between its neighbours.

    The frame is sorted by ``[id_col, t_col]`` and, within each ``id_col``
    group, a row counts as "linear inside" when the step from the previous
    row and the step to the next row are both finite and equal within
    ``atol + rtol * max(|step|)``.  Consecutive linear rows form a run; a
    row is finally flagged when its run has at least ``min_run`` rows and
    the row is not a flat zero (``|y| <= level_eps`` with an incoming step
    of at most ``slope_eps`` in magnitude).

    Returns:
        (tagged, seg) where ``tagged`` is the sorted copy of ``df`` with the
        helper columns ``_lin_inside``, ``_slope_step``, ``_block``,
        ``_run_len`` and the boolean ``is_interp_like``; ``seg`` has one row
        per flagged run with ``start_time``, ``end_time``, ``points`` and the
        median ``slope_per_step``, sorted by ``id_col`` and ``start_time``.
    """
    def _run_ids(flags):
        # A new run starts wherever the flag differs from the previous row.
        return flags.ne(flags.shift()).cumsum()

    frame = df.sort_values([id_col, t_col]).reset_index(drop=True).copy()
    frame[y] = pd.to_numeric(frame[y], errors='coerce').astype('float64')

    per_id_values = frame.groupby(id_col, sort=False)[y]
    step_in = per_id_values.diff()                    # value - previous value
    step_out = per_id_values.shift(-1) - frame[y]     # next value - value

    # Collinearity test: incoming and outgoing steps agree within tolerance.
    slope_change = (step_out - step_in).abs()
    largest_step = np.maximum(step_in.abs(), step_out.abs())
    allowed = atol + rtol * largest_step.fillna(0)
    collinear = np.isfinite(step_in) & np.isfinite(step_out) & (slope_change <= allowed)

    # Flat stretches sitting at zero are excluded from the final flag.
    near_zero_level = frame[y].abs() <= level_eps
    near_zero_slope = step_in.abs() <= slope_eps
    flat_zero = near_zero_level & near_zero_slope

    frame['_lin_inside'] = collinear.values
    frame['_slope_step'] = step_in.values
    frame['_block'] = (
        frame.groupby(id_col, sort=False)['_lin_inside'].transform(_run_ids)
    )
    frame['_run_len'] = (
        frame.groupby([id_col, '_block'])['_lin_inside'].transform('size')
    )

    long_enough = frame['_run_len'] >= min_run
    frame['is_interp_like'] = frame['_lin_inside'] & long_enough & (~flat_zero)

    flagged_rows = frame[frame['is_interp_like']]
    seg = flagged_rows.groupby([id_col, '_block'], as_index=False).agg(
        start_time=(t_col, 'first'),
        end_time=(t_col, 'last'),
        points=('is_interp_like', 'size'),
        slope_per_step=('_slope_step', 'median'),
    )
    seg = seg.sort_values([id_col, 'start_time'])
    return frame, seg

# Tag suspected interpolation runs in the training data; only runs of at
# least 10 rows are flagged.
tagged, segments = flag_linear_interps(
    train,
    y='nins',
    id_col='pv_id',
    t_col='time',
    min_run=10,
    rtol=1e-7,
    atol=1e-9,
    slope_eps=1e-8,
    level_eps=1e-6,
)

# Preferred display order for segment columns, restricted to those present.
cols = [
    name
    for name in (
        'pv_id',
        'left_anchor_time', 'missing_start_time', 'missing_end_time', 'right_anchor_time',
        'start_time', 'end_time',
        'points', 'duration_min', 'slope_per_step',
    )
    if name in segments.columns
]

print("총 보간 의심 구간 수:", len(segments))
print("고유 발전소 수:", segments['pv_id'].nunique())

# Blank out the target on flagged rows instead of dropping them, so the row
# count stays the same as the original table.
train_nan = tagged.copy()
train_nan['nins'] = train_nan['nins'].mask(train_nan['is_interp_like'])

# Remove bookkeeping columns; names that are not present are ignored.
helper_cols = ['_lin_inside', '_slope_step', '_block', '_run_len', '_row',
               'is_interp_like', 'pv_num', '_d2']
train_nan = (
    train_nan
    .drop(columns=helper_cols, errors='ignore')
    .sort_values(['pv_id', 'time'])
    .reset_index(drop=True)
)
train_nan.to_csv('train_clean_ver2.csv', index=False, encoding='utf-8-sig')

print(f"원본 행수: {len(train):,}")
print(f"NaN 처리 포함 행수(동일): {len(train_nan):,}")
print("저장 완료: train_clean_ver2.csv")


### 추론 코드

In [None]:
# Reload the cleaned training table written by the previous cell.
train = pd.read_csv("./train_clean_ver2.csv")

# Plant ids that are never used as interpolation sources.
BLACKLIST = {'PV_ID_49', 'PV_ID_39', 'PV_ID_28'}

def weighted_average_by_time(
    train: pd.DataFrame,
    target_pv: str = 'PV_ID_40',
    power: float = 2.0,
    eps: float = 1e-6,
    topk = 55,
    return_real=True,
    visualize=True
):
    """Predict ``nins`` for one test plant by inverse-distance weighting.

    The target rows are taken from the module-level ``test`` frame
    (``pv_id == target_pv``).  For every timestamp that also occurs in
    ``train`` (excluding plants listed in the module-level ``BLACKLIST``),
    the ``topk`` source rows nearest in ``(coord1, coord2)`` are combined
    with weights ``1 / (distance ** power + eps)``.  Source rows whose
    ``nins`` is NaN get zero weight; if every selected source is NaN the
    prediction for that row is 0 (numerator 0 over a tiny denominator).

    Args:
        train: Source rows with ``pv_id``, ``time``, ``coord1``, ``coord2``
            and ``nins`` columns.
        target_pv: Plant id to predict.
        power: Exponent applied to the distance.
        eps: Added to the powered distance to avoid division by zero.
        topk: Number of nearest sources to use; ``None`` or a value ``<= 0``
            uses all sources available at the timestamp.
        return_real: When true, the second return value is the target's own
            ``nins`` column (which must then exist in ``test``).
        visualize: Accepted for compatibility; not used by this function.

    Returns:
        ``(preds, reals)`` if ``return_real`` else ``(preds, common_times)``.
        ``preds`` has one entry per target row, in target row order; rows
        whose timestamp has no source data are NaN.
    """
    # A fresh positional index lets each group's index be used directly as
    # offsets into ``preds``, with no assumption about ``test``'s own index.
    target = test[test['pv_id'] == target_pv].reset_index(drop=True)
    target_times = target['time'].unique()

    usable = train['time'].isin(target_times) & ~train['pv_id'].isin(BLACKLIST)
    source_f = train[usable]

    # NaN (not uninitialised memory) marks rows that never receive a value.
    preds = np.full(len(target), np.nan, dtype=float)

    src_g = source_f.groupby('time')
    tgt_g = target.groupby('time')

    common_times = np.intersect1d(source_f['time'].unique(), target_times)

    use_all_sources = (topk is None) or (
        isinstance(topk, (int, float)) and topk <= 0
    )

    for t in tqdm(common_times):
        src = src_g.get_group(t)
        tgt = tgt_g.get_group(t)

        S = src[['coord1', 'coord2']].to_numpy(dtype=float)
        y = src['nins'].to_numpy(dtype=float)
        T = tgt[['coord1', 'coord2']].to_numpy(dtype=float)

        # Pairwise Euclidean distances, shape (n_target_rows, n_source_rows).
        D = np.linalg.norm(T[:, None, :] - S[None, :, :], axis=2)

        n_src = S.shape[0]
        if use_all_sources:
            idx_topk = np.tile(np.arange(n_src), (T.shape[0], 1))
        else:
            k = min(int(topk), n_src)
            # argpartition puts the k smallest distances first (unordered).
            idx_topk = np.argpartition(D, kth=k - 1, axis=1)[:, :k]

        Dk = np.take_along_axis(D, idx_topk, axis=1)
        yk = y[idx_topk]
        W = 1.0 / (np.power(Dk, power) + eps)

        # Sources with a missing target contribute neither weight nor value.
        valid = ~np.isnan(yk)
        Wm = np.where(valid, W, 0.0)
        ykm = np.where(valid, yk, 0.0)
        num = (Wm * ykm).sum(axis=1)
        den = Wm.sum(axis=1) + 1e-12
        pred_t = num / den

        preds[tgt.index.to_numpy()] = pred_t

    if return_real:
        reals = target['nins'].to_numpy(dtype=float)
        return preds, reals
    return preds, common_times

def pick_closest_test_pv(current_train: pd.DataFrame, test_df: pd.DataFrame, candidates: list[str]) -> str:
    """Return the candidate test plant nearest to any plant in ``current_train``.

    Each plant is represented by the first non-null ``coord1``/``coord2`` of
    its rows; plants with a missing coordinate are ignored.  A candidate's
    score is its Euclidean distance to the closest training plant, and the
    candidate with the smallest score wins (the earliest one on ties).
    Returns ``None`` when no candidate has usable coordinates in ``test_df``.
    """
    coord_cols = ['coord1', 'coord2']
    train_points = (
        current_train.groupby('pv_id')[coord_cols].first().dropna()
        .to_numpy(dtype=float)
    )
    test_points = test_df.groupby('pv_id')[coord_cols].first().dropna()

    winner = None
    shortest = np.inf
    for pv in candidates:
        if pv in test_points.index:
            x, y = test_points.loc[pv, coord_cols].astype(float).to_numpy()
            here = np.array([x, y], dtype=float)
            nearest = np.linalg.norm(train_points - here, axis=1).min()
            if nearest < shortest:
                winner, shortest = pv, nearest
    return winner

# Predict test plants one at a time, always taking the plant closest to the
# data known so far, and feed each prediction back in as a source for the
# plants that follow.
current_train = train.copy()
remaining = list(test['pv_id'].unique())

step = 0
while remaining:
    target_id = pick_closest_test_pv(current_train, test, remaining)
    if target_id is None:
        # No remaining plant has usable coordinates; stop here rather than
        # failing on remaining.remove(None).
        print(f"stopped: no usable coordinates for {remaining}")
        break
    step += 1

    preds, common_times = weighted_average_by_time(
        current_train,
        target_pv=target_id,
        power=2.0,
        eps=1e-8,
        topk=55,
        return_real=False,
        visualize=False
    )

    # preds has one entry per test row of this plant, in test row order.
    # Keep only entries whose timestamp was actually predicted so lengths
    # match the filtered rows below even when some timestamps had no source.
    target_rows = test[test['pv_id'] == target_id]
    covered = target_rows['time'].isin(common_times).to_numpy()
    preds_covered = np.asarray(preds, dtype=float)[covered]

    # Positional assignment: relies on submission listing this plant's rows
    # in the same order as test.
    mask_sub = (submission['pv_id'] == target_id) & (submission['time'].isin(common_times))
    submission.loc[mask_sub, 'nins'] = preds_covered

    add_df = target_rows[covered].copy()
    add_df['nins'] = preds_covered
    current_train = pd.concat([current_train, add_df], ignore_index=True)

    print(f"[{step}] added {target_id} (|times|={len(common_times)}) → train size: {len(current_train)}")
    remaining.remove(target_id)

In [None]:
def two_sided_ma3(g):
    """Return a copy of ``g`` with a smoothed ``nins_smooth`` column.

    ``g['nins']`` is smoothed in three passes, each a 3-row moving average
    that shrinks its window at the edges (``min_periods=1``): a centered
    pass, then a trailing pass, then a trailing pass over the reversed
    series (i.e. a leading pass in the original order).

    The input frame is left untouched; the result is built with ``assign``
    because mutating the group object handed in by ``groupby.apply`` is
    discouraged by pandas.
    """
    raw = g['nins']
    centered = raw.rolling(window=3, center=True, min_periods=1).mean()
    trailing = centered.rolling(window=3, min_periods=1).mean()
    # Reverse, smooth, reverse back: a look-ahead average in original order.
    leading = trailing[::-1].rolling(window=3, min_periods=1).mean()[::-1]
    return g.assign(nins_smooth=leading)

# Smooth every plant's predicted series in time order.
df = (
    submission
    .sort_values(['pv_id', 'time'])
    .groupby('pv_id', group_keys=False)
    .apply(two_sided_ma3)
)

# Assignment aligns on the index, so the sorted order above does not
# reorder the submission rows.
submission['nins'] = df['nins_smooth']
submission.to_csv("my_submission.csv", index=False)
