In [1]:
import numpy as np
import itertools
import pandas as pd
from math import ceil
from typing import List, Tuple, Set, Optional, Dict
import random
import time
from math import comb

In [2]:
df = pd.read_csv("data.csv", index_col=0)
df.tail()

Unnamed: 0,m1_0001,m1_0002,m1_0003,m1_0004,m1_0005,m1_0006,m1_0007,m1_0008,m1_0009,m1_0010,...,m1_1431,m1_1432,m1_1433,m1_1434,m1_1435,m1_1436,m1_1437,m1_1438,m1_1439,m1_1440
28,20.4946,21.947688,18.980332,21.228592,19.8051,20.879488,20.699952,17.79582,14.51408,12.34352,...,36.65368,21.7091,24.159072,25.510604,24.901772,17.027136,9.634464,6.302212,6.38232,8.345176
29,173.283012,160.870948,151.662308,138.247648,153.492276,147.209272,161.401408,144.81544,136.657136,152.239164,...,119.536396,80.533376,149.231292,146.60618,152.06702,147.2478,157.819648,170.120552,139.415808,138.99382
30,53.779012,54.77374,48.384392,39.384268,30.545284,42.29666,41.167784,47.725272,43.371468,29.093904,...,31.8493,20.235264,37.124892,29.908788,21.70462,26.871096,42.599844,40.83324,55.500592,54.3053
31,7.52864,4.775988,4.111296,3.214876,2.343404,6.64622,3.0191,4.129328,4.455248,2.295888,...,8.70814,4.53698,3.364172,4.013436,5.977384,4.663596,6.676152,5.798912,7.241248,9.090592
32,6.900628,4.594548,4.295564,3.189032,4.03508,4.543616,3.45968,3.32486,4.520684,3.918796,...,26.703768,4.234524,4.833808,3.721984,3.550232,5.1604,2.845752,4.53936,3.67248,3.271828


In [3]:
def add_noise(S: np.ndarray) -> np.ndarray:
    S = np.asarray(S, dtype=float)
    noise = np.random.random(size=S.shape) * (0.1 * S)
    return S + noise

def compute_k_and_rss(x_list: List[np.ndarray], S_prime: np.ndarray, subset: Tuple[int, ...]) -> Tuple[float, List[Optional[float]]]:
    T = S_prime.shape[0]
    n = len(x_list)
    X_all = x_list

    if len(subset) == 0:
        residuals = S_prime - sum(X_all)
        RSS = float(np.sum(residuals**2))
        k_full = [1.0] * n
        return RSS, k_full

    not_in_C = [i for i in range(n) if i not in subset]
    y = S_prime.copy()
    for i in not_in_C:
        y = y - X_all[i]

    X_cols = [X_all[i] for i in subset]
    X = np.vstack(X_cols).T

    try:
        beta_hat, residuals, rank, s = np.linalg.lstsq(X, y, rcond=None)
        if residuals.size > 0:
            RSS = float(residuals[0])
        else:
            resid_vec = y - X.dot(beta_hat)
            RSS = float(np.sum(resid_vec**2))
        k_full = [1.0] * n
        for idx, i in enumerate(subset):
            k_full[i] = float(beta_hat[idx])
        return RSS, k_full
    except np.linalg.LinAlgError:
        return float('inf'), [None] * n

In [None]:
def method_n_cheaters(x_list: List[np.ndarray], S_prime: np.ndarray, n_cheaters: int):
    n = len(x_list)
    if n_cheaters < 0 or n_cheaters > n:
        raise ValueError("n_cheaters out of range")
    candidates = list(itertools.combinations(range(n), n_cheaters))
    results = []
    for subset in candidates:
        RSS, k_full = compute_k_and_rss(x_list, S_prime, subset)
        results.append({'subset': set(subset), 'RSS': RSS, 'k_full': k_full})

    results_sorted = sorted(results, key=lambda r: r['RSS'])
    best = results_sorted[0]
    return best['subset']

def method_unknown_cheaters(x_list: List[np.ndarray], S_prime: np.ndarray, tau: float = 0.2, min_delta: float = 0.05):
    X = np.vstack(x_list).T  # T x n
    S = np.asarray(S_prime, dtype=float)
    k_hat, res, rank, s = np.linalg.lstsq(X, S, rcond=None)
    deltas = np.abs(k_hat - 1.0)
    inds = np.where((deltas > tau) & (deltas > min_delta))[0]
    pred = set(inds.tolist()) if inds.size > 0 else None
    return pred

def method_hybrid(x_list: List[np.ndarray], S_prime: np.ndarray, n_cheaters: int,
                  M: Optional[int] = None) -> Optional[Set[int]]:
    n = len(x_list)
    if n_cheaters < 0 or n_cheaters > n:
        raise ValueError("n_cheaters out of range")

    X = np.vstack(x_list).T  # T x n
    S = np.asarray(S_prime, dtype=float)
    try:
        k_hat, res, rank, s = np.linalg.lstsq(X, S, rcond=None)
    except Exception:
        return None

    deltas = np.abs(k_hat - 1.0)
    if M is None:
        M = min(n, max(2 * n_cheaters, 10))
    M = min(M, n)

    top_indices = np.argsort(-deltas)[:M]
    top_indices = list(map(int, top_indices.tolist()))

    if len(top_indices) < n_cheaters:
        best = method_n_cheaters(x_list, S_prime, n_cheaters)
        return set(best) if best is not None else None

    x_cand = [x_list[i] for i in top_indices]
    candidates = list(itertools.combinations(range(len(x_cand)), n_cheaters))
    best_subset = None
    best_RSS = float('inf')
    for sub in candidates:
        subset_orig = tuple(top_indices[idx] for idx in sub)
        RSS, k_full = compute_k_and_rss(x_list, S_prime, subset_orig)
        if RSS < best_RSS:
            best_RSS = RSS
            best_subset = set(subset_orig)

    return best_subset

In [5]:
def add_metrics(pred, metrics, true_set):
    metrics['total'] += 1
    if pred is None:
        metrics['abstained'] += 1
    else:
        pred_set = set(pred) if not isinstance(pred, set) else pred
        if pred_set == true_set:
            metrics['correct'] += 1
        else:
            metrics['incorrect'] += 1

def calculate_metrics(metrics):
    total = metrics['total']
    correct = metrics['correct']
    incorrect = metrics['incorrect']
    abst = metrics['abstained']
    acc = correct / total if total > 0 else 0.0
    fpr = incorrect / total if total > 0 else 0.0
    
    return {
            'total': total, 'correct': correct, 'incorrect': incorrect, 'abstained': abst,
            'accuracy': acc, 'false_positive_rate': fpr
        }

def sample_random_tests(n_rows, group_size, max_tests, n_cheaters):
    rng = np.random.default_rng(42)
    seen = set()
    tests = []

    while len(tests) < max_tests:
        group = tuple(sorted(rng.choice(n_rows, size=group_size, replace=False)))
        cheaters = tuple(sorted(rng.choice(group_size, size=n_cheaters, replace=False)))

        key = (group, cheaters)
        if key in seen:
            continue

        seen.add(key)
        tests.append((group, cheaters))

    return tests

In [6]:
def sample_random_tests(n_rows, group_size, max_tests, n_cheaters):
    rng = np.random.default_rng(42)
    seen = set()
    tests = []

    while len(tests) < max_tests:
        group = tuple(sorted(rng.choice(n_rows, size=group_size, replace=False)))
        cheaters = tuple(sorted(rng.choice(group_size, size=n_cheaters, replace=False)))

        key = (group, cheaters)
        if key in seen:
            continue

        seen.add(key)
        tests.append((group, cheaters))

    return tests

In [48]:
def evaluate_all_groups(df: pd.DataFrame,
                        group_size: int,
                        k: float,
                        n_cheaters: int,
                        max_groups: int,
                        tau: float = 0.2,
                        min_delta: float = 0.05):
    n_rows = df.shape[0]

    tests = sample_random_tests(
        n_rows=n_rows,
        group_size=group_size,
        n_cheaters=n_cheaters,
        max_tests=max_groups
    )

    metrics = {
        'unknow': {'total': 0, 'correct': 0, 'incorrect': 0, 'abstained': 0},
        'hybrid': {'total': 0, 'correct': 0, 'incorrect': 0, 'abstained': 0},
        'know': {'total': 0, 'correct': 0, 'incorrect': 0, 'abstained': 0}
    }

    records = []
    for group, true_subset in tests:
        true_set = set(true_subset)
        x_list = [df.iloc[idx].to_numpy(dtype=float) for idx in group]

        x_ch = [arr.copy() for arr in x_list]
        S_prime = add_noise(sum(x_ch))
        for pos in true_set:
            x_ch[pos] = k * x_ch[pos]

        pred_know = method_n_cheaters(x_ch, S_prime, n_cheaters)
        add_metrics(pred_know, metrics['know'], true_set)

        pred_unknow = method_unknown_cheaters(x_ch, S_prime, tau=tau, min_delta=min_delta)
        add_metrics(pred_unknow, metrics['unknow'], true_set)

        pred_hybrid = method_hybrid(x_ch, S_prime, n_cheaters)
        add_metrics(pred_hybrid, metrics['hybrid'], true_set)

        records.append({
            'group': group,
            'true_subset': true_set,
            'pred_know': pred_know,
            'pred_unknow': pred_unknow,
            'pred_hybrid': pred_hybrid
        })

    df_res = pd.DataFrame.from_records(records)

    final_metrics = {
        'know': calculate_metrics(metrics['know']),
        'unknow': calculate_metrics(metrics['unknow']),
        'hybrid': calculate_metrics(metrics['hybrid']),
    }

    print("Эксперимент закончен. Параметры: group_size =", group_size, "k =", k)
    print(f"n_cheaters = {n_cheaters} | Know accuracy = {final_metrics['know']['accuracy']:.4f} | Unknow accuracy = {final_metrics['unknow']['accuracy']:.4f} | Hybrid accuracy = {final_metrics['hybrid']['accuracy']:.4f}")

    return df_res, final_metrics


In [None]:
all_results = {}

for g in [5, 10, 15, 20]:
    for t in [1, 4]:

        print("\n" + "=" * 80)
        print(f"RUN: group_size = {g}, n_cheaters = {t}, k = {0.2}")
        print("=" * 80)

        df_res, metrics = evaluate_all_groups(
            df=df,
            group_size=g,
            k=0.1,
            n_cheaters=t,
            max_groups=300,
            tau=3,
            min_delta=0.01
        )

        # сохраняем результаты для анализа
        all_results[(g, t)] = {
            "metrics": metrics,
            "details": df_res
        }



RUN: group_size = 5, n_cheaters = 1, k = 0.2
Эксперимент закончен. Параметры: group_size = 5 k = 0.1
n_cheaters = 1 | Know accuracy = 0.9500 | Unknow accuracy = 0.9367 | Hybrid accuracy = 0.9500

RUN: group_size = 5, n_cheaters = 4, k = 0.2
Эксперимент закончен. Параметры: group_size = 5 k = 0.1
n_cheaters = 4 | Know accuracy = 0.9033 | Unknow accuracy = 1.0000 | Hybrid accuracy = 0.9033

RUN: group_size = 10, n_cheaters = 1, k = 0.2
Эксперимент закончен. Параметры: group_size = 10 k = 0.1
n_cheaters = 1 | Know accuracy = 0.8433 | Unknow accuracy = 0.7967 | Hybrid accuracy = 0.8433

RUN: group_size = 10, n_cheaters = 4, k = 0.2
Эксперимент закончен. Параметры: group_size = 10 k = 0.1
n_cheaters = 4 | Know accuracy = 0.8367 | Unknow accuracy = 0.8500 | Hybrid accuracy = 0.8367

RUN: group_size = 15, n_cheaters = 1, k = 0.2
Эксперимент закончен. Параметры: group_size = 15 k = 0.1
n_cheaters = 1 | Know accuracy = 0.7367 | Unknow accuracy = 0.6133 | Hybrid accuracy = 0.7533

RUN: group_si

In [51]:
all_results = {}

for g in [30]:
    for t in [1, 4]:

        print("\n" + "=" * 80)
        print(f"RUN: group_size = {g}, n_cheaters = {t}, k = {0.2}")
        print("=" * 80)

        df_res, metrics = evaluate_all_groups(
            df=df,
            group_size=g,
            k=0.1,
            n_cheaters=t,
            max_groups=300,
            tau=3,
            min_delta=0.01
        )

        # сохраняем результаты для анализа
        all_results[(g, t)] = {
            "metrics": metrics,
            "details": df_res
        }



RUN: group_size = 30, n_cheaters = 1, k = 0.2
Эксперимент закончен. Параметры: group_size = 30 k = 0.1
n_cheaters = 1 | Know accuracy = 0.2933 | Unknow accuracy = 0.1300 | Hybrid accuracy = 0.3133

RUN: group_size = 30, n_cheaters = 4, k = 0.2
Эксперимент закончен. Параметры: group_size = 30 k = 0.1
n_cheaters = 4 | Know accuracy = 0.2333 | Unknow accuracy = 0.1833 | Hybrid accuracy = 0.2733


In [None]:
def tune_tau(df: pd.DataFrame,
             group_size: int,
             n_cheaters: int,
             k: float,
             tau_grid: List[float],
             max_tests: int = 300,
             min_delta: float = 0.05,
             seed: int = 42):

    results = []

    for tau in tau_grid:
        _, metrics = evaluate_all_groups(
            df=df,
            group_size=group_size,
            k=k,
            n_cheaters=n_cheaters,
            max_groups=max_tests,
            tau=tau,
            min_delta=min_delta
        )

        m = metrics['unknow']
        results.append({
            'tau': tau,
            'accuracy': m['accuracy'],
            'false_positive_rate': m['false_positive_rate']
        })

    return pd.DataFrame(results)


In [39]:
tau_grid = np.linspace(1, 5, 20)

df_tau = tune_tau(
    df=df,
    group_size=5,
    n_cheaters=1,
    k=0.01,
    tau_grid=tau_grid,
    max_tests=300
)

df_tau


Эксперимент закончен. Параметры: group_size = 5 k = 0.01
n_cheaters = 1 | Know accuracy = 0.9600 | Unknow accuracy = 0.8933 | Hybrid accuracy = 0.9600
Эксперимент закончен. Параметры: group_size = 5 k = 0.01
n_cheaters = 1 | Know accuracy = 0.9533 | Unknow accuracy = 0.9100 | Hybrid accuracy = 0.9533
Эксперимент закончен. Параметры: group_size = 5 k = 0.01
n_cheaters = 1 | Know accuracy = 0.9567 | Unknow accuracy = 0.9067 | Hybrid accuracy = 0.9567
Эксперимент закончен. Параметры: group_size = 5 k = 0.01
n_cheaters = 1 | Know accuracy = 0.9500 | Unknow accuracy = 0.9200 | Hybrid accuracy = 0.9500
Эксперимент закончен. Параметры: group_size = 5 k = 0.01
n_cheaters = 1 | Know accuracy = 0.9500 | Unknow accuracy = 0.9300 | Hybrid accuracy = 0.9500
Эксперимент закончен. Параметры: group_size = 5 k = 0.01
n_cheaters = 1 | Know accuracy = 0.9567 | Unknow accuracy = 0.9333 | Hybrid accuracy = 0.9567
Эксперимент закончен. Параметры: group_size = 5 k = 0.01
n_cheaters = 1 | Know accuracy = 0.95

Unnamed: 0,tau,accuracy,false_positive_rate
0,1.0,0.893333,0.106667
1,1.210526,0.91,0.09
2,1.421053,0.906667,0.093333
3,1.631579,0.92,0.08
4,1.842105,0.93,0.07
5,2.052632,0.933333,0.066667
6,2.263158,0.913333,0.086667
7,2.473684,0.943333,0.056667
8,2.684211,0.93,0.07
9,2.894737,0.926667,0.073333
