In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv("data.csv", index_col=0)
# Show the last rows as a quick sanity check of what was loaded.
df.tail()

Unnamed: 0,m1_0001,m1_0002,m1_0003,m1_0004,m1_0005,m1_0006,m1_0007,m1_0008,m1_0009,m1_0010,...,m1_1431,m1_1432,m1_1433,m1_1434,m1_1435,m1_1436,m1_1437,m1_1438,m1_1439,m1_1440
28,20.4946,21.947688,18.980332,21.228592,19.8051,20.879488,20.699952,17.79582,14.51408,12.34352,...,36.65368,21.7091,24.159072,25.510604,24.901772,17.027136,9.634464,6.302212,6.38232,8.345176
29,173.283012,160.870948,151.662308,138.247648,153.492276,147.209272,161.401408,144.81544,136.657136,152.239164,...,119.536396,80.533376,149.231292,146.60618,152.06702,147.2478,157.819648,170.120552,139.415808,138.99382
30,53.779012,54.77374,48.384392,39.384268,30.545284,42.29666,41.167784,47.725272,43.371468,29.093904,...,31.8493,20.235264,37.124892,29.908788,21.70462,26.871096,42.599844,40.83324,55.500592,54.3053
31,7.52864,4.775988,4.111296,3.214876,2.343404,6.64622,3.0191,4.129328,4.455248,2.295888,...,8.70814,4.53698,3.364172,4.013436,5.977384,4.663596,6.676152,5.798912,7.241248,9.090592
32,6.900628,4.594548,4.295564,3.189032,4.03508,4.543616,3.45968,3.32486,4.520684,3.918796,...,26.703768,4.234524,4.833808,3.721984,3.550232,5.1604,2.845752,4.53936,3.67248,3.271828


In [4]:
def _scaled_fit_rss(x_scaled, x_fixed, target):
    """Return the RSS of the one-parameter fit ``target ~ k * x_scaled + x_fixed``.

    ``k`` is the closed-form least-squares estimate. The result is NaN when
    ``x_scaled`` has zero energy (``sum(x_scaled**2) == 0``), because ``k``
    cannot be estimated in that case.
    """
    denom = np.sum(x_scaled * x_scaled)
    if denom == 0:
        return np.nan
    k_hat = np.sum(x_scaled * (target - x_fixed)) / denom
    residuals = target - (k_hat * x_scaled + x_fixed)
    return np.sum(residuals ** 2)


def method_least_squares(x1, x2, S_prime):
    """Decide which of two series needs a scale factor to explain ``S_prime``.

    Two competing one-parameter models are fitted by least squares:

    * model 1: ``S_prime ~ k1 * x1 + x2`` (only ``x1`` is rescaled)
    * model 2: ``S_prime ~ x1 + k2 * x2`` (only ``x2`` is rescaled)

    Parameters
    ----------
    x1, x2 : array-like
        The two series; converted to float arrays.
    S_prime : array-like
        The observed total; converted to a float array.

    Returns
    -------
    int or None
        ``0`` if model 1 has the strictly smaller residual sum of squares,
        ``1`` otherwise (ties go to ``1``). ``None`` is returned when either
        RSS is not finite (zero-energy series, empty input, NaN/inf in the
        data): the comparison is then meaningless, and previously such cases
        silently fell through to ``1``.
    """
    x1 = np.asarray(x1, dtype=float)
    x2 = np.asarray(x2, dtype=float)
    S_prime = np.asarray(S_prime, dtype=float)

    RSS1 = _scaled_fit_rss(x1, x2, S_prime)
    RSS2 = _scaled_fit_rss(x2, x1, S_prime)

    # Any comparison against NaN is False, so without this guard an undefined
    # fit would always be reported as 1. Abstain instead.
    if not (np.isfinite(RSS1) and np.isfinite(RSS2)):
        return None

    return 0 if RSS1 < RSS2 else 1

In [5]:
import numpy as np

def add_noise(S, scale=0.1, rng=None):
    """Return ``S`` with element-wise proportional uniform noise added.

    Each element ``s`` becomes ``s + u * (scale * s)``, where ``u`` is drawn
    independently from the uniform distribution on ``[0, 1)``. The perturbation
    therefore always has the same sign as ``s`` (it is not zero-mean), and
    zero entries stay exactly zero.

    Parameters
    ----------
    S : array-like
        Input values; converted to a float array.
    scale : float, default 0.1
        Maximum relative size of the perturbation.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, the legacy global NumPy RNG is
        used (so ``np.random.seed`` still controls it), which is the original
        behaviour. Pass ``np.random.default_rng(seed)`` for reproducible runs.

    Returns
    -------
    numpy.ndarray
        A new float array of the same shape as ``S``.
    """
    S = np.asarray(S, dtype=float)
    if rng is None:
        u = np.random.random(size=S.shape)
    else:
        u = rng.random(S.shape)
    return S + u * (scale * S)


In [6]:
# Run on a single pair
# Rows at positions 30 and 0 are used as the two original series.
x1 = df.iloc[30].to_numpy(dtype=float)
x2 = df.iloc[0].to_numpy(dtype=float)

# Observed total: the sum of both original series, perturbed by add_noise.
S_prime = x1 + x2
S_prime = add_noise(S_prime)

# The first series is passed on scaled by k; the second is passed unchanged.
k=0.7
x1_ch = k * x1
x2_ch = x2.copy()

# A result of 0 means the model with a free coefficient on the first series
# has the smaller residual sum of squares.
method_least_squares(x1_ch, x2_ch, S_prime)

0

In [7]:
# Enumerate all possible pairs for a given coefficient
from itertools import combinations

def evaluate_pairs(df, k):
    """Evaluate ``method_least_squares`` on every unordered pair of rows.

    For each pair of row positions ``(i, j)`` with ``i < j`` two checks are
    run: one where the first row is scaled by ``k`` (label ``0``) and one
    where the second row is scaled by ``k`` (label ``1``). In both checks the
    observed total is the noisy sum of the *unscaled* rows, produced by
    ``add_noise``; the result is therefore random unless ``add_noise`` is
    seeded.

    Parameters
    ----------
    df : pandas.DataFrame
        One series per row; values must be convertible to float.
    k : float
        Scale factor applied to the "cheating" series.

    Returns
    -------
    df_res : pandas.DataFrame
        One row per check with columns ``pairs`` (the ``(i, j)`` positions),
        ``is_first_cheater`` (the label: ``0`` = first row was scaled,
        ``1`` = second row was scaled) and ``pred`` (the method's answer).
    metrics : dict
        Counts and rates. ``total`` includes abstentions (``pred is None``).
        ``false_positive_rate`` is ``incorrect / total``, i.e. the overall
        share of wrong answers; the key name is kept for compatibility.

    Side effects
    ------------
    Prints a short summary of the metrics.
    """
    # Convert the frame once; rows are then cheap views instead of repeated
    # ``df.iloc[...]`` lookups inside the double loop.
    rows = df.to_numpy(dtype=float)
    n_rows = rows.shape[0]

    records = []
    correct = 0
    incorrect = 0
    abstained = 0

    for i, j in combinations(range(n_rows), 2):
        x1 = rows[i]
        x2 = rows[j]

        for first_cheater in (0, 1):
            # Exactly one of the two series is scaled; the other is passed on
            # as an independent copy.
            if first_cheater == 0:
                x1_ch = k * x1
                x2_ch = x2.copy()
            else:
                x1_ch = x1.copy()
                x2_ch = k * x2

            # The observed total is built from the original, unscaled series.
            S_prime = add_noise(x1 + x2)

            pred = method_least_squares(x1_ch, x2_ch, S_prime)

            if pred is None:
                abstained += 1
            elif pred == first_cheater:
                correct += 1
            else:
                incorrect += 1

            records.append({
                'pairs': (i, j),
                'is_first_cheater': first_cheater,
                'pred': pred
            })

    total = len(records)

    # Explicit columns keep the result schema stable even when there are no
    # pairs at all (fewer than two rows).
    df_res = pd.DataFrame(records, columns=['pairs', 'is_first_cheater', 'pred'])

    metrics = {
        'k': k,
        'total': total,
        'correct': correct,
        'incorrect': incorrect,
        'abstained': abstained,
        'accuracy': correct / total if total > 0 else 0.0,
        'false_positive_rate': incorrect / total if total > 0 else 0.0
    }

    print('--- Результаты:')
    print('Параметры: k =', k, '| total checks =', total)
    print('Accuracy =', metrics['accuracy'], '| False positive rate =', metrics['false_positive_rate'])

    return df_res, metrics

In [10]:
# Evaluate every pair of rows with scale factor k = 0.2.
df_res, metrics = evaluate_pairs(df, 0.2)

--- Результаты:
Параметры: k = 0.2 | total checks = 1056
Accuracy = 0.9621212121212122 | False positive rate = 0.03787878787878788


In [11]:
# Display the summary metrics returned by evaluate_pairs.
metrics

{'k': 0.2,
 'total': 1056,
 'correct': 1016,
 'incorrect': 40,
 'abstained': 0,
 'accuracy': 0.9621212121212122,
 'false_positive_rate': 0.03787878787878788}