In [16]:
import optuna
import wandb
import torch
import os
import numpy as np
import pandas as pd
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.stats import entropy
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, log_loss, roc_auc_score
from model import LogisticRegressionModel, MLPModel
from tools import train, eval, prepare_data

In [8]:
# Pooled training file and the shared test sample.
train_data, test_data = (
    pd.read_csv(os.path.join("data", csv_name))
    for csv_name in ("IID.csv", "TEST_SAMPLE.csv")
)

# Six per-client files df1.csv ... df6.csv
# (presumably the IID partitions, judging by the "df*_iid" keys used later).
df_1, df_2, df_3, df_4, df_5, df_6 = (
    pd.read_csv(os.path.join("data", f"df{k}.csv")) for k in range(1, 7)
)

# Six per-client files noniid_df_1.csv ... noniid_df_6.csv.
(
    non_iid_df_1,
    non_iid_df_2,
    non_iid_df_3,
    non_iid_df_4,
    non_iid_df_5,
    non_iid_df_6,
) = (
    pd.read_csv(os.path.join("data", f"noniid_df_{k}.csv")) for k in range(1, 7)
)

In [None]:
def feature_skew_split(df, feature_cols, n_clients=6, n_clusters=10, seed=42):
    """Split ``df`` into per-client frames whose feature distributions differ.

    The rows are shuffled, the ``feature_cols`` columns are standardized and
    clustered with KMeans, and whole clusters are handed to clients. Clusters
    are ranked by size (largest first) and dealt out in contiguous runs, so
    lower-numbered clients receive the larger clusters.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    feature_cols : list
        Columns of ``df`` used for scaling and clustering.
    n_clients : int, default 6
        Number of client frames to produce.
    n_clusters : int, default 10
        Number of clusters requested from KMeans.
    seed : int, default 42
        Seed used for the shuffle, KMeans and the global NumPy RNG.

    Returns
    -------
    dict
        Maps each client id in ``range(n_clients)`` to a copy of its rows
        (all original columns, index taken from the shuffled frame). A client
        that receives no cluster gets an empty frame.
    """
    # Kept so code that runs after this call still sees a seeded global NumPy
    # RNG; the shuffle and KMeans below are seeded explicitly.
    np.random.seed(seed)

    df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    X_scaled = StandardScaler().fit_transform(df_shuffled[feature_cols].values)

    # Labels live in their own Series instead of a temporary 'cluster' column,
    # so an input column that happens to be named 'cluster' is not overwritten
    # and then dropped from the result.
    labels = pd.Series(
        KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(X_scaled),
        index=df_shuffled.index,
    )

    cluster_counts = (
        labels.to_frame('cluster')
        .groupby('cluster').size()
        .reset_index(name='count')
        .sort_values(by='count', ascending=False)
        .reset_index(drop=True)
    )
    ranked_clusters = cluster_counts['cluster'].tolist()

    # Size the split from the clusters actually present, not from n_clusters:
    # KMeans may yield fewer distinct labels than requested (e.g. with heavily
    # duplicated rows), and indexing past the end of the ranking would fail.
    base, extra = divmod(len(ranked_clusters), n_clients)

    cluster_assignment = {}
    start = 0
    for client_id in range(n_clients):
        # The first `extra` clients take one additional cluster each.
        stop = start + base + (1 if client_id < extra else 0)
        for cluster_label in ranked_clusters[start:stop]:
            cluster_assignment[cluster_label] = client_id
        start = stop

    # Resolve every row's owning client once, then slice per client.
    owner = labels.map(cluster_assignment)
    client_dfs = {
        client_id: df_shuffled[owner == client_id].copy()
        for client_id in range(n_clients)
    }
    return client_dfs

In [27]:
# Preview columns 2..11 of the training frame.
train_data.columns[2:12].to_list()

['DayOfWeek',
 'Make',
 'AccidentArea',
 'DayOfWeekClaimed',
 'MonthClaimed',
 'WeekOfMonthClaimed',
 'Sex',
 'MaritalStatus',
 'Age',
 'Fault']

In [36]:
# Cluster on columns 2..18 of the training frame and split it across clients.
clients_dfs = feature_skew_split(train_data, train_data.columns[2:19].to_list())

In [37]:

def plotting(df):
    """Draw per-feature comparison figures across clients.

    For every column of the first client's frame, one figure with three
    panels is shown: overlaid histograms, overlaid KDE curves, and a boxplot
    grouped by client.

    Parameters
    ----------
    df : dict
        Maps a client id to that client's DataFrame. Every frame is expected
        to contain the columns of the first one.

    Returns
    -------
    None
        Figures are displayed with ``plt.show()``.
    """
    features = list(next(iter(df.values())).columns)
    sns.set(style="whitegrid", palette="muted", color_codes=True)

    # The long-format frame for the boxplot does not depend on the feature,
    # so build it once instead of once per figure.
    stacked = pd.concat([df[i].assign(client=f'Client {i}') for i in df])

    for feature in features:
        fig, axes = plt.subplots(1, 3, figsize=(18, 5))
        hist_ax, kde_ax, box_ax = axes

        # Plain loops: the plotting calls are wanted only for their side effects.
        for i in df:
            df[i][feature].hist(ax=hist_ax, alpha=0.5, label=f'Client {i}', bins=20)
        hist_ax.set(title=f'Histogram of {feature}', xlabel=feature, ylabel="Frequency")
        hist_ax.legend()

        for i in df:
            sns.kdeplot(df[i][feature], ax=kde_ax, label=f'Client {i}', fill=True, alpha=0.3)
        kde_ax.set(title=f'KDE Plot of {feature}', xlabel=feature, ylabel="Density")
        kde_ax.legend()

        sns.boxplot(x='client', y=feature, data=stacked, ax=box_ax)
        box_ax.set(title=f'Boxplot of {feature}', xlabel="Client", ylabel=feature)

        plt.suptitle(f'Comparison of {feature} across clients', fontsize=16)
        # Leave headroom at the top so the suptitle does not overlap the panels.
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()

In [39]:
# plotting(clients_dfs)

In [None]:


# Experiment name -> (training frame, test frame). Every entry is evaluated
# against the same shared test sample.
datasets = {
    **{
        f"df{k}_iid": (frame, test_data)
        for k, frame in enumerate((df_1, df_2, df_3, df_4, df_5, df_6), start=1)
    },
    **{
        f"df{k}_noniid": (frame, test_data)
        for k, frame in enumerate(
            (
                non_iid_df_1,
                non_iid_df_2,
                non_iid_df_3,
                non_iid_df_4,
                non_iid_df_5,
                non_iid_df_6,
            ),
            start=1,
        )
    },
}

def objective(trial, train_data, test_data):
    """Optuna objective: fit a logistic-regression classifier and score it on the test set.

    Samples an optimizer ("SGD" or "AdamW") and a log-scaled learning rate.
    Momentum is sampled only for SGD, the one optimizer here that takes it,
    so AdamW trials do not carry an unused hyperparameter. The model is
    trained for 5 epochs with cross-entropy loss and evaluated on the test
    loader; metrics and sampled hyperparameters are sent to ``wandb.log``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters.
    train_data : pandas.DataFrame
        Training frame, passed to ``prepare_data`` unchanged.
    test_data : pandas.DataFrame
        Evaluation frame. Must contain a ``Fraud`` column, which is split off
        and passed to ``prepare_data`` as the target.

    Returns
    -------
    float
        Test ROC-AUC, or ``-inf`` if computing a metric raises ``ValueError``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # prepare_data is a project helper; it is unpacked here as
    # (train_loader, test_loader, input_dim).
    train_loader, test_loader, input_dim = prepare_data(
        train_data, test_data.drop(columns="Fraud"), test_data.Fraud
    )
    model = LogisticRegressionModel(input_dim).to(device)

    optimizer_name = trial.suggest_categorical("optimizer", ["SGD", "AdamW"])
    # suggest_float(log=True) replaces the deprecated suggest_loguniform.
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)

    if optimizer_name == "SGD":
        momentum = trial.suggest_float("momentum", 0.5, 0.99)
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    else:
        momentum = None
        optimizer = optim.AdamW(model.parameters(), lr=lr)

    criterion = nn.CrossEntropyLoss()
    model.train()
    for epoch in range(5):
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

    # Model evaluation
    model.eval()
    test_labels = []
    test_predictions = []
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            # Column 1 of the softmax is used as the positive-class score.
            probabilities = torch.softmax(outputs, dim=1)[:, 1]
            test_labels.extend(y.cpu().numpy())
            test_predictions.extend(probabilities.cpu().numpy())

    # Only the metric computations are guarded: a ValueError from them marks
    # the trial as worst-possible, while errors from logging still surface.
    try:
        test_logloss = log_loss(test_labels, test_predictions)
        test_roc_auc = roc_auc_score(test_labels, test_predictions)
        test_pred_binary = (np.array(test_predictions) > 0.5).astype(int)
        test_accuracy = accuracy_score(test_labels, test_pred_binary)
        test_f1 = f1_score(test_labels, test_pred_binary)
    except ValueError:
        return float("-inf")

    metrics = {
        "lr": lr,
        "optimizer": optimizer_name,
        "roc_auc": test_roc_auc,
        "accuracy": test_accuracy,
        "f1_score": test_f1,
        "loss": test_logloss,
    }
    if momentum is not None:
        metrics["momentum"] = momentum
    wandb.log(metrics)

    return test_roc_auc

# Run one Optuna study, inside its own W&B run, for every entry of `datasets`.
# The loop variables are deliberately not called train_data/test_data: at
# notebook level that would overwrite the frames loaded from IID.csv and
# TEST_SAMPLE.csv, and any cell re-run afterwards would silently use the last
# client's data instead.
for dataset_name, (client_train_df, client_test_df) in datasets.items():
    wandb.init(project="my-first-project", name=dataset_name)
    try:
        study = optuna.create_study(direction="maximize")
        study.optimize(
            # Defaults bind this iteration's frames to the callback.
            lambda trial, tr=client_train_df, te=client_test_df: objective(trial, tr, te),
            n_trials=20,
        )
        print(f"Лучшие параметры для {dataset_name}:", study.best_params)
    finally:
        # Close the W&B run even if the study fails, so the next dataset
        # starts a fresh run.
        wandb.finish()
