# Stackoverflow

## Data Download

### Download the preprocessed dataset from [FedML](https://github.com/FedML-AI/FedML)

In [None]:
!cd ../benchmark/datasets/stackoverflow && mkdir -pv data/raw
!cd ../benchmark/datasets/stackoverflow/data/raw && wget --no-check-certificate --no-proxy  https://fedml.s3-us-west-1.amazonaws.com/stackoverflow.tag_count.tar.bz2
!cd ../benchmark/datasets/stackoverflow/data/raw && wget --no-check-certificate --no-proxy  https://fedml.s3-us-west-1.amazonaws.com/stackoverflow.word_count.tar.bz2
!cd ../benchmark/datasets/stackoverflow/data/raw && wget --no-check-certificate --no-proxy  https://fedml.s3-us-west-1.amazonaws.com/stackoverflow.tar.bz2
!cd ../benchmark/datasets/stackoverflow/data/raw && wget --no-check-certificate --no-proxy  https://fedml.s3-us-west-1.amazonaws.com/stackoverflow_nwp.pkl
    
!cd ../benchmark/datasets/stackoverflow/data/raw && tar -xvf stackoverflow.tag_count.tar.bz2 && rm -rf stackoverflow.tag_count.tar.bz2
!cd ../benchmark/datasets/stackoverflow/data/raw && tar -xvf stackoverflow.word_count.tar.bz2 && rm -rf stackoverflow.word_count.tar.bz2
!cd ../benchmark/datasets/stackoverflow/data/raw && tar -xvf stackoverflow.tar.bz2 && rm -rf stackoverflow.tar.bz2

### Validate Dataset

In [1]:
from benchmark.datasets.stackoverflow import get_stackoverflow

# Load the dataset once per mode, then print the dataset object followed by
# the shapes of the two tensors that make up its first sample.
for mode in ('tp', 'nwp'):
    dataset = get_stackoverflow('../benchmark/datasets/stackoverflow/data', mode=mode)
    print(dataset)
    x, y = dataset[0]
    print(x.shape, y.shape)

StackOverFlowTP(total_parts: 342477, total_samples: <bound method StackOverFlowTP.total_samples of <benchmark.datasets.stackoverflow.stackoverflow.StackOverFlowTP object at 0x7fb9482ed810>>, current_parts: 0)
torch.Size([10003]) torch.Size([500])
StackOverFlowNWP(total_parts: 342477, total_samples: <bound method StackOverFlowNWP.total_samples of <benchmark.datasets.stackoverflow.stackoverflow.StackOverFlowNWP object at 0x7fb968f52ed0>>, current_parts: 0)
torch.Size([20]) torch.Size([20])


  return torch.tensor(x).float()


## FedAvg, FedSGD, FedEla, FedProx, FedScaffold

Run the following commands from the root directory of `benchmark-lightly`.

```bash
# Launch one federated training run on the stackoverflow task.
#
# Usage: cmd <fed_optim>
#   fed_optim  Value forwarded to --optim; also used to build the experiment
#              name "<fed_optim>_stackoverflow".
# Returns 1 without touching anything when no optimizer name is given.
function cmd(){
    # `local` keeps these variables out of the calling shell.
    local fed_optim="${1:-}"
    local task_name="stackoverflow"

    # Guard: with an empty name the log cleanup below would expand to
    # "./logs/stackoverflow/" and delete the logs of every run.
    if [ -z "${fed_optim}" ]; then
        echo "usage: cmd <fed_optim>" >&2
        return 1
    fi

    local exp_name="${fed_optim}_${task_name}"

    # Delete cache file
    rm -rf "/tmp/${exp_name}.share"
    rm -rf "/tmp/${exp_name}"
    rm -rf "./logs/${task_name}/${fed_optim}"
    # The plotting cells read logs/<task_name>/<exp_name>/, so clear that
    # directory as well to avoid mixing results from a previous run.
    rm -rf "./logs/${task_name}/${exp_name}"

    # Run
    python -m openfed.tools.launch --nproc_per_node 6  --logdir /tmp benchmark/run.py \
        --fed_init_method "file:///tmp/${exp_name}.share" \
        --task "${task_name}" \
        --data_root "benchmark/datasets/${task_name}/data" \
        --epochs 1 \
        --rounds 20 \
        --act_clts 100 \
        --tst_act_clts 100 \
        --max_acg_step -1 \
        --optim "${fed_optim}" \
        --optim_args momentum:0.9 weight_decay:1e-4 \
        --follower_lr 1e-1 \
        --leader_lr 1.0 \
        --bz 10 \
        --gpu \
        --log_level SUCCESS \
        --log_dir logs \
        --exp_name "${exp_name}" \
        --seed 0
}
```

### Run All

```bash
# Run every optimizer one after another, in the same order as before.
for optim in 'fedavg' 'fedsgd' 'fedela' 'fedprox' 'fedscaffold'; do
    cmd "${optim}"
done
```

## Plot Curves

In [None]:
%matplotlib inline

from benchmark.utils.plot import plot

task_name = "synthetic"

items = dict(
    FedAvg=f'../logs/{task_name}/fedavg_{task_name}/{task_name}.json',
    FedSgd=f'../logs/{task_name}/fedsgd_{task_name}/{task_name}.json',
    FedEla=f'../logs/{task_name}/fedela_{task_name}/{task_name}.json',
    FedProx=f'../logs/{task_name}/fedprox_{task_name}/{task_name}.json',
    FedScaffold=f'../logs/{task_name}/fedscaffold_{task_name}/{task_name}.json',
)

files = items.values()
labels = items.keys()

### Train Accuracy

In [None]:
# Accuracy curves recorded in train mode, one per optimizer.
plot(files=files, labels=labels, attributes="accuracy", mode="train")

### Train Loss

In [None]:
# Loss curves recorded in train mode, one per optimizer. The attribute has to
# be "loss" here; "accuracy" would just repeat the Train Accuracy figure.
plot(
    files=files,
    labels=labels,
    attributes="loss",
    mode='train'
)

### Test Accuracy

In [None]:
# Accuracy curves recorded in test mode, one per optimizer.
plot(files=files, labels=labels, attributes="accuracy", mode="test")

### Test Loss

In [None]:
# Loss curves recorded in test mode, one per optimizer.
plot(files=files, labels=labels, attributes="loss", mode="test")