# Run a single experiment

## On Google Colaboratory, also run:

```
!curl -LO http://download.tensorflow.org/example_images/flower_photos.tgz && \
  tar xzf flower_photos.tgz
  
!pip install -q six scipy Pillow matplotlib scikit-image opencv-python imageio Shapely \
  imgaug vega tqdm
  
!wget https://raw.githubusercontent.com/BraneShop/how-much-data-experiments/master/downsample_data.py && \
  wget https://raw.githubusercontent.com/BraneShop/how-much-data-experiments/master/augment_data.py && \
  wget https://raw.githubusercontent.com/BraneShop/how-much-data-experiments/master/label_folder.py && \
  wget https://raw.githubusercontent.com/BraneShop/how-much-data-experiments/master/retrain.py
 ```
 
 to get all the necessary files onto the instance.

In [1]:
import os
import subprocess

import numpy  as np
import pandas as pd

from downsample_data import main

In [2]:
# Shared settings for the whole notebook; the functions defined below read these globals.
experiments_directory = "small-experiments"  # passed to downsample_data.main; train() uses <this>/<amount> as --image_dir
holdout_directory     = "small-holdout"      # passed to downsample_data.main; evaluate() passes it as --folder
csvs_directory        = "small-csvs"         # --csv_folder for both scripts; the analysis reads results CSVs from here
ckpts_directory       = "small-ckpts"        # train() points --summaries_dir / --output_graph at <this>/<amount>
holdout_percent       = 10                   # forwarded to downsample_data.main as holdout_percent
steps                 = 300 # --how_many_training_steps: 300 for small examples, 4000 for the normal kind.
validation_percentage = 0   # --validation_percentage: 0 for the small kind, 10 for the normal kind
testing_percentage    = 0   # --testing_percentage: 0 for the small kind, 10 for the normal kind

In [3]:
# Create the experiment, holdout and CSV directories if they do not exist yet.
# os.makedirs is used instead of shelling out to `mkdir -p`: it works on every
# platform, handles names containing spaces or shell metacharacters, and raises
# on failure instead of silently returning a non-zero status.
# Note: ckpts_directory is not created here.
for directory in (experiments_directory, holdout_directory, csvs_directory):
    os.makedirs(directory, exist_ok=True)

In [4]:
def downsample(amounts, seed=None):
    """Run ``downsample_data.main`` with this notebook's directory settings.

    Args:
        amounts: forwarded unchanged as the first positional argument of ``main``.
        seed: forwarded as ``seed``; defaults to ``None``.

    The holdout percentage and the experiments/holdout directories are taken
    from the notebook-level configuration. Nothing is returned.
    """
    options = dict(
        seed=seed,
        holdout_percent=holdout_percent,
        experiments_directory=experiments_directory,
        holdout_directory=holdout_directory,
    )
    main(amounts, **options)

### Train

In [5]:
def train(amounts):
    """Run ``python retrain.py`` once for every entry in ``amounts``.

    For each amount the script is given ``<experiments_directory>/<amount>`` as
    ``--image_dir`` and ``<ckpts_directory>/<amount>`` as ``--summaries_dir``,
    with ``graph.pb`` inside that same directory as ``--output_graph``. Step
    count and validation/testing percentages come from the notebook settings.

    Args:
        amounts: iterable of experiment names (converted with ``str``).

    The command is passed to ``subprocess.run`` as an argument list, so no
    shell is involved and paths containing spaces stay a single argument.
    A non-zero exit status is reported with a printed warning; the loop then
    carries on with the remaining amounts.
    """
    for amount in amounts:
        print("Training amount: {}".format(amount))
        run_directory = ckpts_directory + "/" + str(amount)
        command = [ "python", "retrain.py"
                  , "--image_dir",               experiments_directory + "/" + str(amount)
                  , "--log_level",               "1"
                  , "--csv_folder",              csvs_directory
                  , "--summaries_dir",           run_directory
                  , "--output_graph",            run_directory + "/graph.pb"
                  , "--how_many_training_steps", str(steps)
                  , "--validation_percentage",   str(validation_percentage)
                  , "--testing_percentage",      str(testing_percentage)
                  ]
        exit_code = subprocess.run(command).returncode
        if exit_code != 0:
            # os.system used to discard this status, hiding failed runs.
            print("WARNING: retrain.py exited with status {} for amount {}"
                  .format(exit_code, amount))

### Evaluate

In [6]:
def evaluate(amounts):
    """Run ``python label_folder.py`` once for every entry in ``amounts``.

    For each amount the script is given the holdout directory as ``--folder``,
    ``<experiments_directory>_<amount>`` as ``--prefix``, the CSV directory as
    ``--csv_folder`` and ``<ckpts_directory>/<amount>/graph.pb`` as ``--graph``.

    Args:
        amounts: iterable of experiment names (converted with ``str``).

    The command is passed to ``subprocess.run`` as an argument list, so no
    shell is involved and paths containing spaces stay a single argument.
    A non-zero exit status is reported with a printed warning; the loop then
    carries on with the remaining amounts.
    """
    for amount in amounts:
        print("Inferring amount: {}".format(amount))
        command = [ "python", "label_folder.py"
                  , "--folder",     holdout_directory
                  , "--prefix",     experiments_directory + "_" + str(amount)
                  , "--csv_folder", csvs_directory
                  , "--graph",      ckpts_directory + "/" + str(amount) + "/graph.pb"
                  ]
        exit_code = subprocess.run(command).returncode
        if exit_code != 0:
            # os.system used to discard this status, hiding failed runs.
            print("WARNING: label_folder.py exited with status {} for amount {}"
                  .format(exit_code, amount))

### Analyse

In [7]:
def holdout_threshold (run, threshold = 0.5):
    """Load the results CSV for ``run`` and mark rows that clear ``threshold``.

    A row is marked correct when the value in the ``predicted_<true_label>``
    column of that row is strictly greater than ``threshold``.

    Args:
        run: experiment name used to build the CSV file name.
        threshold: cut-off the true label's value has to exceed.

    Returns:
        The loaded DataFrame with an added boolean ``correct`` column.
    """
    results_path = f"{csvs_directory}/{experiments_directory}_{run}-results.csv"
    results = pd.read_csv(results_path)

    def true_label_clears_threshold (row):
        return row["predicted_" + row["true_label"]] > threshold

    results["correct"] = results.apply(true_label_clears_threshold, axis=1)
    return results

def holdout_accuracy_threshold (run, threshold = 0.5):
    """Return the fraction of rows marked correct by ``holdout_threshold``.

    Args:
        run: experiment name, forwarded to ``holdout_threshold``.
        threshold: cut-off, forwarded to ``holdout_threshold``.

    Returns:
        Number of correct rows divided by the number of rows, rounded to two
        decimal places.
    """
    scored = holdout_threshold(run, threshold)
    n_correct = scored["correct"].sum()
    return np.round(n_correct / len(scored), 2)


def holdout_max (run):
    """Load the results CSV for ``run`` and mark rows whose top prediction is right.

    For every row, the ``predicted_<label>`` column with the largest value
    (strictly greater than 0; the first column wins ties) is taken as the
    predicted label and compared with the row's ``true_label``.

    The label is everything after the ``predicted_`` prefix, so labels that
    themselves contain underscores are compared in full. This matches how
    ``holdout_threshold`` builds the column name from the label.

    Args:
        run: experiment name used to build the CSV file name.

    Returns:
        The loaded DataFrame with an added boolean ``correct`` column.
    """
    prefix = "predicted_"
    df = pd.read_csv(f"{csvs_directory}/{experiments_directory}_{run}-results.csv")

    def was_right (row):
        best = ""
        last = 0

        for c in row.keys():
            if not c.startswith(prefix):
                continue
            v = float(row[c])
            if v > last:
                # Strip only the prefix; split("_")[1] would cut a label such
                # as "some_label" down to "some".
                best = c[len(prefix):]
                last = v

        return best == row["true_label"]

    df["correct"] = df.apply(was_right, 1)
    return df

def holdout_accuracy_max (run):
    """Return the fraction of rows marked correct by ``holdout_max``.

    Args:
        run: experiment name, forwarded to ``holdout_max``.

    Returns:
        Number of correct rows divided by the number of rows, rounded to two
        decimal places.
    """
    scored = holdout_max(run)
    n_correct = scored["correct"].sum()
    return np.round(n_correct / len(scored), 2)


In [8]:
def analyse (amounts):
    """Print and collect the three holdout accuracies for each amount.

    For every entry in ``amounts`` this computes the max-based accuracy and the
    threshold-based accuracy at the default threshold and at 0.8, prints them,
    and records them.

    Args:
        amounts: iterable of experiment names.

    Returns:
        A DataFrame with one row per amount and the columns ``max``, ``>0.5``,
        ``>0.8`` and ``experiment``.
    """
    rows = []
    for amount in amounts:
        print(amount)
        by_max    = holdout_accuracy_max(amount)
        over_half = holdout_accuracy_threshold(amount)
        over_80   = holdout_accuracy_threshold(amount, threshold=0.8)

        report = ( ("              max:", by_max)
                 , ("  threshold (0.5):", over_half)
                 , ("  threshold (0.8):", over_80)
                 )
        for label, value in report:
            print(label, value)
        print("")

        rows.append({ "max": by_max
                    , ">0.5": over_half
                    , ">0.8": over_80
                    , "experiment": amount
                    })

    return pd.DataFrame(rows)

In [9]:
# Experiment names to run; each one is passed to downsample(), train(),
# evaluate() and analyse(), where it becomes a directory / file-name component.
amounts = ["1", "3"]
# Number of times the whole downsample -> train -> evaluate -> analyse pipeline is repeated.
trials  = 1

In [10]:
# Run the full pipeline once per trial and keep one results frame per trial.
dfs = []

for trial in range(trials):
    # The stages depend on each other's files, so the order matters.
    downsample(amounts)
    train(amounts)
    evaluate(amounts)
    # Tag every row with the trial number before storing the frame.
    dfs.append(analyse(amounts).assign(trial=trial))

# Stack the per-trial frames into the single table used by the cells below.
df = pd.concat(dfs)

  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [00:00<00:00, 438.69it/s][A[A

  0%|          | 0/3 [00:00<?, ?it/s][A[A

100%|██████████| 3/3 [00:00<00:00, 861.78it/s][A[A
100%|██████████| 2/2 [00:00<00:00, 149.84it/s][A
  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [00:00<00:00, 645.67it/s][A[A

  0%|          | 0/3 [00:00<?, ?it/s][A[A

100%|██████████| 3/3 [00:00<00:00, 678.40it/s][A[A
 40%|████      | 2/5 [00:00<00:00, 15.21it/s]][A
  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [00:00<00:00, 708.50it/s][A[A

  0%|          | 0/3 [00:00<?, ?it/s][A[A

100%|██████████| 3/3 [00:00<00:00, 553.44it/s][A[A
100%|██████████| 2/2 [00:00<00:00, 140.98it/s][A
  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|███

Training amount: 1
Training amount: 3
Inferring amount: 1
Inferring amount: 3
1
              max: 0.58
  threshold (0.5): 0.26
  threshold (0.8): 0.04

3
              max: 0.7
  threshold (0.5): 0.55
  threshold (0.8): 0.23



In [11]:
# Show the combined accuracy table (one row per experiment and trial).
df

Unnamed: 0,>0.5,>0.8,experiment,max,trial
0,0.26,0.04,1,0.58,0
1,0.55,0.23,3,0.7,0


In [12]:
# Summary statistics of the max-based accuracy across trials, per experiment.
df[["experiment", "max"]].groupby("experiment").describe()

Unnamed: 0_level_0,max,max,max,max,max,max,max,max
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
experiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,1.0,0.58,,0.58,0.58,0.58,0.58,0.58
3,1.0,0.7,,0.7,0.7,0.7,0.7,0.7
