In [None]:
# default_exp eda.overlap

# EDA Overlap

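> Measures how close each test-set target is to the fine-tuning training targets (Levenshtein distance) and pairs the per-example minimum, median, and maximum distance with the model's score on that example.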

In [1]:
import gdown
import math

import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from pathlib import Path
from tqdm.auto import tqdm

In [2]:
#hide
from nbdev.showdoc import *

In [2]:
# Download the T5 extension dataset archive from Google Drive with gdown.
# gdown.extractall is passed as the postprocess hook of cached_download.
# data_dir is reused by later cells as the root for every input and output path.
data_dir = Path("/home/jovyan/data")
# NOTE: despite its name, output_dir is the path of the zip file itself, not a directory.
output_dir = data_dir / "t5-data.zip"
url = "https://drive.google.com/uc?id=1zKW5bOMjKHfX75d_uz8OF2teTb5Dse_M"
gdown.cached_download(url, str(output_dir), postprocess=gdown.extractall)

File exists: /home/jovyan/data/t5-data.zip


'/home/jovyan/data/t5-data.zip'

In [3]:
# Download the T5 extension test data (with model performance) from Google Drive with gdown.
# Relies on data_dir defined in the previous download cell; output_dir and url are rebound here.
# NOTE: despite its name, output_dir is the path of the zip file itself, not a directory.
output_dir = data_dir / "t5-data-performance.zip"
url = "https://drive.google.com/uc?id=1EFPcdIpl-uez4e0918Yk4hQrLRyGczxf"
gdown.cached_download(url, str(output_dir), postprocess=gdown.extractall)

File exists: /home/jovyan/data/t5-data-performance.zip


'/home/jovyan/data/t5-data-performance.zip'

In [4]:
# This code was taken from https://gist.github.com/kylebgorman/1081951/bce3de986e4b05fc0b63d4d9e0cfa4bde6664365
def _dist(A, B, insertion, deletion, substitution):
    """Build the edit-distance table between sequences ``A`` and ``B``.

    Args:
        A, B: indexable sequences (e.g. strings or lists) supporting ``len``.
        insertion, deletion, substitution: cost charged for each edit operation.

    Returns:
        A float ``numpy`` array of shape ``(len(A) + 1, len(B) + 1)`` where entry
        ``[i, j]`` is the cost of turning ``A[:i]`` into ``B[:j]``; the bottom-right
        entry is the distance between the full sequences.
    """
    D = np.zeros((len(A) + 1, len(B) + 1))
    # First column: dropping every element of a prefix of A.
    for i in range(len(A)):
        D[i + 1, 0] = D[i, 0] + deletion
    # First row: inserting every element of a prefix of B.
    for j in range(len(B)):
        D[0, j + 1] = D[0, j] + insertion
    # Interior cells.
    for i in range(len(A)):
        for j in range(len(B)):
            if A[i] == B[j]:
                # Matching elements cost nothing: carry the diagonal value over.
                D[i + 1, j + 1] = D[i, j]
            else:
                D[i + 1, j + 1] = min(D[i + 1, j] + insertion,
                                      D[i, j + 1] + deletion,
                                      D[i, j] + substitution)
    return D


def levenshtein_distance(l1, l2, normalize=False):
    """Compute the unit-cost Levenshtein distance between two sequences.

    Args:
        l1, l2: sequences to compare (strings, lists, ...).
        normalize: when true, return a similarity instead of a raw distance:
            ``1 - distance / max(len(l1), len(l2))``, so 1.0 means identical.

    Returns:
        The edit distance as a float, or the similarity described above
        when ``normalize`` is true.
    """
    dist = _dist(l1, l2, 1, 1, 1)[-1][-1]
    if not normalize:
        return dist

    longest = max(len(l1), len(l2))
    if longest == 0:
        # Two empty sequences are identical. Without this guard the division is
        # 0/0 on a numpy float, which yields NaN (plus a RuntimeWarning).
        return 1.
    return 1. - dist / longest

In [5]:
# export
# From: https://github.com/veekaybee/data/blob/master/samplesize.py
# SUPPORTED CONFIDENCE LEVELS: 50%, 68%, 90%, 95%, and 99%
# (confidence level in percent, number of standard deviations) pairs,
# kept at the two-decimal precision of the original table.
confidence_level_constant = [50,.67], [68,.99], [90,1.64], [95,1.96], [99,2.57]


# CALCULATE THE SAMPLE SIZE
def sample_size(population_size, confidence_level, confidence_interval):
    """Return the sample size needed for a finite population.

    Args:
        population_size: number of items in the population.
        confidence_level: one of the levels listed in ``confidence_level_constant``
            (50, 68, 90, 95 or 99); ints and equal floats are both accepted.
        confidence_interval: margin of error in percent (e.g. ``5`` for +/-5%).

    Returns:
        The sample size rounded up to an int, or ``-1`` when the confidence level
        is not supported. An empty (or negative) population yields ``0``; a zero
        margin of error yields the whole population, which is the limit of the
        formula as the margin shrinks.
    """
    # Dict lookup instead of scanning the table; on duplicate levels the last
    # entry wins, same as a scan without an early exit.
    Z = dict(confidence_level_constant).get(confidence_level, 0.0)
    if Z == 0.0:
        return -1

    N = population_size
    if N <= 0:
        # Nothing to sample from; also avoids dividing by zero below.
        return 0

    p = 0.5
    e = confidence_interval / 100.0
    if e == 0:
        # A zero margin of error can only be met by taking every item.
        return int(math.ceil(N))

    # CALC SAMPLE SIZE
    n_0 = ((Z**2) * p * (1-p)) / (e**2)

    # ADJUST SAMPLE SIZE FOR FINITE POPULATION
    n = n_0 / (1 + ((n_0 - 1) / float(N)))

    return int(math.ceil(n))  # THE SAMPLE SIZE

In [6]:
def _distances_to_training(tst_trgt, trn_trgts):
    """Return the Levenshtein distance from one test target to each training target."""
    return [levenshtein_distance(tst_trgt, trn_trgt) for trn_trgt in trn_trgts]


class Dataset:
    """A training set / scored test set pair used to measure target overlap.

    ``__init__`` sets:
        trn_path, tst_path: the paths given by the caller.
        trn_df: training data read as a header-less, tab-separated file with the
            columns ``input`` and ``target``.
        tst_df: test data read with the ``pd.read_csv`` defaults;
            ``compute_overlap`` reads its ``Groundtruth`` column and its last column.
    """

    def __init__(self, trn_path, tst_path) -> None:
        self.trn_path = trn_path
        self.tst_path = tst_path

        self.trn_df = pd.read_csv(self.trn_path, names=["input", "target"], sep="\t")
        self.tst_df = pd.read_csv(self.tst_path)

    def compute_overlap(self, parallelize=True, sample=True, n_jobs=96, random_state=None):
        """Compare every test target against the training targets.

        Args:
            parallelize: compute the distances with joblib when true, otherwise
                in a plain loop in this process.
            sample: when true, compare against a random sample of the training
                rows whose size comes from ``sample_size`` (95% confidence level,
                5% interval) instead of the full training set.
            n_jobs: number of joblib workers for the parallel path.
            random_state: forwarded to ``DataFrame.sample`` so the sample can be
                reproduced; ``None`` keeps the draw unseeded.

        Returns:
            A DataFrame with one row per test example and the columns
            ``min_dist``, ``median_dist``, ``max_dist`` and ``score``.

        Side effects:
            Sets ``trn_sample_df`` (the training rows actually compared),
            ``all_dists`` (array of shape ``(n_test, n_train_used)``),
            ``min_dists``, ``median_dists`` and ``max_dists``. ``trn_df`` is left
            untouched, so repeated calls always sample from the full training set.
        """
        trn_df = self.trn_df
        if sample:
            size = sample_size(len(trn_df), confidence_level=95., confidence_interval=5.)
            print(f"Sample size: {size}")
            trn_df = trn_df.sample(size, random_state=random_state)
        self.trn_sample_df = trn_df

        trn_trgts = trn_df.target.values
        tst_trgts = self.tst_df.Groundtruth.values

        if parallelize:
            # One task per test target (a whole row of distances) rather than one
            # task per pair: per-task dispatch overhead would otherwise dominate.
            rows = Parallel(n_jobs=n_jobs)(
                delayed(_distances_to_training)(tst_trgt, trn_trgts)
                for tst_trgt in tqdm(tst_trgts)
            )
        else:
            rows = [_distances_to_training(tst_trgt, trn_trgts) for tst_trgt in tqdm(tst_trgts)]

        # Same array type and shape from both paths; the reshape keeps the result
        # two-dimensional even when there are no test rows.
        self.all_dists = np.array(rows, dtype=float).reshape((len(tst_trgts), len(trn_trgts)))

        self.min_dists = np.min(self.all_dists, axis=1)
        self.median_dists = np.median(self.all_dists, axis=1)
        self.max_dists = np.max(self.all_dists, axis=1)

        # The score is taken from the last column of the test file.
        # NOTE(review): assumes every performance CSV keeps its metric last - confirm.
        score_column = self.tst_df.columns[-1]
        df = pd.DataFrame({
            "min_dist": self.min_dists,
            "median_dist": self.median_dists,
            "max_dist": self.max_dists,
            "score": self.tst_df[score_column].values
        })
        return df

In [7]:
finetune_ds_dir = data_dir / "datasets/tsv/fine-tuning"
performance_ds_dir = data_dir / "DataSnooping_Analysis_Data"

# Fine-tuning folders whose performance CSV is stored under a different name.
performance_names = {"MG": "Mutants", "CS": "CodeSummarization"}

dfs = []
# Sorted so the datasets are processed (and concatenated) in a reproducible order;
# non-directories are skipped because each dataset lives in its own folder.
# The loop variable is not called `dir`, which would rebind the builtin for the whole kernel.
for dataset_dir in sorted(p for p in finetune_ds_dir.glob("*") if p.is_dir()):
    print(dataset_dir.name)
    trn_path = dataset_dir / "training.tsv"
    dataset_name = performance_names.get(dataset_dir.name, dataset_dir.name)
    tst_path = performance_ds_dir / f"{dataset_name}.csv"

    df = Dataset(trn_path, tst_path).compute_overlap(parallelize=True, sample=True)
    df["dataset_name"] = dataset_name
    dfs.append(df)

# Stack the per-dataset results and persist them next to the input data.
df = pd.concat(dfs)
df.to_csv(data_dir / "t5-data-performance-overlap.csv", index=False)

AGraw
Sample size: 384


  0%|          | 0/18815 [00:00<?, ?it/s]

AGabs
Sample size: 383


  0%|          | 0/15810 [00:00<?, ?it/s]

BFsmall
Sample size: 382


  0%|          | 0/5835 [00:00<?, ?it/s]

MG
Sample size: 383


  0%|          | 0/11559 [00:00<?, ?it/s]

CS
Sample size: 385


  0%|          | 0/90908 [00:00<?, ?it/s]

BFmedium
Sample size: 382


  0%|          | 0/6545 [00:00<?, ?it/s]

In [64]:
%%time
finetune_ds_dir = data_dir / "datasets/tsv/fine-tuning"
performance_ds_dir = data_dir / "DataSnooping_Analysis_Data"
# Named n_samples rather than sample_size: assigning an int to the global name
# sample_size would overwrite the sample_size() function that
# Dataset.compute_overlap calls.
n_samples = 100

# Fine-tuning folders whose performance CSV is stored under a different name.
performance_names = {"MG": "Mutants", "CS": "CodeSummarization"}

dfs = []
# Sorted for a reproducible order; non-directories are skipped. The loop variable
# is not called `dir`, which would rebind the builtin for the whole kernel.
for dataset_dir in sorted(p for p in finetune_ds_dir.glob("*") if p.is_dir()):
    print(dataset_dir.name)
    trn_path = dataset_dir / "training.tsv"
    dataset_name = performance_names.get(dataset_dir.name, dataset_dir.name)
    tst_path = performance_ds_dir / f"{dataset_name}.csv"

    # NOTE(review): the Dataset.compute_overlap defined in this notebook has no
    # `sample_size` keyword (it takes `sample`); this timing run presumably used a
    # different revision of the class - confirm before re-running.
    df = Dataset(trn_path, tst_path).compute_overlap(parallelize=True, sample_size=n_samples)
    df["dataset_name"] = dataset_name
    dfs.append(df)

df = pd.concat(dfs)

AGraw


  0%|          | 0/100 [00:00<?, ?it/s]

AGabs


  0%|          | 0/100 [00:00<?, ?it/s]

BFsmall


  0%|          | 0/100 [00:00<?, ?it/s]

MG


  0%|          | 0/100 [00:00<?, ?it/s]

CS


  0%|          | 0/100 [00:00<?, ?it/s]

BFmedium


  0%|          | 0/100 [00:00<?, ?it/s]

CPU times: user 23.5 s, sys: 1.73 s, total: 25.2 s
Wall time: 14min 6s


In [65]:
%%time
finetune_ds_dir = data_dir / "datasets/tsv/fine-tuning"
performance_ds_dir = data_dir / "DataSnooping_Analysis_Data"
# Named n_samples rather than sample_size: assigning an int to the global name
# sample_size would overwrite the sample_size() function that
# Dataset.compute_overlap calls.
n_samples = 100

# Fine-tuning folders whose performance CSV is stored under a different name.
performance_names = {"MG": "Mutants", "CS": "CodeSummarization"}

dfs = []
# Sorted for a reproducible order; non-directories are skipped. The loop variable
# is not called `dir`, which would rebind the builtin for the whole kernel.
for dataset_dir in sorted(p for p in finetune_ds_dir.glob("*") if p.is_dir()):
    print(dataset_dir.name)
    trn_path = dataset_dir / "training.tsv"
    dataset_name = performance_names.get(dataset_dir.name, dataset_dir.name)
    tst_path = performance_ds_dir / f"{dataset_name}.csv"

    # NOTE(review): the Dataset.compute_overlap defined in this notebook has no
    # `sample_size` keyword (it takes `sample`); this timing run presumably used a
    # different revision of the class - confirm before re-running.
    df = Dataset(trn_path, tst_path).compute_overlap(parallelize=False, sample_size=n_samples)
    df["dataset_name"] = dataset_name
    dfs.append(df)

df = pd.concat(dfs)

AGraw


  0%|          | 0/100 [00:00<?, ?it/s]

AGabs


  0%|          | 0/100 [00:00<?, ?it/s]

BFsmall


  0%|          | 0/100 [00:00<?, ?it/s]

MG


  0%|          | 0/100 [00:00<?, ?it/s]

CS


  0%|          | 0/100 [00:00<?, ?it/s]

BFmedium


  0%|          | 0/100 [00:00<?, ?it/s]

CPU times: user 55min 3s, sys: 1.2 s, total: 55min 4s
Wall time: 55min 4s


In [42]:
# Peek at the first rows of the combined results that belong to the Mutants dataset.
df.loc[df["dataset_name"].eq("Mutants")].head()

Unnamed: 0,min_dist,median_dist,max_dist,score,dataset_name
0,47.0,72.5,158.0,0.433893,Mutants
1,139.0,157.5,171.0,1.0,Mutants
2,33.0,67.5,161.0,1.0,Mutants
3,78.0,89.5,136.0,0.911597,Mutants
4,61.0,71.5,162.0,0.972414,Mutants


In [36]:
# NOTE(review): `ds` is not defined in any of the cells shown here, and the
# Dataset.compute_overlap defined in this notebook has no `sample_size` keyword.
# This looks like a leftover from an earlier revision of the class - confirm,
# then update or remove the cell.
ds.compute_overlap(sample_size=10)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,min_dist,median_dist,max_dist,score
0,10.0,23.0,79.0,False
1,8.0,23.5,80.0,True
2,9.0,27.5,92.0,True
3,10.0,28.5,92.0,True
4,25.0,32.0,74.0,False
5,27.0,32.0,83.0,True
6,19.0,24.5,79.0,False
7,13.0,26.5,87.0,True
8,26.0,50.5,75.0,False
9,9.0,27.5,92.0,False
