In [1]:
! pip install gdown sentencepiece statsmodels

Collecting gdown
  Downloading gdown-4.2.0.tar.gz (13 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 7.1 MB/s 
Building wheels for collected packages: gdown
  Building wheel for gdown (PEP 517) ... [?25ldone
[?25h  Created wheel for gdown: filename=gdown-4.2.0-py3-none-any.whl size=14262 sha256=39533dfee45f1890b982179c80a6ec4f15cadc64dc908395e371228e612a4af1
  Stored in directory: /tmp/pip-ephem-wheel-cache-wup76ozl/wheels/2b/3c/51/52c46deda5cd1d59c6ce3d441ea5f3d155495dc294c4535a25
Successfully built gdown
Installing collected packages: sentencepiece, gdown
Successfully installed gdown-4.2.0 sentencepiece-0.1.96


In [None]:
! gdown https://drive.google.com/uc?id=1zKW5bOMjKHfX75d_uz8OF2teTb5Dse_M # datasets.zip
! gdown https://drive.google.com/uc?id=1EFPcdIpl-uez4e0918Yk4hQrLRyGczxf # DataSnooping_Analysis_Data.zip

In [3]:
import gdown

ids = ["1zKW5bOMjKHfX75d_uz8OF2teTb5Dse_M", "1EFPcdIpl-uez4e0918Yk4hQrLRyGczxf"]
for id in ids:
    url = "https://drive.google.com/uc?id=" + id
    gdown.cached_download(url, "/home/jovyan/data/", postprocess=gdown.extractall)

File exists: /home/jovyan/data/
File exists: /home/jovyan/data/


In [None]:
import pandas as pd

from pathlib import Path

fine_tune_path = Path("datasets/tsv/fine-tuning")

def get_agab_dfs():
    trn_agab_df = pd.read_csv(
        fine_tune_path/"AGabs/training.tsv", sep="\t", 
        names=["input", "target"]
    )
    tst_agab_df = pd.read_csv(
        "DataSnooping_Analysis_Data/AGabs.csv", index_col=0
    ).sort_values("IS_Perfect")

    return trn_agab_df, tst_agab_df

def get_agraw_dfs():
    trn_agraw_df = pd.read_csv(
        fine_tune_path/"AGraw/training.tsv", sep="\t",
        names=["input", "target"]
    )
    tst_agraw_df = pd.read_csv(
        "DataSnooping_Analysis_Data/AGraw.csv", index_col=0
    ).sort_values("IS_Perfect")

    return trn_agraw_df, tst_agraw_df

def get_bfsm_dfs():
    trn_bfsm_df = pd.read_csv(
        "datasets/tsv/fine-tuning/BFsmall/training.tsv", sep="\t",
        names=["input", "target"]
    )
    tst_bfsm_df = pd.read_csv(
        "DataSnooping_Analysis_Data/BFsmall.csv", index_col=0
    ).sort_values("IS_Perfect")

    return trn_bfsm_df, tst_bfsm_df

def get_bfmed_dfs():
    trn_bfmed_df = pd.read_csv(
        "datasets/tsv/fine-tuning/BFmedium/training.tsv", sep="\t",
        names=["input", "target"]
    )
    tst_bfmed_df = pd.read_csv(
        "DataSnooping_Analysis_Data/BFmedium.csv", index_col=0
    ).sort_values("IS_Perfect")

    return trn_bfmed_df, tst_bfmed_df

def get_codesum_dfs():
    trn_codesum_df = pd.read_csv(
        "datasets/tsv/fine-tuning/CS/training.tsv", sep="\t",
        names=["input", "target"]
    )
    tst_codesum_df = pd.read_csv(
        "DataSnooping_Analysis_Data/CodeSummarization.csv", index_col=0
    ).sort_values("BLEU")

    return trn_codesum_df, tst_codesum_df

def get_muts_dfs():
    trn_muts_df = pd.read_csv(
        "datasets/tsv/fine-tuning/MG/training.tsv", sep="\t",
        names=["input", "target"]
    )
    tst_muts_df = pd.read_csv(
        "DataSnooping_Analysis_Data/Mutants.csv", index_col=0
    ).sort_values("BLEU")

    return trn_muts_df, tst_muts_df

def sample_bst_wrst(df, pop=1_000, n=100):
    bst = df.tail(pop)
    wrst = df.head(pop)

    return bst.sample(n), wrst.sample(n)

In [None]:
# This code was taken from https://gist.github.com/kylebgorman/1081951/bce3de986e4b05fc0b63d4d9e0cfa4bde6664365
def _dist(A, B, insertion, deletion, substitution):
    D = np.zeros((len(A) + 1, len(B) + 1))
    for i in range(len(A)):
        D[i + 1][0] = D[i][0] + deletion
    for j in range(len(B)):
        D[0][j + 1] = D[0][j] + insertion
    for i in range(len(A)): # fill out middle of matrix
        for j in range(len(B)):
            if A[i] == B[j]:
                D[i + 1][j + 1] = D[i][j] # aka, it's free.
            else:
                D[i + 1][j + 1] = min(D[i + 1][j] + insertion,
                                      D[i][j + 1] + deletion,
                                      D[i][j]     + substitution)
    return D

def levenshtein_distance(l1, l2, normalize=False):
    dist = _dist(l1, l2, 1, 1, 1)[-1][-1]
    if normalize:
        return 1. - dist / max(len(l1), len(l2))
    else:
        return dist

In [None]:
def get_dists(trn, tst):
    import sentencepiece as spm
    s = spm.SentencePieceProcessor(model_file='dl4se_vocab.model')
    dists = Parallel(n_jobs=-1)(
        delayed(levenshtein_distance)(s.encode(i), s.encode(j))
        for i in trn for j in tst
    )
    
    return dists