# Download the results of the experiments

We download the results that we have used in the paper.

In [None]:
!pip install --upgrade --no-cache-dir gdown
!gdown 17grO477tqFbYT3ISJRn18vqoVD0dbjcD
!tar -xf AstProbing.tar.gz
!apt-get install tree

The `dataset` folder contains one folder per language. Each folder contains the splits (train, test, and valid), the full dataset, and a serialized vocabulary with the constituency and unary labels.

In [None]:
# Show the first lines of the `dataset` directory tree.
!tree dataset | head -n 7

The `runs` folder contains the output of each of the probing runs. In particular, the format of the run folders is `plm_lang_layer_dimension`. Each run folder contains a log, a serialized file with the test results, and the PyTorch model. Finally, if the run folder corresponds to a multilingual setting, its name starts with `multilingual`.



In [None]:
# Show the first lines of the `runs` directory tree.
!tree runs | head -n 12

Finally, the folder `finetuning_results` contains the results of each PLM on the code-related tasks.

In [None]:
# Print one of the fine-tuning result files as an example of their content.
!cat finetuning_results/code_search_go.json

# Reading and preprocessing the data

We load the results of the monolingual and multilingual probes into two pandas dataframes (`mono` and `multi`).

In [None]:
# Display name of each model, keyed by the identifier used in the run folder names.
ELEGANT_NAMES = {
    "codebert": "CodeBERT",
    "codebert-baseline": "CodeBERTrand",
    "codeberta": "CodeBERTa",
    "codet5": "CodeT5",
    "graphcodebert": "GraphCodeBERT",
    "roberta": "RoBERTa",
    "distilbert": "DistilBERT",
    "bert": "BERT",
    "distilroberta": "DistilRoBERTa",
}

# Programming languages covered by the results, in the order used for tables and plot axes.
LANGUAGES = ("python", "java", "ruby", "javascript", "go", "c", "csharp", "php")

In [None]:
import pandas as pd
import pickle
import glob
import os

def read_results_mono(run_dir='runs'):
    """Collect the test metrics of every monolingual probing run into a dataframe.

    Every ``<run_dir>/<run folder>/metrics.log`` file is read. Folders whose
    name contains ``multilingual`` are skipped; the remaining folder names are
    parsed as ``model_lang_layer_rank``.

    Args:
        run_dir: Directory that holds one sub-folder per run.

    Returns:
        A pandas DataFrame with one row per run and the columns ``model``,
        ``lang``, ``layer``, ``rank``, ``precision``, ``recall`` and ``f1``.
        Model identifiers are replaced by their display names from
        ``ELEGANT_NAMES``.

    Raises:
        ValueError: If a run folder name does not split into exactly four
            ``_``-separated parts, or if layer/rank are not integers.
        KeyError: If a metrics file lacks one of the ``test_*`` entries.
    """
    data = {'model': [], 'lang': [], 'layer': [], 'rank': [],
            'precision': [], 'recall': [], 'f1': []}
    # Sorted so that the row order does not depend on the filesystem listing order.
    for file in sorted(glob.glob(os.path.join(run_dir, '*', 'metrics.log'))):
        # basename/dirname instead of splitting on '/' keeps this independent
        # of the platform's path separator.
        parent = os.path.basename(os.path.dirname(file))
        if 'multilingual' in parent:
            continue
        model, lang, layer, rank = parent.split('_')
        # NOTE: unpickling can execute arbitrary code; only read run folders
        # that come from a trusted source.
        with open(file, 'rb') as f:
            results = pickle.load(f)
        data['model'].append(model)
        data['lang'].append(lang)
        data['layer'].append(int(layer))
        data['rank'].append(int(rank))
        data['precision'].append(results['test_precision'])
        data['recall'].append(results['test_recall'])
        data['f1'].append(results['test_f1'])
    df = pd.DataFrame(data)
    # Restrict the renaming to the model column so that a value in another
    # column that happens to equal a model identifier is left untouched.
    df_renamed = df.replace({'model': ELEGANT_NAMES})
    return df_renamed

def read_results_multi(run_dir='runs'):
    """Collect the per-language test metrics of every multilingual probing run.

    Every ``<run_dir>/<run folder>/metrics.log`` file is read. Only folders
    whose name contains ``multilingual`` are used; their name is split into
    two ``_``-separated parts and the second part is taken as the model name.

    Args:
        run_dir: Directory that holds one sub-folder per run.

    Returns:
        A pandas DataFrame with one row per (run, language in ``LANGUAGES``)
        and the columns ``model``, ``lang``, ``recall``, ``f1`` and
        ``precision``.

    Raises:
        ValueError: If a multilingual folder name does not split into exactly
            two ``_``-separated parts.
        KeyError: If a metrics file lacks a ``test_*_<lang>`` entry.
    """
    data = {'model': [], 'lang': [], 'recall': [], 'f1': [], 'precision': []}
    # Sorted so that the row order does not depend on the filesystem listing order.
    for file in sorted(glob.glob(os.path.join(run_dir, '*', 'metrics.log'))):
        # basename/dirname instead of splitting on '/' keeps this independent
        # of the platform's path separator.
        parent = os.path.basename(os.path.dirname(file))
        if 'multilingual' not in parent:
            continue
        _, model = parent.split('_')
        # NOTE: unpickling can execute arbitrary code; only read run folders
        # that come from a trusted source.
        with open(file, 'rb') as f:
            results = pickle.load(f)
        for lang in LANGUAGES:
            data['model'].append(model)
            data['lang'].append(lang)
            data['precision'].append(results[f'test_precision_{lang}'])
            data['recall'].append(results[f'test_recall_{lang}'])
            data['f1'].append(results[f'test_f1_{lang}'])
    df = pd.DataFrame(data)
    return df

In [None]:
# Load both result tables from the extracted `runs` folder.
mono = read_results_mono(run_dir='runs')
multi = read_results_multi(run_dir='runs')

The dataframe `mono` contains the results of the monolingual probe for each model, layer, and programming language.

In [None]:
# Preview the first rows of the monolingual results.
mono.head(5)

The dataframe `multi` contains the results of the multilingual probe for each model and programming language. The multilingual probe was run over the representative layer of the PLM (i.e., the layer that achieved the best average F1 score).

In [None]:
# Preview the first rows of the multilingual results.
multi.head(5)

# Computing the syntactic layer for each model

We now compute the layer that achieves the best average F1 score for each model (`best_layer_mono` dataframe).

In [None]:
# This selects the representative layer
def best_layer_for_each_model(results):
    """Select, for every model, the layer with the highest mean F1.

    Args:
        results: DataFrame with at least the columns ``model``, ``layer`` and
            ``f1``.

    Returns:
        DataFrame indexed by model with the columns ``model``, ``layer`` and
        ``f1``, where ``f1`` is the mean over all rows of that model/layer
        pair. If several layers of a model share the maximum, all of them are
        kept.
    """
    group_by_model = results.groupby(['model', 'layer'])['f1'].mean().reset_index()
    # Compare each row with its model's maximum through transform() rather than
    # groupby().apply(): apply() operating on the grouping column is deprecated
    # since pandas 2.2, and without that column callers that filter on
    # `model` would break.
    best_f1 = group_by_model.groupby('model')['f1'].transform('max')
    best_layer_per_model = group_by_model[group_by_model['f1'] == best_f1]
    # Keep `model` both as index and as column, as callers index by column.
    return best_layer_per_model.set_index('model', drop=False)

In [None]:
# Representative layer of each model according to the monolingual results.
best_layer_mono = best_layer_for_each_model(mono)
best_layer_mono.head(10)

# Plots layer vs F1

Given a programming language, we plot the layer vs F1 score for each model.

In [None]:
from plotnine import *

# Language whose curves are drawn; must be one of LANGUAGES.
lang = 'python'
assert lang in LANGUAGES

# One line per model: F1 of the monolingual probe as a function of the layer.
layer_vs_f1_lang = (
  ggplot(mono[(mono['lang'] == lang)])
  + aes(x="layer", y="f1", color='model')
  + geom_line()
  + scale_x_continuous(breaks=range(0, 13, 1))
  + labs(x="Layer", y="F1", color="Model")
  # The complete theme goes first: added after theme(...), it would replace
  # the text-size override instead of being refined by it.
  + theme_light()
  + theme(text=element_text(size=16))
)
layer_vs_f1_lang

Given a PLM, we plot the layer vs F1 for each language.

In [None]:
# Model whose curves are drawn; must be one of the display names in ELEGANT_NAMES.
model = 'GraphCodeBERT'
assert model in ELEGANT_NAMES.values()

# One line per language: F1 of the monolingual probe as a function of the layer.
layer_vs_f1_model = (
  ggplot(mono[(mono['model'] == model)])
  + aes(x="layer", y="f1", color='lang')
  + geom_line()
  + scale_x_continuous(breaks=range(0, 13, 1))
  + labs(x="Layer", y="F1", color="Lang")
  # The complete theme goes first: added after theme(...), it would replace
  # the text-size override instead of being refined by it.
  + theme_light()
  + theme(text=element_text(size=16))
)
layer_vs_f1_model

# Correlations

We now compute the (Spearman) correlations of the multilingual AST-Probe and the performances on downstream tasks.

In [None]:
import scipy.stats as stats

def compute_correlation(rq_dataframe, results_finetuning, task_name, verbose=True):
    """Spearman correlation between probe F1 and downstream task performance.

    Args:
        rq_dataframe: DataFrame with the columns ``model``, ``lang`` and ``f1``.
        results_finetuning: Mapping with the keys ``input_lang`` (list of
            languages), ``model`` and ``performance`` (parallel lists).
        task_name: Name of the task; only used in the printed report.
        verbose: When True, print the matched models, their F1 scores and
            performances, and the full correlation result.

    Returns:
        The Spearman correlation coefficient computed over the models present
        in both inputs.
    """
    input_lang = results_finetuning["input_lang"]
    results_finetuning_pd = pd.DataFrame.from_dict({"model": results_finetuning["model"],
                                                    "performance": results_finetuning["performance"]})
    if len(input_lang) == 1:
        rq2_dataframe_lang = rq_dataframe[rq_dataframe["lang"] == input_lang[0]]
    else:
        # Several input languages: use the mean F1 over those languages per model.
        rq2_dataframe_lang = rq_dataframe[rq_dataframe["lang"].isin(input_lang)] \
            .groupby(['model'])['f1'].mean().reset_index()

    # Inner join: models missing from either side are left out of the correlation.
    df_cd = pd.merge(results_finetuning_pd, rq2_dataframe_lang, how='inner', on='model')
    f1s = list(df_cd["f1"])
    performances = list(df_cd["performance"])
    models = list(df_cd["model"])
    # Computed once and reused for both the report and the return value.
    spearman = stats.spearmanr(f1s, performances)
    if verbose:
        print(f'Task: {task_name}')
        print(f'Models: {models}')
        print(f'F1s: {f1s}')
        print(f'Performances: {performances}')
        print(f'Correlation: {spearman}')
    return spearman.correlation

In [None]:
import json

# Example: correlation for a single task (defect prediction), printing the details.
finetuning_results_path = 'finetuning_results/defect_prediction.json'
with open(finetuning_results_path) as json_file:
  results_finetuning = json.load(json_file)
_ = compute_correlation(multi, results_finetuning, 'defect prediction')

For each task and input programming language, the correlations are the following:

In [None]:
import glob
import os

# Correlations collected per task family so that they can be averaged afterwards.
code2nl = []
code_search = []
code2code = []
# Sorted so that the printed order is the same on every filesystem.
for file in sorted(glob.glob('finetuning_results/*.json')):
  with open(file) as json_file:
    results_finetuning = json.load(json_file)
  # The task name is the file name without its extension; splitext keeps
  # working for file names that contain additional dots.
  task = os.path.splitext(os.path.basename(file))[0]
  corr = compute_correlation(multi, results_finetuning, task, False)
  if 'code2nl' in task:
    code2nl.append(corr)
  elif 'code_search' in task:
    code_search.append(corr)
  elif task == 'java_to_csharp' or task == 'csharp_to_java':
    code2code.append(corr)
  print(f'Correlation for {task}: {corr}')

The average correlations are:

In [None]:
import numpy as np
# Average correlation over the tasks whose name contains 'code2nl'.
np.mean(code2nl)

In [None]:
# Average correlation over the tasks whose name contains 'code_search'.
np.mean(code_search)

In [None]:
# Average correlation over the two translation tasks (java_to_csharp, csharp_to_java).
np.mean(code2code)

The scatter plots (Performance vs F1) can be drawn using the following snippet. You can modify `finetuning_results_path` to plot a different task.

In [None]:
from scipy.stats import rankdata

def scatter_plot_correlations(rq_dataframe, results_finetuning, rank=True):
  """Scatter plot of probe F1 against downstream performance with a linear fit.

  Args:
    rq_dataframe: DataFrame with the columns ``model``, ``lang`` and ``f1``.
    results_finetuning: Mapping with the keys ``input_lang`` (list of
      languages), ``model`` and ``performance`` (parallel lists).
    rank: When True, both F1 and performance are replaced by their ordinal
      ranks before plotting.

  Returns:
    The plotnine plot object.
  """
  input_lang = results_finetuning["input_lang"]
  results_finetuning_pd = pd.DataFrame.from_dict({"model": results_finetuning["model"],
                                                  "performance": results_finetuning["performance"]})
  if len(input_lang) == 1:
    rq2_dataframe_lang = rq_dataframe[rq_dataframe["lang"] == input_lang[0]]
  else:
    # Several input languages: use the mean F1 over those languages per model.
    rq2_dataframe_lang = rq_dataframe[rq_dataframe["lang"].isin(input_lang)] \
            .groupby(['model'])['f1'].mean().reset_index()

  # Inner join: only models present on both sides are plotted.
  df_cd = pd.merge(results_finetuning_pd, rq2_dataframe_lang, how='inner', on='model')
  f1s = np.array(df_cd["f1"])
  performances = np.array(df_cd["performance"])
  if rank:
    f1s = rankdata(f1s, method='ordinal')
    performances = rankdata(performances, method='ordinal')
  p_dict = {"F1": f1s, "Performance": performances}
  df = pd.DataFrame(p_dict)
  scatter_plot = (
    ggplot(df)
    + aes(x="Performance", y="F1")
    + geom_point()
    + geom_smooth(method='lm', se=True)
    # The complete theme goes first: added after theme(...), it would replace
    # the text-size override instead of being refined by it.
    + theme_light()
    + theme(text=element_text(size=16))
    + labs(title="", x="Rank Performance" if rank else "Performance",
           y="Rank F1" if rank else "F1")
  )
  return scatter_plot
  
# Example: raw (unranked) scatter plot for the Go code-search task.
finetuning_results_path = 'finetuning_results/code_search_go.json'
with open(finetuning_results_path) as json_file:
  results_finetuning = json.load(json_file)
scatter_plot_correlations(multi, results_finetuning, rank=False)

# Visualization

We now reproduce the visualization techniques presented in the paper.

## Angle between subspaces

To visualize the angles between subspaces, the following code loads the projections and computes the angle between each pair of programming languages.

In [None]:
import torch
from prettytable import PrettyTable
from scipy.linalg import subspace_angles

def load_vectors(run_folder):
    """Load the probe parameters stored in ``<run_folder>/pytorch_model.bin``.

    Args:
        run_folder: Folder of a single probing run.

    Returns:
        A tuple ``(vectors_c, vectors_u, proj)`` of numpy arrays read from the
        entries of the same name; ``vectors_c`` and ``vectors_u`` are
        returned transposed, ``proj`` as stored.
    """
    checkpoint_path = os.path.join(run_folder, 'pytorch_model.bin')
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))

    def as_numpy(key):
        # Move to CPU and detach so the tensor can be converted to numpy.
        return state[key].cpu().detach().numpy()

    return as_numpy('vectors_c').T, as_numpy('vectors_u').T, as_numpy('proj')
  
def compute_angle_model(best_layer_per_model, model, rank=128, run_dir='runs'):
    """Mean angle between the probe subspaces of every pair of languages.

    For the given model, the projection of the monolingual run at the model's
    best layer is loaded for each language in ``LANGUAGES``. For each pair of
    languages, the mean of ``scipy.linalg.subspace_angles`` between the two
    projections is converted to degrees.

    Args:
        best_layer_per_model: DataFrame with the columns ``model`` (display
            name) and ``layer``; the first matching row gives the layer.
        model: Model identifier, a key of ``ELEGANT_NAMES``.
        rank: Rank suffix of the run folder names (``model_lang_layer_rank``).
        run_dir: Directory that holds the run folders.

    Returns:
        A tuple ``(df, table)``. ``df`` has one row per unordered pair of
        distinct languages with the columns ``lang1``, ``lang2``, ``angle``,
        ``text`` (angle rounded to two decimals, as a string) and ``model``.
        ``table`` is a PrettyTable with the full language-by-language matrix
        of rounded angles.
    """
    layer = best_layer_per_model[best_layer_per_model['model'] == ELEGANT_NAMES[model]].layer.values[0]
    subspaces = {}
    for lang in LANGUAGES:
        name_folder = '_'.join([model, lang, str(layer), str(rank)])
        run_folder = os.path.join(run_dir, name_folder)
        _, _, proj = load_vectors(run_folder)
        subspaces[lang] = proj

    table_sim_ang = PrettyTable()
    table_sim_ang.field_names = ["----"] + list(LANGUAGES)

    data = {'lang1': [], 'lang2': [], 'angle': [], 'text': [], 'model': []}
    for i, x in enumerate(LANGUAGES):
        row_ang = [x]
        for j, y in enumerate(LANGUAGES):
            subspace_sim_ang = np.rad2deg(np.mean(subspace_angles(subspaces[x], subspaces[y])))
            rounded_ang = round(subspace_sim_ang, 2)
            # The text table shows the full matrix, diagonal included.
            row_ang.append(rounded_ang)
            # The dataframe keeps each pair of distinct languages only once.
            if x != y and j < i:
                data['lang1'].append(x)
                data['lang2'].append(y)
                data['angle'].append(subspace_sim_ang)
                data['text'].append(str(rounded_ang))
                data['model'].append(model)
        table_sim_ang.add_row(row_ang)

    df = pd.DataFrame.from_dict(data)
    return df, table_sim_ang

In [None]:
# Model identifier (a key of ELEGANT_NAMES) whose language subspaces are compared.
model = 'codebert'
assert model in ELEGANT_NAMES

df, pt = compute_angle_model(best_layer_mono, model)
# print(pt)  # uncomment to print the angles as a text table
# Heatmap of the pairwise angles; each tile is annotated with the rounded angle.
angles_p9 = (
            ggplot(mapping=aes("lang1", "lang2", fill="angle"),
                   data=df)
            + geom_tile() + geom_label(aes(label="text"), fill="white", size=10)
            + scale_fill_distiller()
            + theme_minimal()
            # Both axes list the languages in the order of LANGUAGES.
            + scale_x_discrete(limits=LANGUAGES)
            + scale_y_discrete(limits=LANGUAGES)
            + labs(title="", x="", y="", fill="angle\n")
            + theme(axis_text_x=element_text(rotation=45, hjust=1, size=12),
                    axis_text_y=element_text(size=12),
                    legend_title=element_text(size=12),
                    legend_title_align='center')
    )
angles_p9

## Visualization of constituency labels

In [None]:
from sklearn.manifold import TSNE

def load_labels(run_folder):
    """Load the label vocabularies pickled in a run folder.

    Reads ``global_labels_c.pkl`` and ``global_labels_u.pkl`` from
    ``run_folder``; each is expected to hold a mapping from label to id.

    Args:
        run_folder: Folder of a single probing run.

    Returns:
        A tuple ``(labels_to_ids_c, ids_to_labels_c, labels_to_ids_u,
        ids_to_labels_u)``: the two loaded mappings, each followed by its
        inverse (id to label).
    """
    def read_mapping(file_name):
        with open(os.path.join(run_folder, file_name), 'rb') as handle:
            return pickle.load(handle)

    labels_to_ids_c = read_mapping('global_labels_c.pkl')
    labels_to_ids_u = read_mapping('global_labels_u.pkl')
    ids_to_labels_c = {idx: label for label, idx in labels_to_ids_c.items()}
    ids_to_labels_u = {idx: label for label, idx in labels_to_ids_u.items()}
    return labels_to_ids_c, ids_to_labels_c, labels_to_ids_u, ids_to_labels_u

def run_tsne(vectors, ids_to_labels, perplexity=30, type_labels='constituency', seed=123):
    """Project label vectors to 2D with t-SNE and plot them coloured by language.

    Args:
        vectors: Array with one row per label, passed to ``TSNE.fit_transform``.
        ids_to_labels: Mapping from id (0 .. n-1) to a label of the form
            ``<constituent>--<language>``.
        perplexity: Perplexity passed to t-SNE.
        type_labels: Not used by this function.
        seed: Random state passed to t-SNE.

    Returns:
        The plotnine scatter plot of the 2D embedding.
    """
    embedder = TSNE(n_components=2, learning_rate='auto', perplexity=perplexity,
                    init='random', random_state=seed)
    points = pd.DataFrame(embedder.fit_transform(vectors), columns=['tsne1', 'tsne2'])

    # Labels are looked up by consecutive id and split once into
    # [constituent, language].
    label_parts = [ids_to_labels[ix].split('--') for ix in range(len(ids_to_labels))]
    points['language'] = [parts[1] for parts in label_parts]
    points['constituency'] = [parts[0] for parts in label_parts]

    scatter_tsne = (
            ggplot(points, aes(x='tsne1', y='tsne2', color='language')) + geom_point()
            + labs(title="", x="", y="", color="Languages")
    )
    return scatter_tsne

In [None]:
# Display name of the model whose multilingual run is visualized.
model = 'CodeBERT'
assert model in ELEGANT_NAMES.values()

run_folder = f'runs/multilingual_{model}/'

# Load the label vectors and vocabularies, then plot the constituency vectors.
vectors_c, vectors_u, _ = load_vectors(run_folder)
labels_to_ids_c, ids_to_labels_c, labels_to_ids_u, ids_to_labels_u = load_labels(run_folder)
run_tsne(vectors_c, ids_to_labels_c, perplexity=30, type_labels='constituency', seed=123)

In [None]:
# Same visualization for the multilingual run of the CodeBERTrand baseline.
run_folder = 'runs/multilingual_CodeBERTrand-baseline/'
vectors_c, vectors_u, _ = load_vectors(run_folder)
labels_to_ids_c, ids_to_labels_c, labels_to_ids_u, ids_to_labels_u = load_labels(run_folder)
run_tsne(vectors_c, ids_to_labels_c, perplexity=30, type_labels='constituency', seed=123)