In [1]:
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from simpletransformers.classification import ClassificationModel
import os
import pandas as pd
from sklearn import preprocessing

In [2]:
input_file = "./data/corpus.csv"
corpus = pd.read_csv("./data/corpus.csv", dtype="string")

In [3]:
corpus.head()

Unnamed: 0,NaturalLanguageID,ProgrammingLanguageID,Comment,ClassM,ClassA,y8,y6,y2
0,EN,C,"#include ""map_in_map.h""",code,code,code,code,non-functional
1,EN,C,"- offsetof(struct bpf_array, value);",code,,code,code,non-functional
2,EN,C,array->index_mask = index_mask;,code,,code,code,non-functional
3,EN,C,"bpf_map_charge_move(&array->map.memory, &mem);",code,,code,code,non-functional
4,EN,C,array->elem_size = elem_size;,code,,code,code,non-functional


### SR data

In [4]:
SR_data = corpus[corpus.NaturalLanguageID == "SR"]
# Remove IDE to be consistent with other models.
SR_data = SR_data[SR_data.y8 != "ide"]

### EN data

In [5]:
EN_data = corpus[corpus.NaturalLanguageID == "EN"]

### Evaluation

In [6]:
def make_score_name(lang_name, score_name, model_name, num_classes):
    return f"{lang_name}-{score_name}-{model_name}-{num_classes}"

In [7]:
def f1_macro_score(y, y_pred):
    return f1_score(y, y_pred, average="macro")

In [8]:
def write_results(result_file, score_name, score_value):
    pd.DataFrame(
        {"score_name": [score_name],
        "score_value": [score_value]}
    ).to_csv(result_file, mode="a", decimal=",", header=False, index=False)

In [17]:
def evaluate_lang(data_train, data_test, test_lang_name, result_file, model_type, model_name, num_epochs, y_column_names=["y8","y6", "y2"], seeds=[11, 17, 23, 47, 62]):
    parameter_dict = {}
    parameter_dict["fp16"] = False
    parameter_dict["overwrite_output_dir"] = True
    parameter_dict["reprocess_input_data"] = True
    parameter_dict["no_cache"] = True
    parameter_dict["save_eval_checkpoints"] = False
    parameter_dict["save_model_every_epoch"] = False
    parameter_dict["use_cached_eval_features"] = False
    parameter_dict["output_dir"] = f"./Transformers/{model_name}/outputs/"
    parameter_dict["cache_dir"] = f"./Transformers/{model_name}/cache/"
    parameter_dict["tensorboard_dir"] = f"./Transformers/{model_name}/runs/"
    parameter_dict["silent"] = True
    parameter_dict["num_train_epochs"] = num_epochs
    parameter_dict["max_seq_length"] = 512

    X_train = data_train["Comment"].astype(str)
    X_test = data_test["Comment"].astype(str)

    for y_column_name in y_column_names:
        seed_scores = []
        
        for manual_seed in seeds:
            parameter_dict["manual_seed"] = manual_seed

            y_train = data_train[y_column_name]
            y_test = data_test[y_column_name]

            num_classes = y_test.nunique()

            score_name = make_score_name(test_lang_name, f"epochs{num_epochs}", model_name, num_classes)
            print(f"-------------------RUNNING {score_name}-seed{manual_seed} with {num_classes} classes.-------------------")

            print("X_train shape", X_train.shape, "y_train shape", y_train.shape)

            train_df = pd.DataFrame(list(zip(X_train, y_train)), columns=['text', 'labels'])
            eval_df = pd.DataFrame(list(zip(X_test, y_test)), columns=['text', 'labels'])

            # Create model.
            model = ClassificationModel(model_type, model_name, num_labels=y_train.nunique(), use_cuda=False, args=parameter_dict)  # You can set class weights by using the optional weight argument
            # Train model.
            global_step, training_details = model.train_model(train_df, show_running_loss=False, verbose=False)
            print(global_step, training_details)
            # Evaluate model.
            print(f"-------------------EVALUATE model-------------------")
            result, y_pred, wrong_predictions = model.eval_model(eval_df, f1=f1_macro_score, verbose=False)

            # Get results.
            print("RESULT ", result)
            macro_f1 = result["f1"]
            seed_scores.append(macro_f1)

            # Write result.
            write_results(result_file, f"0-seed{manual_seed}-{score_name}", macro_f1)
        
        # Write mean result for all seeds.
        seeds_means_f1_score = sum(seed_scores) / len(seed_scores)
        write_results(result_file, f"0-mean-{score_name}", seeds_means_f1_score)

In [18]:
import warnings
warnings.filterwarnings(action='ignore')

## Monolingual SR

In [19]:
result_file = "./results/transformers_SR_per_language.csv"

In [20]:
for lang_name in ['C', 'C++', 'C#', 'Java', 'TypeScript', 'Python', 'SQL']:
    train_data = SR_data[SR_data.ProgrammingLanguageID != lang_name]
    test_data = SR_data[SR_data.ProgrammingLanguageID == lang_name]
    print(lang_name, train_data.shape, test_data.shape, train_data.shape[0]+test_data.shape[0])

    evaluate_lang(train_data, test_data, lang_name, result_file, model_type="electra", model_name="classla/bcms-bertic", num_epochs=1)
    evaluate_lang(train_data, test_data, lang_name, result_file, model_type="electra", model_name="classla/bcms-bertic", num_epochs=3)
    evaluate_lang(train_data, test_data, lang_name, result_file, model_type="electra", model_name="classla/bcms-bertic", num_epochs=5)

C (4320, 8) (714, 8) 5034
-------------------RUNNING C-epochs1-classla/bcms-bertic-7-seed11 with 7 classes.-------------------
X_train shape (4320,) y_train shape (4320,)


Downloading: 100%|██████████| 443M/443M [00:08<00:00, 52.1MB/s] 
Some weights of the model checkpoint at classla/bcms-bertic were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at classla/bcms-bertic and 

ValueError: too many dimensions 'str'

In [None]:
# for lang_name in ['C', 'C++', 'C#', 'Java', 'JavaScript', 'TypeScript', 'PHP', 'Python', 'SQL']: