In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from simpletransformers.classification import ClassificationModel
import pandas as pd
from sklearn import preprocessing

In [None]:
# Location of the corpus CSV; the load below reads from this single definition.
input_file = "./data/corpus.csv"
# Every column is read with the pandas "string" dtype.
corpus = pd.read_csv(input_file, dtype="string")

In [None]:
# Show the first five rows of the corpus as loaded.
corpus.head(n=5)

In [None]:
# Replace each label column with integer class ids.
# One fitted encoder is kept per column so that ids can be decoded again with
# label_encoders[column].inverse_transform(...); refitting a single encoder would
# keep only the mapping of the last column.
label_encoders = {}
for label_column in ("y2", "y6", "y8"):
    label_encoders[label_column] = preprocessing.LabelEncoder()
    # Assign the encoded array positionally instead of via an index-aligned Series.
    corpus[label_column] = label_encoders[label_column].fit_transform(corpus[label_column])

# `le` stays bound to the encoder fitted on y8, the last column encoded.
le = label_encoders["y8"]

In [None]:
# Show the first five rows again, now with y2 / y6 / y8 holding encoded labels.
corpus.head(n=5)

### SR data

In [None]:
# Keep only the rows whose NaturalLanguageID is "SR".
SR_data = corpus[corpus.NaturalLanguageID == "SR"]
# Remove IDE to be consistent with other models.
# NOTE(review): corpus["y8"] was overwritten with LabelEncoder output earlier in this
# notebook, so comparing it with the string "ide" presumably matches no row and this
# filter keeps everything - confirm. If the rows really must be dropped, filter before
# encoding (or compare against the encoded id) and check that evaluate() still gets a
# num_labels that covers every remaining class id.
SR_data = SR_data[SR_data.y8 != "ide"]

### EN data

In [None]:
# Keep only the rows whose NaturalLanguageID is "EN".
EN_data = corpus.loc[corpus["NaturalLanguageID"] == "EN"]

In [None]:
# corpus = corpus.drop(corpus[(corpus.NaturalLanguageID == "SR") & (corpus.y8 == "ide")].index)

### Evaluation

In [None]:
def make_score_name(score_name, model_name, num_classes):
    """Build a result label of the form ``<score_name>-<model_name>-<num_classes>``."""
    name_parts = (score_name, model_name, num_classes)
    return "-".join(format(part) for part in name_parts)

In [None]:
def f1_macro_score(y, y_pred):
    """Return the macro-averaged F1 score of predictions ``y_pred`` against labels ``y``."""
    macro_f1 = f1_score(y_true=y, y_pred=y_pred, average="macro")
    return macro_f1

In [None]:
def write_results(result_file, score_name, score_value):
    """Append one ``score_name, score_value`` row to a CSV results file.

    The row is written without header or index and with "," as the decimal mark.
    When ``result_file`` is a filesystem path, its parent directory is created
    first, so a missing results folder does not raise after a completed run.

    Args:
        result_file: Path (or any target accepted by ``DataFrame.to_csv``) to append to.
        score_name: Label stored in the first column.
        score_value: Value stored in the second column.
    """
    import os

    # Only path-like targets have a directory to create; buffers are passed through.
    if isinstance(result_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(result_file))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    row = pd.DataFrame({"score_name": [score_name], "score_value": [score_value]})
    row.to_csv(result_file, mode="a", decimal=",", header=False, index=False)

In [None]:
def _build_model_args(model_name, num_epochs):
    """Return the simpletransformers args shared by every run of one model/epoch setting."""
    model_dir = f"./Transformers/{model_name}"
    return {
        "fp16": False,
        "overwrite_output_dir": True,
        "reprocess_input_data": True,
        "no_cache": True,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": False,
        "use_cached_eval_features": False,
        "output_dir": f"{model_dir}/outputs/",
        "cache_dir": f"{model_dir}/cache/",
        "tensorboard_dir": f"{model_dir}/runs/",
        "silent": True,
        "num_train_epochs": num_epochs,
        "max_seq_length": 512,
    }


def _count_labels(y_train, y_test):
    """Return how many output labels the classifier needs for the given label columns."""
    distinct_train = y_train.nunique()
    both_integer = pd.api.types.is_integer_dtype(y_train) and pd.api.types.is_integer_dtype(y_test)
    if both_integer and len(y_train) > 0 and len(y_test) > 0:
        # Class ids may have gaps in a language subset (a class can be absent from
        # it), so the classifier must cover the highest id, not just the distinct count.
        highest_id = int(max(y_train.max(), y_test.max()))
        return max(distinct_train, highest_id + 1)
    # Non-integer or empty labels: keep the plain distinct-count behaviour.
    return distinct_train


def evaluate(data_train_lang, data_test_lang, result_file, model_type, model_name, num_epochs, y_column_names=["y8","y6", "y2"], seeds=[11, 17, 23, 47, 62]):
    """Train on one data set, evaluate on another, and append macro-F1 scores to a CSV.

    For every label column, a fresh ``ClassificationModel`` is trained once per seed
    on ``data_train_lang`` and evaluated on ``data_test_lang``. Each seed's macro-F1
    is appended to ``result_file``, followed by the mean over the seeds.

    Args:
        data_train_lang: DataFrame with a "Comment" column and the label columns; used for training.
        data_test_lang: DataFrame with the same columns; used for evaluation.
        result_file: Target passed to ``write_results`` for every score row.
        model_type: Model type passed to ``ClassificationModel`` (e.g. "bert").
        model_name: Model name passed to ``ClassificationModel``; also used in output paths and score names.
        num_epochs: Value of ``num_train_epochs``.
        y_column_names: Label columns to evaluate, one after another (not modified).
        seeds: Values used as ``manual_seed``, one training run each (not modified).

    Returns:
        None. Results are printed and appended to ``result_file``.
    """
    base_args = _build_model_args(model_name, num_epochs)

    X_train = data_train_lang["Comment"].astype(str)
    X_test = data_test_lang["Comment"].astype(str)

    for y_column_name in y_column_names:
        # Everything below depends only on the label column, so compute it once per
        # column instead of once per seed.
        y_train = data_train_lang[y_column_name]
        y_test = data_test_lang[y_column_name]

        # The score name keeps using the number of classes present in the test labels.
        num_classes = y_test.nunique()
        num_labels = _count_labels(y_train, y_test)
        score_name = make_score_name(f"epochs{num_epochs}", model_name, num_classes)

        train_df = pd.DataFrame(list(zip(X_train, y_train)), columns=['text', 'labels'])
        eval_df = pd.DataFrame(list(zip(X_test, y_test)), columns=['text', 'labels'])

        seed_scores = []

        for manual_seed in seeds:
            # Fresh args per run, so no state is shared between models.
            model_args = {**base_args, "manual_seed": manual_seed}

            print(f"-------------------RUNNING {score_name}-seed{manual_seed} with {num_classes} classes.-------------------")
            print("X_train shape", X_train.shape, "y_train shape", y_train.shape)

            # Create model.
            model = ClassificationModel(model_type, model_name, num_labels=num_labels, use_cuda=True, args=model_args)  # You can set class weights by using the optional weight argument
            # Train model.
            global_step, training_details = model.train_model(train_df, show_running_loss=False, verbose=False)
            print(global_step, training_details)
            # Evaluate model.
            print("-------------------EVALUATE model-------------------")
            result, _, _ = model.eval_model(eval_df, f1=f1_macro_score, verbose=False)

            # Get results.
            print("RESULT ", result)
            macro_f1 = result["f1"]
            seed_scores.append(macro_f1)

            # Write result.
            write_results(result_file, f"0-seed{manual_seed}-{score_name}", macro_f1)

        # Write mean result for all seeds (skipped when no seed was run, which would
        # otherwise divide by zero).
        if seed_scores:
            seeds_means_f1_score = sum(seed_scores) / len(seed_scores)
            write_results(result_file, f"0-mean-{score_name}", seeds_means_f1_score)

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

### Test SR

In [None]:
# CSV file that the evaluate(...) calls below (train on EN, test on SR) append their scores to.
result_file = "./results/transformers_cross_SR.csv"

### Multilingual bert

In [None]:
# Train on EN, test on SR: bert-base-multilingual-cased, 1 epoch.
evaluate(
    data_train_lang=EN_data,
    data_test_lang=SR_data,
    result_file=result_file,
    model_type="bert",
    model_name="bert-base-multilingual-cased",
    num_epochs=1,
)

In [None]:
# Train on EN, test on SR: bert-base-multilingual-cased, 3 epochs.
evaluate(
    data_train_lang=EN_data,
    data_test_lang=SR_data,
    result_file=result_file,
    model_type="bert",
    model_name="bert-base-multilingual-cased",
    num_epochs=3,
)

In [None]:
# Train on EN, test on SR: bert-base-multilingual-cased, 5 epochs.
evaluate(
    data_train_lang=EN_data,
    data_test_lang=SR_data,
    result_file=result_file,
    model_type="bert",
    model_name="bert-base-multilingual-cased",
    num_epochs=5,
)

### Multilingual roberta

In [None]:
# Train on EN, test on SR: xlm-roberta-base, 1 epoch.
evaluate(
    data_train_lang=EN_data,
    data_test_lang=SR_data,
    result_file=result_file,
    model_type="xlmroberta",
    model_name="xlm-roberta-base",
    num_epochs=1,
)

In [None]:
# Train on EN, test on SR: xlm-roberta-base, 3 epochs.
evaluate(
    data_train_lang=EN_data,
    data_test_lang=SR_data,
    result_file=result_file,
    model_type="xlmroberta",
    model_name="xlm-roberta-base",
    num_epochs=3,
)

In [None]:
# Train on EN, test on SR: xlm-roberta-base, 5 epochs.
evaluate(
    data_train_lang=EN_data,
    data_test_lang=SR_data,
    result_file=result_file,
    model_type="xlmroberta",
    model_name="xlm-roberta-base",
    num_epochs=5,
)

### Test EN

In [None]:
# CSV file that the evaluate(...) calls below (train on SR, test on EN) append their scores to.
result_file = "./results/transformers_cross_EN.csv"

### Multilingual bert

In [None]:
# Train on SR, test on EN: bert-base-multilingual-cased, 1 epoch.
evaluate(
    data_train_lang=SR_data,
    data_test_lang=EN_data,
    result_file=result_file,
    model_type="bert",
    model_name="bert-base-multilingual-cased",
    num_epochs=1,
)

In [None]:
# Train on SR, test on EN: bert-base-multilingual-cased, 3 epochs.
evaluate(
    data_train_lang=SR_data,
    data_test_lang=EN_data,
    result_file=result_file,
    model_type="bert",
    model_name="bert-base-multilingual-cased",
    num_epochs=3,
)

In [None]:
# Train on SR, test on EN: bert-base-multilingual-cased, 5 epochs.
evaluate(
    data_train_lang=SR_data,
    data_test_lang=EN_data,
    result_file=result_file,
    model_type="bert",
    model_name="bert-base-multilingual-cased",
    num_epochs=5,
)

### Multilingual roberta

In [None]:
# Train on SR, test on EN: xlm-roberta-base, 1 epoch.
evaluate(
    data_train_lang=SR_data,
    data_test_lang=EN_data,
    result_file=result_file,
    model_type="xlmroberta",
    model_name="xlm-roberta-base",
    num_epochs=1,
)

In [None]:
# Train on SR, test on EN: xlm-roberta-base, 3 epochs.
evaluate(
    data_train_lang=SR_data,
    data_test_lang=EN_data,
    result_file=result_file,
    model_type="xlmroberta",
    model_name="xlm-roberta-base",
    num_epochs=3,
)

In [None]:
# Train on SR, test on EN: xlm-roberta-base, 5 epochs.
evaluate(
    data_train_lang=SR_data,
    data_test_lang=EN_data,
    result_file=result_file,
    model_type="xlmroberta",
    model_name="xlm-roberta-base",
    num_epochs=5,
)