In [None]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
from sklearn import preprocessing

In [None]:
def print_tokens_len(language: str, model_type: str, model_name: str, max_seq_len: int = 512,
                     corpus_path: str = "./data/corpus.csv", use_cuda: bool = True) -> None:
    """Train ``model_name`` for one epoch on the corpus rows of one language.

    The corpus CSV is read, filtered to rows whose ``NaturalLanguageID`` equals
    ``language``, and the ``Comment`` / ``y2`` columns are used as text and
    label. The trained model is discarded; the function only prints.
    NOTE(review): judging by the name, the point of the run is to surface
    sequence-length diagnostics emitted while the data is tokenized -- confirm.

    Args:
        language: Value of ``NaturalLanguageID`` to keep.
        model_type: Model family passed to ``ClassificationModel``.
        model_name: Model identifier passed to ``ClassificationModel``; also
            used to build the output/cache/tensorboard directories.
        max_seq_len: Value stored under the ``max_seq_length`` model argument.
        corpus_path: Location of the corpus CSV.
        use_cuda: Forwarded to ``ClassificationModel``.

    Returns:
        None. Errors raised while building or training the model are printed
        rather than propagated, so a failing run does not abort the notebook.
    """
    print(f"Language {language}, model_name = {model_name}, model_type = {model_type}")

    model_dir = f"./Transformers/{model_name}"
    parameter_dict = {
        "fp16": False,
        "overwrite_output_dir": True,
        "reprocess_input_data": True,
        "no_cache": True,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": False,
        "use_cached_eval_features": False,
        "output_dir": f"{model_dir}/outputs/",
        "cache_dir": f"{model_dir}/cache/",
        "tensorboard_dir": f"{model_dir}/runs/",
        "silent": True,
        "num_train_epochs": 1,
        "max_seq_length": max_seq_len,
    }

    corpus = pd.read_csv(corpus_path, dtype=str)
    print(corpus.shape)

    # Rows without a label cannot be trained on. They must be removed before
    # encoding: nunique() ignores NaN while LabelEncoder does not, so keeping
    # them would make the label ids disagree with num_labels.
    data = corpus[corpus.NaturalLanguageID == language].dropna(subset=["y2"])
    if data.empty:
        print(f"No labelled rows found for language {language!r}; nothing to train on.")
        return

    X = data["Comment"].astype(str)

    # Convert string classes to numbers.
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(data["y2"])
    # Take the class count from the encoder so it always matches the label ids.
    num_classes = len(le.classes_)

    # Prepare data. Positional arrays are used so the filtered index of X
    # cannot misalign with the freshly encoded labels.
    train_df = pd.DataFrame({"text": X.to_numpy(), "labels": y})

    try:
        # Create model.
        model = ClassificationModel(model_type, model_name, num_labels=num_classes, use_cuda=use_cuda, args=parameter_dict)  # You can set class weights by using the optional weight argument
        # Train model.
        _, _ = model.train_model(train_df, show_running_loss=False, verbose=False)
    except Exception as e:
        # Deliberately best-effort: report the failure and let the notebook
        # continue. The type name is included because some exceptions carry
        # an empty message.
        print(f"{type(e).__name__}: {e}")

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# (language, model_type, model_name) combinations, in the order they are run.
RUNS = [
    ("SR", "electra", "classla/bcms-bertic"),
    ("EN", "electra", "google/electra-base-discriminator"),
    ("SR", "bert", "bert-base-multilingual-cased"),
    ("EN", "bert", "bert-base-multilingual-cased"),
    ("SR", "xlmroberta", "xlm-roberta-base"),
    ("EN", "xlmroberta", "xlm-roberta-base"),
]

# Sequence-length cap shared by every run. It is far above the function's
# default of 512. NOTE(review): presumably chosen so that no comment is
# truncated -- confirm this is intentional.
MAX_SEQ_LEN = 5000

for language, model_type, model_name in RUNS:
    print_tokens_len(language=language, model_type=model_type, model_name=model_name, max_seq_len=MAX_SEQ_LEN)