In [7]:
import os
import parse
import fasttext
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
datadir = "/home/peterr/macocu/task5_webgenres/data/final/fasttext1"

dev_full = os.path.join(datadir, "dev_onlykeep_False_onlyprimary_True_dedup_False.fasttext")
test_full = os.path.join(datadir, "test_onlykeep_False_onlyprimary_True_dedup_False.fasttext")
train_full = os.path.join(datadir, "train_onlykeep_False_onlyprimary_True_dedup_False.fasttext")

def parse_fasttext_file(path: str):
    """Reads fasttext formatted file and returns dataframe."""
    with open(path, "r") as f:
        content = f.readlines()
    pattern = "{label} {text}\n"
    p = parse.compile(pattern)

    labels, texts = list(), list()
    for line in content:
        rez = p.parse(line)
        if rez is not None:
            labels.append(rez["label"])
            texts.append(rez["text"])
        else:
            print("error parsing line ", line)
    # labels = np.array(labels)
    # all_labels = np.array(['__label__Legal/Regulation', '__label__Opinionated_News',
    #    '__label__News/Reporting', '__label__Forum',
    #    '__label__Correspondence', '__label__Invitation',
    #    '__label__Instruction', '__label__Recipe',
    #    '__label__Opinion/Argumentation', '__label__Promotion_of_Services',
    #    '__label__Promotion', '__label__List_of_Summaries/Excerpts',
    #    '__label__Promotion_of_a_Product', '__label__Call',
    #    '__label__Review', '__label__Other',
    #    '__label__Information/Explanation', '__label__Interview',
    #    '__label__Prose', '__label__Research_Article',
    #    '__label__Announcement']).reshape((-1, 1))
    # from sklearn.preprocessing import OrdinalEncoder
    # enc = OrdinalEncoder()
    # enc.fit(all_labels)
    # labels = enc.transform(labels.reshape((-1,1))).reshape(-1)
    return pd.DataFrame(data={"text": texts, "labels": labels})

train_df = parse_fasttext_file(train_full)
eval_df = parse_fasttext_file(dev_full)

In [9]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import wandb


model_args = ClassificationArgs()
model_args.output_dir = "data/models"
model_args.use_early_stopping = True
model_args.early_stopping_delta = 0.01
model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.early_stopping_patience = 3
model_args.evaluate_during_training_steps = 1000
model_args.save_eval_checkpoints = False,
model_args.save_model_every_epoch = False,
model_args.save_optimizer_and_scheduler = False,
model_args.save_steps = -1

model_args.evaluate_during_training = True
model_args.manual_seed = 4
model_args.use_multiprocessing = True
model_args.eval_batch_size = 32
model_args.wandb_project = "webgenres"
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True


sweep_config = {
    "method": "bayes",  # grid, random
    "metric": {"name": "mcc", "goal": "maximize"},
    "parameters": {
        "num_train_epochs": {"values": [5,6,7,8,9,10]},
        "learning_rate":  {"min": 1e-8, "max":1e-5},
        "train_batch_size": {"values": [32]}
    },
}

sweep_id = wandb.sweep(sweep_config, project="webgenres")



def train():
    # Initialize a new wandb run
    wandb.init()

    # Create a TransformerModel
    model = ClassificationModel(
        "camembert",
        "EMBEDDIA/sloberta",
        use_cuda=False,
        args=model_args,
        sweep_config=wandb.config,
    )

    # Train the model
    model.train_model(train_df, eval_df=eval_df)

    # Evaluate the model
    model.eval_model(eval_df)

    # Sync wandb
    wandb.join()
train()




Create sweep with ID: tzkqr4x8
Sweep URL: https://wandb.ai/5roop/webgenres/sweeps/tzkqr4x8


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Some weights of the model checkpoint at EMBEDDIA/sloberta were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['roberta.pooler.dense.bias', 'classifier.dense.weight',

ValueError: too many dimensions 'str'