## Librerías, datos y procesamiento inicial

In [1]:
#Importamos las librerías
import pandas as pd
import numpy as np
import evaluate
from imblearn.datasets import make_imbalance
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, TrainingArguments, Trainer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import mlflow, os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["MLFLOW_EXPERIMENT_NAME"] = "topic"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "0"
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment(experiment_name='topic')

2023/03/23 19:49:26 INFO mlflow.tracking.fluent: Experiment with name 'topic' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/433727646153651320', creation_time=1679618966216, experiment_id='433727646153651320', last_update_time=1679618966216, lifecycle_stage='active', name='topic', tags={}>

In [3]:
#Importamos los datos:
df = pd.read_excel(r"G:\Mi unidad\Universidad\EAFIT\2 semestre\PDG\modelos\TOPIC\data_depurada.xlsx")

In [4]:
df["label"].value_counts()

Post de interacción      2130
Dudas                    1062
Reclutamiento             489
Menciones                 436
Felicitaciones            400
Crítica                   393
Noticias                  194
Condiciones laborales     146
Alcance                   126
Oferta terceros           116
Pagos                      64
Otros                      48
Name: label, dtype: int64

In [5]:
topics = ["Post de interacción","Alcance","Dudas","Otros","Reclutamiento","Crítica","Condiciones laborales","Oferta terceros", "Menciones", "Noticias", "Felicitaciones", "Pagos"]
df["text"] = df["text"].apply(lambda x: str(x))

In [6]:
#Variables
check_point = "facebook/bart-large-mnli"
nombre_base = "bart-large-mnli_"

## Modelo Base

In [8]:
tokenizer = AutoTokenizer.from_pretrained(check_point)
model = AutoModelForSequenceClassification.from_pretrained(check_point,num_labels=12,ignore_mismatched_sizes=True)
classifier = pipeline("zero-shot-classification",model = model, tokenizer=tokenizer, padding="max_length", truncation=True,max_length=128,device=0)
pred = df["text"].apply(lambda x: classifier(x,topics)['labels'][0])
print(classification_report(df["label"],pred))

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([12, 1024]) in the model instantiated
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([12]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


                       precision    recall  f1-score   support

              Alcance       0.09      0.47      0.16       126
Condiciones laborales       0.07      0.04      0.05       146
              Crítica       0.12      0.47      0.19       393
                Dudas       0.08      0.02      0.04      1062
       Felicitaciones       0.47      0.32      0.38       400
            Menciones       0.09      0.03      0.04       436
             Noticias       0.17      0.06      0.08       194
      Oferta terceros       0.01      0.02      0.01       116
                Otros       0.00      0.02      0.00        48
                Pagos       0.03      0.55      0.06        64
  Post de interacción       0.05      0.01      0.02      2130
        Reclutamiento       0.10      0.10      0.10       489

             accuracy                           0.09      5604
            macro avg       0.11      0.17      0.09      5604
         weighted avg       0.10      0.09      0.07 

In [9]:
with mlflow.start_run(run_name=nombre_base+'BASELINE'):
    acc = accuracy_score(df["label"],pred)
    mlflow.log_metric("accuracy", acc)