In [16]:
# Dependecies
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from time import time
from tqdm.auto import tqdm

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student")
model = AutoModelForSequenceClassification.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student")
model.config

DistilBertConfig {
  "_name_or_path": "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "positive",
    "1": "neutral",
    "2": "negative"
  },
  "initializer_range": 0.02,
  "label2id": {
    "negative": 2,
    "neutral": 1,
    "positive": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.37.2",
  "vocab_size": 119547
}

In [3]:
df_ecoas=pd.read_csv('./datasets/df_ready.csv', index_col=0)
df_ecoas.head()

Unnamed: 0,APR,DOM,EVA,MEJ,MET,PRA,REC,RET,ASE,GÉNERO ALUMNO,PROM ACUMULADO EN PROFESIONAL,Género del profesor,Tipo Comentario,Comentarios,AVG,Lemm
0,5.0,4.0,9.0,0.0,5.0,10.0,4.0,10.0,8.0,0,94.428,0.0,0,"Sabe explicar muy bien las cosas teoricas, pe...",7.2,"['saber', 'explicar', 'bien', 'cosa', 'teorico..."
1,5.0,8.0,8.0,0.0,5.0,7.0,8.0,5.0,8.0,0,95.968,0.0,0,Método anticuado de enseñar. Los temas podría...,6.3,"['método', 'anticuado', 'enseñar', 'tema', 'po..."
2,10.0,10.0,5.0,1.0,8.0,10.0,10.0,10.0,8.0,0,96.408,0.0,1,"Sabe muchísimo del tema, muy preparada.",8.8,"['saber', 'muchísimo', 'tema', 'preparado']"
3,10.0,10.0,10.0,1.0,10.0,10.0,10.0,10.0,10.0,0,94.981,0.0,2,buena maestra si lo recomiendo,10.0,"['buen', 'maestro', 'si', 'recomer']"
4,9.0,10.0,9.0,1.0,9.0,9.0,9.0,9.0,9.0,0,89.04,0.0,1,Tiene mucho conocimiento sobre los temas.,9.0,"['conocimiento', 'tema']"


In [4]:
conditions = [
    (df_ecoas['Tipo Comentario'] == 0),
    (df_ecoas['Tipo Comentario'] == 1),
    (df_ecoas['Tipo Comentario'] == 2)
]

choices = ['negative', 'positive', 'neutral']
df_ecoas['Sentiment'] = np.select(conditions, choices, default='undefined')

In [5]:
# keep only comments and sentiment
df_ecoas = df_ecoas[['Comentarios', 'Sentiment']]
df_ecoas.head()

Unnamed: 0,Comentarios,Sentiment
0,"Sabe explicar muy bien las cosas teoricas, pe...",negative
1,Método anticuado de enseñar. Los temas podría...,negative
2,"Sabe muchísimo del tema, muy preparada.",positive
3,buena maestra si lo recomiendo,neutral
4,Tiene mucho conocimiento sobre los temas.,positive


In [6]:
# We will use the 'Comentarios' column as input for the model and the 'Tipo Comentario' as the target variable
train, test = train_test_split(df_ecoas, test_size=0.2, random_state=42)

X_train = train['Comentarios']
y_train = train['Sentiment']
X_test = test['Comentarios']
y_test = test['Sentiment']

In [7]:
X_train[2], y_train[2]

('Sabe muchísimo del tema, muy preparada. ', 'positive')

In [8]:
from transformers import TextClassificationPipeline
distilled_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
distilled_classifier(X_train[2])



[[{'label': 'positive', 'score': 0.9138137102127075},
  {'label': 'neutral', 'score': 0.04455755650997162},
  {'label': 'negative', 'score': 0.041628628969192505}]]

In [10]:
start = time()
class_names = ['positive', 'neutral', 'negative']

batch_size = 128 # larger batch size bc distilled model is more memory efficient
distilled_classifier.return_all_scores = False
tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
preds = []
outputs = []
for i in tqdm(range(0, len(df_ecoas), batch_size)):
    examples = df_ecoas[i:i+batch_size]['Comentarios'].tolist()
    outputs.append(distilled_classifier(examples, **tokenizer_kwargs))
    # outputs = distilled_classifier(examples, **tokenizer_kwargs)
    # preds += [class_names.index(o['label'][0]) for o in outputs]


100%|██████████| 755/755 [40:58<00:00,  3.26s/it]


In [11]:
preds = []

for i in range(len(outputs)):
    for j in range(len(outputs[i])):
        preds.append(max(outputs[i][j], key=lambda x: x['score'])['label'])

In [12]:
df_ecoas[0:10]

Unnamed: 0,Comentarios,Sentiment
0,"Sabe explicar muy bien las cosas teoricas, pe...",negative
1,Método anticuado de enseñar. Los temas podría...,negative
2,"Sabe muchísimo del tema, muy preparada.",positive
3,buena maestra si lo recomiendo,neutral
4,Tiene mucho conocimiento sobre los temas.,positive
5,Una maestra con mucho conocimiento y visión,positive
6,Conocedora de tema,positive
7,Aunque no la conozco fisicamente ya que la cla...,positive
8,La clase esta super repetitiva todo el semest...,negative
9,Muchas actividades de aprendizaje. no aplica,neutral


In [13]:
preds[0:10]

['positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'neutral',
 'negative',
 'positive']

In [17]:
report = classification_report(df_ecoas['Sentiment'], preds, target_names=class_names)
print(report)

              precision    recall  f1-score   support

    positive       0.33      0.53      0.40      8823
     neutral       0.50      0.08      0.14     33131
    negative       0.66      0.93      0.77     54586

    accuracy                           0.60     96540
   macro avg       0.49      0.51      0.44     96540
weighted avg       0.57      0.60      0.52     96540



In [14]:
accuracy = np.mean(np.array(preds) == np.array(df_ecoas['Sentiment']))
print(f"Distilled model accuracy: {accuracy*100:0.2f}%")
print(f"Runtime: {time() - start : 0.2f} seconds")

Distilled model accuracy: 60.10%
Runtime:  13447.81 seconds
