1. **Descarga e Instalación de Librerías**

In [None]:
import pandas as pd
import torch
import numpy as np
from torch import nn, optim
from textwrap import wrap
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

In [None]:
# Install the Hugging Face `transformers` package into the runtime.
# NOTE(review): this cell sits after the cell that already imports `transformers`;
# on a fresh kernel the import cell runs first — consider moving this cell to the top.
!pip install transformers



2. **Carga y visualización de Dataframe Preprocesado**

2.1 Carga de df preprocesado --> df_corpus_preproc

In [None]:
# Mount Google Drive so the CSV stored there is reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Read the preprocessed DataFrame from Google Drive and preview its first rows
preproc_csv_path = '/content/drive/MyDrive/Bootcamp AI&ML KC/NLP/df_corpus_preproc.csv'
df_corpus_preproc = pd.read_csv(preproc_csv_path)
df_corpus_preproc.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,overall,reviewText
0,4,material arrived early excellent condition how...
1,4,really enjoying book worksheet make review goa...
2,1,taking class waste money called book book isi ...
3,3,book missing page important page could answer ...
4,5,used learnsmart officially say amazing study t...


2.2 Visualización de No. de Reviews

In [None]:
# Total number of reviews in the dataset (Series.count() skips missing values)
review_column = df_corpus_preproc['reviewText']
total_reviews = review_column.count()
print("Número total de reviews en 'reviewText':", total_reviews)

Número total de reviews en 'reviewText': 458773


2.3 Columna --> sentiment: Categorización de Reviews Positivas y Negativas

In [None]:
# New column 'sentiment' derived from the 'overall' rating:
# ratings 1 to 3 --> 'Negativa' / ratings 4 to 5 --> 'Positiva'
is_negative = df_corpus_preproc['overall'] <= 3
df_corpus_preproc['sentiment'] = is_negative.map({True: 'Negativa', False: 'Positiva'})
print(df_corpus_preproc.head())

   overall                                         reviewText sentiment
0        4  material arrived early excellent condition how...  Positiva
1        4  really enjoying book worksheet make review goa...  Positiva
2        1  taking class waste money called book book isi ...  Negativa
3        3  book missing page important page could answer ...  Negativa
4        5  used learnsmart officially say amazing study t...  Positiva


In [None]:
# Number of negative and positive reviews
sentiment_column = df_corpus_preproc['sentiment']
num_reviews_negativas = int((sentiment_column == 'Negativa').sum())
num_reviews_positivas = int((sentiment_column == 'Positiva').sum())

print("Número de reviews Negativas:", num_reviews_negativas)
print("Número de reviews Positivas:", num_reviews_positivas)

Número de reviews Negativas: 173381
Número de reviews Positivas: 285989


2.4 Carga de df en bruto --> df_corpus

In [None]:
# Mount Google Drive so the raw dataset stored there is reachable
from google.colab import drive
drive.mount('/content/drive')

# Load the raw corpus: a gzip-compressed JSON Lines file (one JSON record per line).
# The path uses a single leading slash, matching the CSV path used for the preprocessed frame.
file_path = '/content/drive/MyDrive/Bootcamp AI&ML KC/NLP/Software.json.gz'
df_corpus = pd.read_json(file_path, compression='gzip', lines=True)
df_corpus.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4,True,"03 11, 2014",A240ORQ2LF9LUI,77613252,{'Format:': ' Loose Leaf'},Michelle W,The materials arrived early and were in excell...,Material Great,1394496000,,
1,4,True,"02 23, 2014",A1YCCU0YRLS0FE,77613252,{'Format:': ' Loose Leaf'},Rosalind White Ames,I am really enjoying this book with the worksh...,Health,1393113600,,
2,1,True,"02 17, 2014",A1BJHRQDYVAY2J,77613252,{'Format:': ' Loose Leaf'},Allan R. Baker,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",ARE YOU KIDING ME?,1392595200,7.0,
3,3,True,"02 17, 2014",APRDVZ6QBIQXT,77613252,{'Format:': ' Loose Leaf'},Lucy,This book was missing pages!!! Important pages...,missing pages!!,1392595200,3.0,
4,5,False,"10 14, 2013",A2JZTTBSLS1QXV,77775473,,Albert V.,I have used LearnSmart and can officially say ...,Best study product out there!,1381708800,,


In [None]:
# Remove rows whose 'reviewText' is missing
# (author's note: NaN rows are 0.01436% of all reviews in the df)
has_text = df_corpus['reviewText'].notna()
df_corpus = df_corpus[has_text]
# Sanity check: count the missing values that remain
nan_reviewText = df_corpus['reviewText'].isna().sum()
print(f"NaN o nulos en 'reviewText': {nan_reviewText}")

NaN o nulos en 'reviewText': 0


2.5 Preprocesar Dataframe --> Codificar los Reviews en 0 y 1

In [None]:
# Columns of interest --> 'overall', 'reviewText'
columnas_a_conservar = ['overall', 'reviewText']
# .copy() makes the subset an independent frame, so the in-place column edits
# performed in the following cells do not trigger pandas' SettingWithCopyWarning
df_corpus = df_corpus[columnas_a_conservar].copy()
df_corpus.head()

Unnamed: 0,overall,reviewText
0,4,The materials arrived early and were in excell...
1,4,I am really enjoying this book with the worksh...
2,1,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ..."
3,3,This book was missing pages!!! Important pages...
4,5,I have used LearnSmart and can officially say ...


In [None]:
# New column 'sentiment' derived from the 'overall' rating:
# ratings 1 to 3 --> 'Negativa' / ratings 4 to 5 --> 'Positiva'
low_rating = df_corpus['overall'] <= 3
df_corpus['sentiment'] = low_rating.map({True: 'Negativa', False: 'Positiva'})
# Rename the text column from 'reviewText' to 'review'
df_corpus.rename(columns={'reviewText': 'review'}, inplace=True)
df_corpus.head()

Unnamed: 0,overall,review,sentiment
0,4,The materials arrived early and were in excell...,Positiva
1,4,I am really enjoying this book with the worksh...,Positiva
2,1,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",Negativa
3,3,This book was missing pages!!! Important pages...,Negativa
4,5,I have used LearnSmart and can officially say ...,Positiva


In [None]:
# Create the 'label' column --> Positiva=1, Negativa=0
df_corpus['label'] = df_corpus['sentiment'].eq('Positiva').astype(int)
df_corpus.head()

Unnamed: 0,overall,review,sentiment,label
0,4,The materials arrived early and were in excell...,Positiva,1
1,4,I am really enjoying this book with the worksh...,Positiva,1
2,1,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",Negativa,0
3,3,This book was missing pages!!! Important pages...,Negativa,0
4,5,I have used LearnSmart and can officially say ...,Positiva,1


In [None]:
# Drop the 'overall' and 'sentiment' columns
df_corpus.drop(columns=['overall', 'sentiment'], inplace=True)
df_corpus.head()

Unnamed: 0,review,label
0,The materials arrived early and were in excell...,1
1,I am really enjoying this book with the worksh...,1
2,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",0
3,This book was missing pages!!! Important pages...,0
4,I have used LearnSmart and can officially say ...,1


3. **Criterios de Selección de Modelos para Análisis de Sentimientos**

3.1 Modelo A --> BERT: (*Se usará el df_corpus*)
1. Captura representaciones contextuales del texto, permitiendo una comprensión profunda de las palabras y su contexto.
2. Ofrece un rendimiento sobresaliente.
3. Aprovecha mejor grandes conjuntos de datos.
4. Requiere más tiempo desarrollar e implementar debido a su complejidad.
5. Requiere el texto en bruto. El tokenizador del modelo preentrenado realiza la tokenización y el preprocesamiento automáticamente: divide el texto en subtokens y conserva información como la puntuación, las mayúsculas y las stopwords.

3.2 Modelo B --> Regresión Logística: (*Se usará el df_corpus_preproc*)
1. Representación más simple de las palabras.
2. Generalmente, tiene un rendimiento inferior en comparación con modelos más avanzados como BERT.
3. No requiere preentrenamiento y se ajusta directamente al conjunto de datos de análisis de sentimientos.
4. Puede requerir técnicas de manejo de clases desequilibradas.
5. Más rápido de desarrollar e implementar.
6. Requiere preprocesado y puede requerir una representación Bag of Words (BoW) para convertir el texto en datos numéricos.

4. **Modelo A: BERT** (Bidirectional Encoder Representations from Transformers)

4.1 Configuración y Preparación del Entorno

In [None]:
# Initialization parameters
RANDOM_SEED = 7
MAX_LEN = 256
BATCH_SIZE = 32
NCLASSES = 2

# Set the random seed for both NumPy and PyTorch
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Check GPU availability (fall back to the CPU otherwise)
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Shape of the df --> rows (instances) and columns (attributes)
print(df_corpus.shape)
# Print one sample review (the row with index label 123), wrapped across lines
sample_review = df_corpus.loc[123, 'review']
print("\n".join(wrap(sample_review)))

(459370, 2)
This really is a simple and informative way of getting yourself
immersed in Japanese. The lessons are easy to understand and
everything is explained in a friendly, easy to understand manner. All
you really need to do is put your best effort into it and practice. If
you really want to get an advantage then Japanese Hiragana & Katakana
for Beginners: First Steps to Mastering the Japanese Writing System,
is a MUST. Also available on Amazon


In [None]:
# Keep only the first 12000 reviews to speed up development and experimentation
df_corpus = df_corpus.iloc[:12000]
print(df_corpus.shape)

(12000, 2)


In [None]:
# Pretrained checkpoint to use --> 'bert-base-cased'
# (author's note: English text, distinguishes upper/lower case)
# Tokenizer object loaded for that checkpoint
pre_trained_model = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(pre_trained_model)

In [None]:
# Tokenizer smoke test: split a sample sentence into tokens and map them to ids
sample_txt = 'This really is a simple and informative way of getting yourself immersed in Japanese.'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
for caption, shown in (('Sentence: ', sample_txt), ('Tokens: ', tokens), ('Tokens num: ', token_ids)):
  print(caption, shown)

Sentence:  This really is a simple and informative way of getting yourself immersed in Japanese.
Tokens:  ['This', 'really', 'is', 'a', 'simple', 'and', 'inform', '##ative', 'way', 'of', 'getting', 'yourself', 'im', '##mers', '##ed', 'in', 'Japanese', '.']
Tokens num:  [1188, 1541, 1110, 170, 3014, 1105, 12862, 5838, 1236, 1104, 2033, 3739, 13280, 19134, 1174, 1107, 1983, 119]


In [None]:
# Encode the sample text the way it will be prepared for the model:
# add the special tokens, truncate/pad to max_length and return PyTorch tensors.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True` flag.
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length = 10,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    padding = 'max_length',
    return_attention_mask = True,
    return_tensors = 'pt'
)



In [None]:
# List the keys available in the `encoding` dictionary
# encoding --> encoded representation of a text
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
# Show the tokenized text representation that is fed to a model
first_input_ids = encoding['input_ids'][0]
first_attention_mask = encoding['attention_mask'][0]
print(tokenizer.convert_ids_to_tokens(first_input_ids))
print(first_input_ids)
print(first_attention_mask)

['[CLS]', 'This', 'really', 'is', 'a', 'simple', 'and', 'inform', '##ative', '[SEP]']
tensor([  101,  1188,  1541,  1110,   170,  3014,  1105, 12862,  5838,   102])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [None]:
# Custom dataset class
# Loads and encodes review data for the model
class ReviewsDataset(Dataset):
  """Dataset that pairs each review text with its label and encodes it on access.

  Args:
    reviews: indexable collection of review texts.
    labels: indexable collection of labels, aligned with `reviews`.
    tokenizer: object exposing `encode_plus` (e.g. a BERT tokenizer).
    max_len: length every encoded sequence is truncated/padded to.
  """

  def __init__(self,reviews,labels,tokenizer,max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Return the raw text, encoded tensors and label for position `item`."""
    review = str(self.reviews[item])
    label = self.labels[item]
    # Use the tokenizer handed to this dataset, not whatever global happens to
    # be named `tokenizer` in the notebook.
    encoding = self.tokenizer.encode_plus(
        review,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        # replaces the deprecated `pad_to_max_length = True`
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
        )

    return {
          'review': review,
          # flatten() drops the leading batch dimension added by return_tensors='pt'
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'label': torch.tensor(label, dtype=torch.long)
      }

In [None]:
# Function that loads the data in batches and prepares it for model training
def data_loader(df, tokenizer, max_len, batch_size):
  """Build a DataLoader over the reviews of `df`.

  Args:
    df: DataFrame with a `review` column (texts) and a `label` column.
    tokenizer: tokenizer passed on to `ReviewsDataset`.
    max_len: sequence length passed on to `ReviewsDataset`.
    batch_size: number of samples per batch.

  Returns:
    A `DataLoader` wrapping a `ReviewsDataset`, using 4 worker processes.
  """
  # Honor the arguments instead of the notebook-level MAX_LEN / BATCH_SIZE,
  # so callers can actually choose a different length or batch size.
  dataset = ReviewsDataset(
      reviews = df.review.to_numpy(),
      labels = df.label.to_numpy(),
      tokenizer = tokenizer,
      max_len = max_len  )

  return DataLoader(dataset, batch_size = batch_size, num_workers = 4)

4.2 Dividir el Dataset en Train y Test

In [None]:
# Hold out 20% of the data as the test set
df_corpus_train, df_corpus_test = train_test_split(
    df_corpus,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Build the train and test DataLoaders with the same settings so both splits
# are loaded in the same way
train_data_loader, test_data_loader = (
    data_loader(split, tokenizer, MAX_LEN, BATCH_SIZE)
    for split in (df_corpus_train, df_corpus_test)
)



In [None]:
# Sentiment classifier model (BERT)
# Classification head input size --> BERT hidden size (768 per the original note)
# Number of output units --> n_classes (2 in this notebook)
# Dropout --> 30%
# forward --> order in which the layers of the network are connected
class BERTSentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification layer.

  Args:
    n_classes: number of output classes (size of the logits vector).
  """

  def __init__(self, n_classes):
    super(BERTSentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(pre_trained_model)
    self.drop = nn.Dropout(p=0.3)
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the classification logits for a batch of encoded reviews."""
    bert_outputs = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask
    )
    # Take the pooled output by position instead of tuple-unpacking: when the
    # encoder returns a dict-like ModelOutput, unpacking yields its key strings
    # rather than tensors, while integer indexing works for both a plain tuple
    # and a ModelOutput.
    cls_output = bert_outputs[1]
    drop_output = self.drop(cls_output)
    output = self.linear(drop_output)
    return output

In [None]:
# Instantiate the classifier and move it to the selected device in one step
model = BERTSentimentClassifier(NCLASSES).to(device)

4.3 Entrenamiento del Modelo

In [None]:
# Set up the training phase of the model

# Number of epochs
EPOCHS = 5

# Loss function --> loss measured during training
loss_fn = nn.CrossEntropyLoss().to(device)

# Optimizer that adjusts the weights
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Learning-rate scheduler over all training steps; warmup set to 0 --> no warmup
steps_per_epoch = len(train_data_loader)
total_steps = steps_per_epoch * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)



In [None]:
# Functions to train and evaluate the model
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one training pass over `data_loader`.

  For every batch: forward pass, loss, backward pass, gradient-norm clipping
  (max norm 1.0), optimizer step, scheduler step, and gradient reset.

  Returns:
    (accuracy, mean_loss): correct predictions divided by `n_examples`, and
    the mean of the per-batch losses.
  """
  model = model.train()
  batch_losses = []
  n_correct = 0
  for batch in data_loader:
    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)
    targets = batch['label'].to(device)

    logits = model(input_ids = ids, attention_mask = mask)
    # torch.max over dim=1 returns (values, indices); the indices are the predicted classes
    predicted = torch.max(logits, dim=1)[1]
    batch_loss = loss_fn(logits, targets)

    n_correct += torch.sum(predicted == targets)
    batch_losses.append(batch_loss.item())

    batch_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` over `data_loader` without computing gradients.

  Returns:
    (accuracy, mean_loss): correct predictions divided by `n_examples`, and
    the mean of the per-batch losses.
  """
  model = model.eval()
  batch_losses = []
  n_correct = 0
  with torch.no_grad():
    for batch in data_loader:
      ids = batch['input_ids'].to(device)
      mask = batch['attention_mask'].to(device)
      targets = batch['label'].to(device)

      logits = model(input_ids = ids, attention_mask = mask)
      # torch.max over dim=1 returns (values, indices); the indices are the predicted classes
      predicted = torch.max(logits, dim=1)[1]

      n_correct += torch.sum(predicted == targets)
      batch_losses.append(loss_fn(logits, targets).item())

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
# Training: one training pass and one evaluation pass per epoch

for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  # The accuracy denominators are the sizes of the splits the loaders were built
  # from: df_corpus_train / df_corpus_test (the names produced by train_test_split).
  train_acc, train_loss = train_model(
      model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_corpus_train)
  )
  test_acc, test_loss = eval_model(
      model, test_data_loader, loss_fn, device, len(df_corpus_test)
  )
  print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
  print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
  print('')

Epoch 1 de 5
------------------
Entrenamiento: Loss: 0.2651677090637386, accuracy: 0.90175
Validación: Loss: 0.3484566710740328, accuracy: 0.892

Epoch 2 de 5
------------------
Entrenamiento: Loss: 0.178699440728873, accuracy: 0.9460000000000001
Validación: Loss: 0.40424482348561286, accuracy: 0.8935000000000001

Epoch 3 de 5
------------------
Entrenamiento: Loss: 0.11829778066650033, accuracy: 0.971
Validación: Loss: 0.4401103568077087, accuracy: 0.898

Epoch 4 de 5
------------------
Entrenamiento: Loss: 0.07990703093074263, accuracy: 0.982625
Validación: Loss: 0.49894495904445646, accuracy: 0.8965

Epoch 5 de 5
------------------
Entrenamiento: Loss: 0.06776802394539118, accuracy: 0.9855
Validación: Loss: 0.49894495904445646, accuracy: 0.8965



5. **Modelo B: Regresión Logística**

5.1 Preparación de Datos