In [21]:
import math
import nltk
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import transformers
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from transformers import BertTokenizer
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
from transformers import BertForSequenceClassification



In [22]:

# Load the labelled reviews from a CSV file in the working directory.
df = pd.read_csv('imdb_labeled_harry_reviews.csv')

# Fail fast if the columns used below are absent or contain missing values;
# otherwise the problem only surfaces later, during tokenization or splitting.
required_columns = ['Cleaned Review', 'Sentiment']
missing_columns = [col for col in required_columns if col not in df.columns]
assert not missing_columns, f"missing expected columns: {missing_columns}"
assert df[required_columns].notna().all().all(), "missing values in the text/label columns"

# Model input is the 'Cleaned Review' text; the target is the numeric 'Sentiment' code.
X = df['Cleaned Review'].values
y = df['Sentiment'].values

# Hold out 20% of the rows as a validation set. Stratifying on y keeps the label
# proportions the same in both splits; random_state pins the shuffle.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Data read and split successfully.")

Data read and split successfully.


In [23]:
# Sanity check: the (rows, columns) shape on the first line, then the first few rows.
print(df.shape, df.head(), sep='\n')


(9453, 5)
                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      Cleaned Review  Sentiment  \
0  one reviewers mentioned watching 1 oz episode ...          1   
1  wonderful little production br br filming tech...          1   
2  thought wonderful way spend time hot summer we...          1   
3  basically theres family little boy jake thinks...          1   
4  petter matteis love time money visually stunni...          1   

  Sentiment Label  
0         neutral  
1         neutral  
2         neutral  
3         neutral  
4         neutral  


In [24]:
# Sets HF_HUB_DISABLE_SYMLINKS_WARNING to "1" (going by its name, a switch that
# turns off a Hugging Face Hub symlink warning).
# NOTE(review): the import cell at the top of the notebook already imports os and
# sets this same variable, so this cell appears to be redundant.
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [25]:
# Build the tokenizer from the pretrained checkpoint named below.
bert_checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint_name)

# Tokenize and encode the data
def encode_data(texts, labels, tokenizer, max_length=128):
    """Tokenize `texts` into fixed-length tensors and pair them with `labels`.

    All texts are encoded in a single batched tokenizer call instead of one
    call per text followed by a concatenation.

    Args:
        texts: Sequence of strings (list, NumPy array, pandas Series, ...).
        labels: Sequence of labels aligned one-to-one with `texts`.
        tokenizer: Tokenizer that is callable in the Hugging Face style, i.e.
            ``tokenizer(list_of_str, ...)`` returns a mapping that contains
            'input_ids' and 'attention_mask'.
        max_length: Length that every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of tensors. The first
        two have one row of `max_length` entries per text; the third is
        ``torch.tensor(labels)``.

    Raises:
        ValueError: If `texts` and `labels` do not have the same length.
    """
    texts = list(texts)  # the tokenizer expects a plain list, not an ndarray/Series
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    # Nothing to tokenize: return correctly shaped empty tensors instead of
    # handing an empty batch to the tokenizer.
    if not texts:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone(), torch.tensor(labels, dtype=torch.long)

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask'], torch.tensor(labels)

# Turn each split into (input ids, attention masks, labels) tensors.
train_inputs, train_masks, train_labels = encode_data(
    texts=X_train, labels=y_train, tokenizer=tokenizer
)
val_inputs, val_masks, val_labels = encode_data(
    texts=X_val, labels=y_val, tokenizer=tokenizer
)

print("Data tokenized and encoded successfully.")

Data tokenized and encoded successfully.


In [32]:
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import RandomSampler
from torch.utils.data import SequentialSampler


# Wrap the encoded tensors in datasets and batch them for training and validation.
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training uses a random sampler; validation uses a sequential one.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)

print("DataLoader created successfully.")

DataLoader created successfully.


In [33]:
from transformers import BertForSequenceClassification

# Seed PyTorch so the newly initialised classifier head is the same on every run
# (42 matches the random_state used for the train/validation split).
torch.manual_seed(42)

# Initialize BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,  # Number of classes (negative, neutral, positive)
    output_attentions=False,
    output_hidden_states=False
)

# Setup device and move the model there before the optimizer is constructed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the optimizer.
# torch.optim.AdamW is used instead of the AdamW class that transformers used to
# export: that class is deprecated and is not available in recent transformers
# releases. weight_decay=0.0 reproduces the default of the transformers class
# (the PyTorch default would be 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

print("Model and optimizer initialized successfully.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and optimizer initialized successfully.


In [37]:
import time
import datetime  # Also required for format_time function

from transformers import get_linear_schedule_with_warmup

# Number of passes over the training data.
epochs = 3

# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps in total.
total_steps = epochs * len(train_dataloader)

# Learning-rate schedule from transformers' linear-with-warmup helper,
# configured with zero warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Accuracy function
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals its label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of true class indices; it is flattened before the
            comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    expected_classes = labels.ravel()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / expected_classes.size

# Function to format time
def format_time(elapsed):
    """Render a duration given in seconds as a ``datetime.timedelta`` string.

    Args:
        elapsed: Duration in seconds; it is rounded to a whole number first.

    Returns:
        The ``str()`` form of the corresponding timedelta, e.g. ``'0:01:15'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

print("Training preparation done successfully.")

Training preparation done successfully.


In [38]:
# Start training: fine-tune for `epochs` passes, validating after each pass.
# NOTE(review): the loss used here is unweighted, while the recorded evaluation
# report shows a heavily skewed label distribution (1817 neutral vs. 48 and 26
# for the other classes) and a 0.00 F1 for 'positive'. compute_class_weight is
# imported but not used in the visible cells — consider class weighting if
# minority-class recall matters.
for epoch_i in range(epochs):
    print(f'Epoch {epoch_i + 1}/{epochs}')
    print('Training...')

    # Start timer
    t0 = time.time()

    # Switch the model to training mode
    model.train()

    # Running sum of the per-batch training losses
    total_train_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Report progress every 40 batches (but not before the first one)
        if step % 40 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print(f' Batch {step} of {len(train_dataloader)}. Elapsed: {elapsed}.')

        # Take the batch and move it to the device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Reset the gradients left over from the previous step
        model.zero_grad()

        # Forward pass; the loss is read from the model output
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss

        # Accumulate the total loss
        total_train_loss += loss.item()

        # Backward pass to calculate the gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters and the learning rate
        optimizer.step()
        scheduler.step()

    # Average loss during training
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print(f' Average training loss: {avg_train_loss:.2f}')
    print(f' Training epoch took: {training_time}')

    # Validation on the validation set
    print('Running Validation...')

    t0 = time.time()
    model.eval()

    # Metrics are accumulated per example rather than per batch, so that a
    # shorter final batch does not get the same weight as a full one.
    total_eval_correct = 0
    total_eval_loss = 0
    total_eval_examples = 0

    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        batch_examples = len(label_ids)

        # Assumes the returned loss is the mean over the batch (the default
        # cross-entropy reduction), so multiplying by the batch size gives a sum.
        total_eval_loss += loss.item() * batch_examples
        total_eval_correct += flat_accuracy(logits, label_ids) * batch_examples
        total_eval_examples += batch_examples

    avg_val_accuracy = total_eval_correct / total_eval_examples
    print(f' Accuracy: {avg_val_accuracy:.2f}')
    avg_val_loss = total_eval_loss / total_eval_examples
    validation_time = format_time(time.time() - t0)
    print(f' Validation Loss: {avg_val_loss:.2f}')
    print(f' Validation took: {validation_time}')

print('Training complete!')

Epoch 1/3
Training...
 Batch 40 of 473. Elapsed: 0:08:20.
 Batch 80 of 473. Elapsed: 0:16:09.
 Batch 120 of 473. Elapsed: 0:24:17.
 Batch 160 of 473. Elapsed: 0:32:12.
 Batch 200 of 473. Elapsed: 0:40:12.
 Batch 240 of 473. Elapsed: 0:48:14.
 Batch 280 of 473. Elapsed: 0:53:53.
 Batch 320 of 473. Elapsed: 0:58:22.
 Batch 360 of 473. Elapsed: 1:02:51.
 Batch 400 of 473. Elapsed: 1:07:20.
 Batch 440 of 473. Elapsed: 1:11:52.
 Average training loss: 0.12
 Training epoch took: 1:15:32
Running Validation...
 Accuracy: 0.96
 Validation Loss: 0.15
 Validation took: 0:03:31
Epoch 2/3
Training...
 Batch 40 of 473. Elapsed: 0:04:31.
 Batch 80 of 473. Elapsed: 0:09:03.
 Batch 120 of 473. Elapsed: 0:13:32.
 Batch 160 of 473. Elapsed: 0:18:03.
 Batch 200 of 473. Elapsed: 0:22:34.
 Batch 240 of 473. Elapsed: 0:27:00.
 Batch 280 of 473. Elapsed: 0:31:27.
 Batch 320 of 473. Elapsed: 0:35:59.
 Batch 360 of 473. Elapsed: 0:40:31.
 Batch 400 of 473. Elapsed: 0:45:03.
 Batch 440 of 473. Elapsed: 0:49:32.


In [39]:
# Saving the model
# NOTE(review): the output recorded under this cell reads './model_save/', which
# does not match the path below — the cell was presumably edited after its last
# run; re-run it to refresh the output.
# NOTE(review): the recorded training log stops during epoch 2 and never prints
# 'Training complete!', so the weights saved here may come from an interrupted run.
output_dir = './model/'

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check beforehand.
os.makedirs(output_dir, exist_ok=True)

# Store the model and the tokenizer side by side in the same directory.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f'Model saved to {output_dir}')

Model saved to ./model_save/


In [40]:
import numpy as np
from sklearn.metrics import classification_report
import torch

# Assuming model, device, and val_dataloader are already defined and initialized

# Evaluate model: collect the predicted and true class index of every validation example
model.eval()
predictions, true_labels = [], []

for batch in val_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    # The labels are only compared on the CPU, so they are not sent to the device.
    label_ids = batch[2].cpu().numpy()

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    logits = outputs.logits.detach().cpu().numpy()

    predictions.extend(np.argmax(logits, axis=1).flatten())
    true_labels.extend(label_ids.flatten())

# Map the numerical predictions and true labels back to sentiment labels.
# NOTE(review): assumes the codes 0/1/2 mean negative/neutral/positive. The data
# preview only confirms that code 1 is labelled 'neutral' — verify 0 and 2.
sentiment_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
predictions_mapped = [sentiment_mapping[pred] for pred in predictions]
true_labels_mapped = [sentiment_mapping[true] for true in true_labels]

# Print classification report.
# zero_division=0 states explicitly that precision/F1 of a class that is never
# predicted is reported as 0, rather than relying on the default, which reports
# the same value but emits a warning for each affected metric.
print(classification_report(
    true_labels_mapped,
    predictions_mapped,
    labels=['negative', 'neutral', 'positive'],
    target_names=['negative', 'neutral', 'positive'],
    zero_division=0,
))

              precision    recall  f1-score   support

    negative       0.38      0.56      0.45        48
     neutral       0.98      0.99      0.99      1817
    positive       0.00      0.00      0.00        26

    accuracy                           0.96      1891
   macro avg       0.45      0.52      0.48      1891
weighted avg       0.96      0.96      0.96      1891



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
