In [None]:
!pip install torch
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd



In [None]:
# Print a line of dashes as a visual separator in the cell output.
print('-------------')

-------------


In [None]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = 'bert-large-uncased'

# Load the tokenizer first, then the model, both from the same checkpoint.
# The classifier head is newly initialised on load, so the model has to be
# fine-tuned before its predictions mean anything.
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (BertTokenizer, BertForSequenceClassification)
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load only the review text and the yes/no decision column from the CSV.
# ('decison' is the column name exactly as it is spelled in the data file.)
dataset_file_name = './output.csv'
df = pd.read_csv(dataset_file_name, usecols=['review', 'decison'])

# Binary target column: 'no' -> 0, every other value -> 1.
df['lebel'] = (df['decison'] != 'no').astype('int64')

# Only the last expression of a cell is displayed, so a bare `df.shape` here
# would be evaluated and thrown away; print it explicitly instead.
print(df.shape)
df.head(5)

Unnamed: 0,review,decison,lebel
0,"Worst airline experience ever. Always delayed,...",no,0
1,One of the worst experience for our upcoming f...,no,0
2,Turkish Airlines has changed since the first t...,no,0
3,When we flew back on July 30th Chisinau to Rig...,no,0
4,I had the pleasure to meet Pablo at Miami airp...,yes,1


# Data preparation and fine-tuning

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [None]:

# Custom dataset that pairs each review with its label
class SentimentDataset(Dataset):
    """Map-style dataset serving ``(review, label)`` pairs.

    ``reviews`` and ``labels`` are stored as given and read positionally via
    ``.iloc``, so both must support pandas-style positional indexing.
    """

    def __init__(self, reviews, labels):
        """Keep references to the review texts and their labels."""
        self.reviews, self.labels = reviews, labels

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the ``idx``-th review as ``str`` together with its label as ``int``."""
        text = self.reviews.iloc[idx]
        target = self.labels.iloc[idx]
        return str(text), int(target)

In [None]:

# Wrap the train split and the test split (in that order) in SentimentDataset objects
train_dataset, test_dataset = (
    SentimentDataset(split['review'], split['lebel'])
    for split in (train_df, test_df)
)

In [None]:
# Tokenize and format the input data
def tokenize_data(data, max_length=128):
    """Tokenize every review in ``data`` into fixed-length tensors.

    Encoding is done with the module-level ``tokenizer``.

    Args:
        data: Iterable yielding ``(text, label)`` pairs, e.g. a
            ``SentimentDataset``.
        max_length: Number of tokens every sequence is truncated/padded to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two tensors
        have one row per item and ``max_length`` columns; ``labels`` is a 1-D
        int64 tensor with one entry per item.
    """
    input_ids = []
    attention_masks = []
    labels = []

    for text, label in tqdm(data):
        # Call the tokenizer directly; `encode_plus` is deprecated in favour of `__call__`.
        encoded_text = tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])
        # Take the label from the pair that was just iterated instead of reaching
        # into `data.labels`, so any iterable of (text, label) pairs is accepted.
        labels.append(int(label))

    if not input_ids:
        # torch.cat raises on an empty list; return correctly shaped empty tensors.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone(), torch.empty(0, dtype=torch.long)

    return (
        torch.cat(input_ids, dim=0),
        torch.cat(attention_masks, dim=0),
        torch.tensor(labels, dtype=torch.long),
    )


In [None]:
# Encode both splits into (input ids, attention masks, labels) tensors
(train_input_ids,
 train_attention_masks,
 train_labels) = tokenize_data(train_dataset)
(test_input_ids,
 test_attention_masks,
 test_labels) = tokenize_data(test_dataset)


100%|██████████| 1644/1644 [00:05<00:00, 276.41it/s]
100%|██████████| 705/705 [00:07<00:00, 88.23it/s]


In [None]:
# Create DataLoader for training and testing datasets
# Fine-tuning bert-large with 64 sequences of 128 tokens per step ran out of GPU
# memory in this notebook, so a much smaller per-step batch is used.
batch_size = 8

# Training batches are reshuffled every epoch.
train_data = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Evaluation keeps the original row order.
test_data = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
# AdamW over every model parameter, with a learning rate of 2e-5
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=2e-5,
)

In [None]:
# Training setup: number of epochs and the device the model runs on
num_epochs = 3

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [None]:
# Fine-tune for `num_epochs` passes over the training data, reporting the mean
# batch loss after each pass.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    # A progress bar replaces the leftover per-batch debug prints.
    progress = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}')
    for batch in progress:
        # Each batch holds (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    average_loss = total_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {average_loss}')


---
==


OutOfMemoryError: ignored

In [None]:

# Measure classification accuracy on the test set
model.eval()
correct_predictions = 0

# Gradients are not needed for evaluation.
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        predictions = outputs.logits.argmax(dim=1)
        correct_predictions += (predictions == labels).sum().item()

accuracy = correct_predictions / len(test_loader.dataset)
print(f'Test Accuracy: {accuracy}')

NameError: ignored

In [None]:
def predict_sentiment(model, tokenizer, review, max_length=128):
    """Predict the class index for a single review.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        review: Text to classify.
        max_length: Number of tokens the review is truncated/padded to.

    Returns:
        int: Index of the class with the highest logit.
    """
    # Run on whatever device the model lives on rather than depending on a
    # notebook-level `device` variable; fall back to CPU for parameter-less models.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    with torch.no_grad():
        # Tokenize and format the input review
        # (direct call; `encode_plus` is deprecated in favour of `__call__`).
        inputs = tokenizer(
            review,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # Move inputs to the model's device
        input_ids = inputs['input_ids'].to(model_device)
        attention_mask = inputs['attention_mask'].to(model_device)

        # Make predictions
        logits = model(input_ids, attention_mask=attention_mask).logits

        # Get predicted sentiment (class with the highest logit value)
        predicted_class = torch.argmax(logits, dim=1).item()

    return predicted_class

In [None]:

# Sample extra review
extra_review = "The food is very bad"

# Classify the review with the model
predicted_sentiment = predict_sentiment(model, tokenizer, extra_review)

# Translate the class index into a message: 0 is negative, 1 is positive,
# and any other value falls back to neutral.
print(
    {0: "Negative sentiment", 1: "Positive sentiment"}.get(
        predicted_sentiment, "Neutral sentiment"
    )
)

Negative sentiment
