# Import Library

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# !pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [16]:
# !pip install accelerate[torch]

Collecting accelerate[torch]
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/309.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia

# Sentiment Analysist Use Pretrained Model BERT Uncased + Fine tuned using row dataset


In [2]:
df = pd.read_excel('lowercase_labels_datasets.xlsx')

In [4]:
df
print(df.columns)

Index(['text', 'sentiment'], dtype='object')


## pre processing:
- drop N/A

In [5]:
df.head()

Unnamed: 0,text,sentiment
0,deserved candidate promoted promptly unbiased ...,1.0
1,got lot learning platform monthly learning pla...,1.0
2,based business unit get experience company pol...,1.0
3,client project good use latest tech work,1.0
4,worked ibm year tc year rd company year comple...,1.0


In [7]:
# Remove rows with missing or invalid text data
df.dropna(subset=['text', 'sentiment'], inplace=True)


In [10]:
len(df)

46416

## preparation for sentiment analysis, consists:
- encode labels
- split data
- tokenize the text column on train and val

In [9]:
# Encode sentiment labels
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])

In [10]:
# Split data into train and test sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'].tolist(), df['sentiment'].tolist(), test_size=0.2)


In [11]:
# Debugging: Print first few entries
print("First 5 train texts:", train_texts[:5])
print("First 5 train labels:", train_labels[:5])
print("First 5 val texts:", val_texts[:5])
print("First 5 val labels:", val_labels[:5])

First 5 train texts: ['as of now nothing from my end because everything going smooth for me.', 'nothing to like here', 'connect with other team to due team work', 'everything', 'just the compensation part']
First 5 train labels: [0, 1, 1, 1, 0]
First 5 val texts: ['bad behavior in bench time, work life balance is total on client wish for good or bad environment, appraisal is poor and in some project senior are doing micro management.', 'work atmosphere', 'now a days the way of handling projects become too difficult.', 'the best environment to work and very understanding manager . work flexibility is there and many more.', 'best work culture']
First 5 val labels: [0, 1, 0, 1, 1]


In [12]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)  # 2 classes: negative and positive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Tokenize data
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=128)
val_encodings = tokenizer(val_texts, padding=True, truncation=True, max_length=128)


In [14]:
# Create torch dataset
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [15]:
!pip show accelerate

Name: accelerate
Version: 0.31.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


## fine tuned, results, and save.

The fine tuned consists:
- epoch: 3
-  bath size: 16

In [16]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [17]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [18]:
# Train model
trainer.train()

Step,Training Loss
10,0.7225
20,0.7363
30,0.7181
40,0.7038
50,0.7147
60,0.6895
70,0.6879
80,0.6668
90,0.6681
100,0.6518


TrainOutput(global_step=7032, training_loss=0.5734468095080845, metrics={'train_runtime': 2592.8517, 'train_samples_per_second': 43.38, 'train_steps_per_second': 2.712, 'total_flos': 7398617098959360.0, 'train_loss': 0.5734468095080845, 'epoch': 3.0})

In [19]:
# Evaluate model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.6024132966995239, 'eval_runtime': 66.7705, 'eval_samples_per_second': 140.391, 'eval_steps_per_second': 8.776, 'epoch': 3.0}


In [35]:
#import matplotlib.pyplot as plt

# Get the training logs
# train_logs = trainer.callback_handler.log_history

# Extract the training loss from the logs
# train_loss = [log["loss"] for log in train_logs if "loss" in log]

# Plot the training loss
# plt.plot(train_loss, label="Training Loss")
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.title("Training Loss Over Epochs")
# plt.legend()
# plt.show()

AttributeError: 'CallbackHandler' object has no attribute 'log_history'

In [20]:
# Save model pretrained model for hugging face
model.save_pretrained("D:\Project\Final Project Indonesia AI/01_09062024_bert_sentiment_model", from_pt=True)

In [None]:
model.save_model('D:\Project\Final Project Indonesia AI')

In [None]:
!pip install

## testing the fine tuned models!

In [5]:
# Load tokenizer and fine-tuned model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/AI FOR INDONESIA/Final Project/01_09062024_bert_sentiment_model") # the fine tuned model save

# Function to tokenize text and obtain BERT embeddings
def tokenize_and_predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits
    return predictions

# Function to compute percentage
def compute_percentage(predictions):
    softmax_output = torch.softmax(predictions, dim=1)
    positive_percentage = softmax_output[:, 1].item() * 100
    negative_percentage = 100 - positive_percentage
    return positive_percentage, negative_percentage

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [23]:
# User input
user_text = input("")

# Tokenize user input and get predictions
user_predictions = tokenize_and_predict(user_text)
positive_percentage, negative_percentage = compute_percentage(user_predictions)

# Determine sentiment
sentiment = "Positive" if user_predictions.argmax() == 1 else "Negative"

print("Sentiment:", sentiment)
print("Positive sentiment percentage:", positive_percentage)
print("Negative sentiment percentage:", negative_percentage)

🤯jobdesk
Sentiment: Negative
Positive sentiment percentage: 46.032196283340454
Negative sentiment percentage: 53.967803716659546


# Sentimen Analysist Use pretrained Model

In [45]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm



In [46]:
# Load dataset
data = pd.read_excel('/content/drive/MyDrive/AI FOR INDONESIA/Final Project/lowercase_text_only_datasets.xlsx')



In [47]:
data.head()

Unnamed: 0,text
0,deserved candidates are promoted promptly.\nun...
1,you got lot of learning platform and monthly l...
2,based on which business unit you are you will ...
3,some client projects are good as they use the ...
4,"i have worked in ibm (4 years) ,tcs (1 year) ,..."


In [48]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")



In [49]:
# Function to tokenize text and obtain BERT embeddings
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # Average pooling over token embeddings



In [None]:
# Apply BERT analysis to each text in the dataset
output_embeddings = []
for text in tqdm(data['text'].astype(str)):
    embeddings = get_bert_embeddings(text)
    output_embeddings.append(embeddings)



 92%|█████████▏| 49949/54026 [1:34:57<06:43, 10.10it/s]

In [1]:
# Add the BERT analysis results to the dataset as a new column
data['output_Bert'] = output_embeddings

# Save the updated dataset
data.to_csv('output_dataset.csv', index=False)

NameError: name 'output_embeddings' is not defined

# Sentiment Analysist Use Pretrained Model BERT Uncased + Fine tuned from cleaned datasets  


In [3]:
df = pd.read_excel('preprocessing_capegini.xlsx')

In [4]:
df

Unnamed: 0,text,sentiment
0,deserved candidate promoted promptly unbiased ...,1.0
1,got lot learning platform monthly learning pla...,1.0
2,based business unit get experience company pol...,1.0
3,client project good use latest tech work,1.0
4,worked ibm year tc year rd company year comple...,1.0
...,...,...
54022,amazon adopted giving responsibility workplace...,0.0
54023,terrible experience working amazon pay leaders...,0.0
54024,logistics supply management application operat...,0.0
54025,toxic culture depend team hierarchical questio...,0.0


In [5]:
# Remove rows with missing or invalid text data
df.dropna(subset=['text', 'sentiment'], inplace=True)


In [6]:
df

Unnamed: 0,text,sentiment
0,deserved candidate promoted promptly unbiased ...,1.0
1,got lot learning platform monthly learning pla...,1.0
2,based business unit get experience company pol...,1.0
3,client project good use latest tech work,1.0
4,worked ibm year tc year rd company year comple...,1.0
...,...,...
54021,le visibility next level promotion point time ...,0.0
54022,amazon adopted giving responsibility workplace...,0.0
54023,terrible experience working amazon pay leaders...,0.0
54024,logistics supply management application operat...,0.0


## preparation for sentiment analysis, consists:
- encode labels
- split data
- tokenize the text column on train and val

In [7]:
# Encode sentiment labels
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])

In [8]:
# Split data into train and test sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'].tolist(), df['sentiment'].tolist(), test_size=0.2)


In [19]:
# GPU check available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [17]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)  # 2 classes: negative and positive
model.to(device) # using gpu cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# Tokenize data
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=128)
val_encodings = tokenizer(val_texts, padding=True, truncation=True, max_length=128)


In [11]:
# Create torch dataset
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

## fine tuned, results, and save.

The fine tuned consists:
- epoch: 3
-  bath size: 32

In [30]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=3e-5
)

In [31]:
class CustomTrainer(Trainer):
    def _prepare_inputs(self, inputs):
        # Move inputs to the right device
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}
        return inputs

In [32]:
# Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [33]:
# Train model
trainer.train()

  2%|▏         | 79/3483 [18:31<13:17:53, 14.06s/it]
                                                   
  2%|▏         | 76/3483 [03:09<1:20:28,  1.42s/it]

{'loss': 0.6039, 'grad_norm': 7.995917797088623, 'learning_rate': 3e-06, 'epoch': 0.04}


                                                   
  2%|▏         | 76/3483 [04:08<1:20:28,  1.42s/it] 

{'loss': 0.585, 'grad_norm': 6.20795202255249, 'learning_rate': 6e-06, 'epoch': 0.09}


                                                   
  2%|▏         | 76/3483 [05:07<1:20:28,  1.42s/it] 

{'loss': 0.6572, 'grad_norm': 5.559715747833252, 'learning_rate': 9e-06, 'epoch': 0.13}


                                                   
  2%|▏         | 76/3483 [06:06<1:20:28,  1.42s/it] 

{'loss': 0.6493, 'grad_norm': 3.923051357269287, 'learning_rate': 1.2e-05, 'epoch': 0.17}


                                                   
  2%|▏         | 76/3483 [07:05<1:20:28,  1.42s/it] 

{'loss': 0.6268, 'grad_norm': 4.417115688323975, 'learning_rate': 1.5e-05, 'epoch': 0.22}


                                                   
  2%|▏         | 76/3483 [08:03<1:20:28,  1.42s/it] 

{'loss': 0.6404, 'grad_norm': 4.569465637207031, 'learning_rate': 1.8e-05, 'epoch': 0.26}


                                                   
  2%|▏         | 76/3483 [09:00<1:20:28,  1.42s/it]

{'loss': 0.6208, 'grad_norm': 3.736809253692627, 'learning_rate': 2.1e-05, 'epoch': 0.3}


                                                   
  2%|▏         | 76/3483 [09:57<1:20:28,  1.42s/it]

{'loss': 0.6309, 'grad_norm': 2.6925530433654785, 'learning_rate': 2.4e-05, 'epoch': 0.34}


                                                   
  2%|▏         | 76/3483 [10:54<1:20:28,  1.42s/it]

{'loss': 0.6228, 'grad_norm': 3.730441093444824, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.39}


                                                   
  2%|▏         | 76/3483 [11:53<1:20:28,  1.42s/it]

{'loss': 0.5955, 'grad_norm': 2.6296682357788086, 'learning_rate': 3e-05, 'epoch': 0.43}


                                                   
  2%|▏         | 76/3483 [12:53<1:20:28,  1.42s/it]

{'loss': 0.6239, 'grad_norm': 3.2641515731811523, 'learning_rate': 2.949715051961113e-05, 'epoch': 0.47}


                                                   
  2%|▏         | 76/3483 [13:51<1:20:28,  1.42s/it]

{'loss': 0.6038, 'grad_norm': 2.2596938610076904, 'learning_rate': 2.899430103922226e-05, 'epoch': 0.52}


                                                   
  2%|▏         | 76/3483 [14:50<1:20:28,  1.42s/it]

{'loss': 0.6078, 'grad_norm': 1.7881311178207397, 'learning_rate': 2.8491451558833388e-05, 'epoch': 0.56}


                                                   
  2%|▏         | 76/3483 [15:43<1:20:28,  1.42s/it]

{'loss': 0.6246, 'grad_norm': 2.4745266437530518, 'learning_rate': 2.7988602078444518e-05, 'epoch': 0.6}


                                                   
  2%|▏         | 76/3483 [16:34<1:20:28,  1.42s/it]

{'loss': 0.6082, 'grad_norm': 2.407715082168579, 'learning_rate': 2.748575259805565e-05, 'epoch': 0.65}


                                                   
  2%|▏         | 76/3483 [17:26<1:20:28,  1.42s/it]

{'loss': 0.6104, 'grad_norm': 3.2056241035461426, 'learning_rate': 2.698290311766678e-05, 'epoch': 0.69}


                                                   
  2%|▏         | 76/3483 [18:20<1:20:28,  1.42s/it]

{'loss': 0.5991, 'grad_norm': 1.8405077457427979, 'learning_rate': 2.648005363727791e-05, 'epoch': 0.73}


                                                   
  2%|▏         | 76/3483 [19:14<1:20:28,  1.42s/it]

{'loss': 0.6018, 'grad_norm': 2.2860007286071777, 'learning_rate': 2.597720415688904e-05, 'epoch': 0.78}


                                                   
  2%|▏         | 76/3483 [20:08<1:20:28,  1.42s/it]

{'loss': 0.618, 'grad_norm': 1.8008999824523926, 'learning_rate': 2.547435467650017e-05, 'epoch': 0.82}


                                                   
  2%|▏         | 76/3483 [21:02<1:20:28,  1.42s/it]

{'loss': 0.6188, 'grad_norm': 1.8563376665115356, 'learning_rate': 2.49715051961113e-05, 'epoch': 0.86}


                                                   
  2%|▏         | 76/3483 [22:00<1:20:28,  1.42s/it]

{'loss': 0.5952, 'grad_norm': 1.6813839673995972, 'learning_rate': 2.4468655715722426e-05, 'epoch': 0.9}


                                                   
  2%|▏         | 76/3483 [22:55<1:20:28,  1.42s/it]

{'loss': 0.5986, 'grad_norm': 1.7797671556472778, 'learning_rate': 2.3965806235333556e-05, 'epoch': 0.95}


                                                   
  2%|▏         | 76/3483 [23:49<1:20:28,  1.42s/it]

{'loss': 0.6087, 'grad_norm': 2.3820126056671143, 'learning_rate': 2.346295675494469e-05, 'epoch': 0.99}


                                                   
  2%|▏         | 76/3483 [24:44<1:20:28,  1.42s/it]

{'loss': 0.5741, 'grad_norm': 2.40838623046875, 'learning_rate': 2.296010727455582e-05, 'epoch': 1.03}


                                                   
  2%|▏         | 76/3483 [25:40<1:20:28,  1.42s/it]

{'loss': 0.5898, 'grad_norm': 2.49617862701416, 'learning_rate': 2.2457257794166946e-05, 'epoch': 1.08}


                                                   
  2%|▏         | 76/3483 [26:35<1:20:28,  1.42s/it]

{'loss': 0.594, 'grad_norm': 3.326719045639038, 'learning_rate': 2.1954408313778076e-05, 'epoch': 1.12}


                                                   
  2%|▏         | 76/3483 [27:31<1:20:28,  1.42s/it]

{'loss': 0.5801, 'grad_norm': 2.1797850131988525, 'learning_rate': 2.1451558833389206e-05, 'epoch': 1.16}


                                                   
  2%|▏         | 76/3483 [28:26<1:20:28,  1.42s/it]

{'loss': 0.5839, 'grad_norm': 2.2415149211883545, 'learning_rate': 2.0948709353000336e-05, 'epoch': 1.21}


                                                   
  2%|▏         | 76/3483 [29:22<1:20:28,  1.42s/it]

{'loss': 0.5861, 'grad_norm': 1.9618936777114868, 'learning_rate': 2.0445859872611463e-05, 'epoch': 1.25}


                                                   
  2%|▏         | 76/3483 [30:18<1:20:28,  1.42s/it]

{'loss': 0.583, 'grad_norm': 1.9430819749832153, 'learning_rate': 1.9943010392222593e-05, 'epoch': 1.29}


                                                   
  2%|▏         | 76/3483 [31:17<1:20:28,  1.42s/it]

{'loss': 0.5968, 'grad_norm': 5.654139995574951, 'learning_rate': 1.9440160911833727e-05, 'epoch': 1.34}


                                                   
  2%|▏         | 76/3483 [32:13<1:20:28,  1.42s/it]

{'loss': 0.6028, 'grad_norm': 1.373599648475647, 'learning_rate': 1.8937311431444857e-05, 'epoch': 1.38}


                                                   
  2%|▏         | 76/3483 [33:09<1:20:28,  1.42s/it]

{'loss': 0.6044, 'grad_norm': 2.2413198947906494, 'learning_rate': 1.8434461951055984e-05, 'epoch': 1.42}


                                                   
  2%|▏         | 76/3483 [34:05<1:20:28,  1.42s/it]

{'loss': 0.6088, 'grad_norm': 3.0875980854034424, 'learning_rate': 1.7931612470667114e-05, 'epoch': 1.46}


                                                   
  2%|▏         | 76/3483 [35:01<1:20:28,  1.42s/it]

{'loss': 0.5715, 'grad_norm': 2.6904196739196777, 'learning_rate': 1.7428762990278244e-05, 'epoch': 1.51}


                                                   
  2%|▏         | 76/3483 [35:57<1:20:28,  1.42s/it]

{'loss': 0.5668, 'grad_norm': 1.7949897050857544, 'learning_rate': 1.6925913509889374e-05, 'epoch': 1.55}


                                                   
  2%|▏         | 76/3483 [36:53<1:20:28,  1.42s/it]

{'loss': 0.5719, 'grad_norm': 3.053746461868286, 'learning_rate': 1.64230640295005e-05, 'epoch': 1.59}


                                                   
  2%|▏         | 76/3483 [37:49<1:20:28,  1.42s/it]

{'loss': 0.5912, 'grad_norm': 2.0147759914398193, 'learning_rate': 1.592021454911163e-05, 'epoch': 1.64}


                                                   
  2%|▏         | 76/3483 [38:45<1:20:28,  1.42s/it]

{'loss': 0.5607, 'grad_norm': 1.8114467859268188, 'learning_rate': 1.5417365068722765e-05, 'epoch': 1.68}


                                                   
  2%|▏         | 76/3483 [39:41<1:20:28,  1.42s/it]

{'loss': 0.5964, 'grad_norm': 2.3230390548706055, 'learning_rate': 1.4914515588333891e-05, 'epoch': 1.72}


                                                   
  2%|▏         | 76/3483 [40:40<1:20:28,  1.42s/it]

{'loss': 0.5676, 'grad_norm': 2.3647232055664062, 'learning_rate': 1.4411666107945023e-05, 'epoch': 1.77}


                                                   
  2%|▏         | 76/3483 [41:36<1:20:28,  1.42s/it]

{'loss': 0.592, 'grad_norm': 1.8679039478302002, 'learning_rate': 1.3908816627556152e-05, 'epoch': 1.81}


                                                   
  2%|▏         | 76/3483 [42:33<1:20:28,  1.42s/it]

{'loss': 0.5901, 'grad_norm': 1.468088150024414, 'learning_rate': 1.3405967147167282e-05, 'epoch': 1.85}


                                                   
  2%|▏         | 76/3483 [43:29<1:20:28,  1.42s/it]

{'loss': 0.5846, 'grad_norm': 3.028562068939209, 'learning_rate': 1.290311766677841e-05, 'epoch': 1.89}


                                                   
  2%|▏         | 76/3483 [44:25<1:20:28,  1.42s/it]

{'loss': 0.569, 'grad_norm': 3.046839475631714, 'learning_rate': 1.240026818638954e-05, 'epoch': 1.94}


                                                   
  2%|▏         | 76/3483 [45:21<1:20:28,  1.42s/it]

{'loss': 0.5899, 'grad_norm': 1.7646021842956543, 'learning_rate': 1.189741870600067e-05, 'epoch': 1.98}


                                                   
  2%|▏         | 76/3483 [46:17<1:20:28,  1.42s/it]

{'loss': 0.5912, 'grad_norm': 3.191164493560791, 'learning_rate': 1.13945692256118e-05, 'epoch': 2.02}


                                                   
  2%|▏         | 76/3483 [47:13<1:20:28,  1.42s/it]

{'loss': 0.5531, 'grad_norm': 3.127356767654419, 'learning_rate': 1.0891719745222929e-05, 'epoch': 2.07}


                                                   
  2%|▏         | 76/3483 [48:09<1:20:28,  1.42s/it]

{'loss': 0.5432, 'grad_norm': 1.7282863855361938, 'learning_rate': 1.038887026483406e-05, 'epoch': 2.11}


                                                   
  2%|▏         | 76/3483 [49:04<1:20:28,  1.42s/it]

{'loss': 0.5408, 'grad_norm': 2.9119255542755127, 'learning_rate': 9.88602078444519e-06, 'epoch': 2.15}


                                                   
  2%|▏         | 76/3483 [50:03<1:20:28,  1.42s/it]

{'loss': 0.5606, 'grad_norm': 3.539696216583252, 'learning_rate': 9.38317130405632e-06, 'epoch': 2.2}


                                                   
  2%|▏         | 76/3483 [50:59<1:20:28,  1.42s/it]

{'loss': 0.5435, 'grad_norm': 2.8829903602600098, 'learning_rate': 8.880321823667448e-06, 'epoch': 2.24}


                                                   
  2%|▏         | 76/3483 [51:55<1:20:28,  1.42s/it]

{'loss': 0.5359, 'grad_norm': 2.409550666809082, 'learning_rate': 8.377472343278578e-06, 'epoch': 2.28}


                                                   
  2%|▏         | 76/3483 [52:50<1:20:28,  1.42s/it]

{'loss': 0.5526, 'grad_norm': 1.9949469566345215, 'learning_rate': 7.87462286288971e-06, 'epoch': 2.33}


                                                   
  2%|▏         | 76/3483 [53:46<1:20:28,  1.42s/it]

{'loss': 0.5836, 'grad_norm': 2.5176985263824463, 'learning_rate': 7.371773382500838e-06, 'epoch': 2.37}


                                                   
  2%|▏         | 76/3483 [54:42<1:20:28,  1.42s/it]

{'loss': 0.5359, 'grad_norm': 2.811687469482422, 'learning_rate': 6.868923902111968e-06, 'epoch': 2.41}


                                                   
  2%|▏         | 76/3483 [55:37<1:20:28,  1.42s/it]

{'loss': 0.5562, 'grad_norm': 1.7371596097946167, 'learning_rate': 6.366074421723098e-06, 'epoch': 2.45}


                                                   
  2%|▏         | 76/3483 [56:33<1:20:28,  1.42s/it]

{'loss': 0.5606, 'grad_norm': 2.4067986011505127, 'learning_rate': 5.863224941334227e-06, 'epoch': 2.5}


                                                   
  2%|▏         | 76/3483 [57:29<1:20:28,  1.42s/it]

{'loss': 0.5518, 'grad_norm': 2.2243287563323975, 'learning_rate': 5.360375460945357e-06, 'epoch': 2.54}


                                                   
  2%|▏         | 76/3483 [58:25<1:20:28,  1.42s/it]

{'loss': 0.564, 'grad_norm': 2.4572155475616455, 'learning_rate': 4.8575259805564865e-06, 'epoch': 2.58}


                                                   
  2%|▏         | 76/3483 [59:23<1:20:28,  1.42s/it]

{'loss': 0.5407, 'grad_norm': 2.7758336067199707, 'learning_rate': 4.354676500167617e-06, 'epoch': 2.63}


                                                   
  2%|▏         | 76/3483 [1:00:18<1:20:28,  1.42s/it]

{'loss': 0.5572, 'grad_norm': 3.165695905685425, 'learning_rate': 3.851827019778746e-06, 'epoch': 2.67}


                                                     
  2%|▏         | 76/3483 [1:01:14<1:20:28,  1.42s/it]

{'loss': 0.5291, 'grad_norm': 2.412881851196289, 'learning_rate': 3.348977539389876e-06, 'epoch': 2.71}


                                                     
  2%|▏         | 76/3483 [1:02:09<1:20:28,  1.42s/it]

{'loss': 0.561, 'grad_norm': 1.4612728357315063, 'learning_rate': 2.846128059001006e-06, 'epoch': 2.76}


                                                     
  2%|▏         | 76/3483 [1:03:05<1:20:28,  1.42s/it]

{'loss': 0.5211, 'grad_norm': 2.5966286659240723, 'learning_rate': 2.3432785786121355e-06, 'epoch': 2.8}


                                                     
  2%|▏         | 76/3483 [1:04:00<1:20:28,  1.42s/it]

{'loss': 0.5485, 'grad_norm': 1.788744330406189, 'learning_rate': 1.840429098223265e-06, 'epoch': 2.84}


                                                     
  2%|▏         | 76/3483 [1:04:56<1:20:28,  1.42s/it]

{'loss': 0.5436, 'grad_norm': 2.129650354385376, 'learning_rate': 1.3375796178343948e-06, 'epoch': 2.89}


                                                     
  2%|▏         | 76/3483 [1:05:51<1:20:28,  1.42s/it]

{'loss': 0.5619, 'grad_norm': 2.93367600440979, 'learning_rate': 8.347301374455247e-07, 'epoch': 2.93}


                                                     
  2%|▏         | 76/3483 [1:06:47<1:20:28,  1.42s/it]

{'loss': 0.5495, 'grad_norm': 2.4409220218658447, 'learning_rate': 3.3188065705665436e-07, 'epoch': 2.97}


                                                     
100%|██████████| 3483/3483 [1:05:15<00:00,  1.12s/it]

{'train_runtime': 3915.6233, 'train_samples_per_second': 28.449, 'train_steps_per_second': 0.89, 'train_loss': 0.5836303731746931, 'epoch': 3.0}





TrainOutput(global_step=3483, training_loss=0.5836303731746931, metrics={'train_runtime': 3915.6233, 'train_samples_per_second': 28.449, 'train_steps_per_second': 0.89, 'total_flos': 7327379780720640.0, 'train_loss': 0.5836303731746931, 'epoch': 3.0})

In [34]:
# Evaluate model
results = trainer.evaluate()
print(results)

100%|██████████| 291/291 [01:25<00:00,  3.39it/s]

{'eval_loss': 0.619106650352478, 'eval_runtime': 86.0218, 'eval_samples_per_second': 107.926, 'eval_steps_per_second': 3.383, 'epoch': 3.0}





In [35]:
# Save model pretrained model for hugging face
output_dir="D:\Project\Final Project Indonesia AI/02_11062024_bert_sentiment_model"
model.save_pretrained(output_dir, from_pt=True)
tokenizer.save_pretrained(output_dir)

('D:\\Project\\Final Project Indonesia AI/02_11062024_bert_sentiment_model\\tokenizer_config.json',
 'D:\\Project\\Final Project Indonesia AI/02_11062024_bert_sentiment_model\\special_tokens_map.json',
 'D:\\Project\\Final Project Indonesia AI/02_11062024_bert_sentiment_model\\vocab.txt',
 'D:\\Project\\Final Project Indonesia AI/02_11062024_bert_sentiment_model\\added_tokens.json')

In [41]:
# import matplotlib.pyplot as plt

# # Assuming `trainer` is your Trainer object with training history
# train_loss = trainer.state.log_history['loss']
# eval_loss = trainer.state.log_history['eval_loss']

# # Plot loss over epochs
# plt.plot(range(len(train_loss)), train_loss, label='Train Loss')
# plt.plot(range(len(eval_loss)), eval_loss, label='Eval Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Training and Evaluation Loss')
# plt.legend()
# plt.show()

TypeError: list indices must be integers or slices, not str

## testing the fine tuned models with cleaned dataset

In [3]:
# Load tokenizer and fine-tuned model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("02_11062024_bert_sentiment_model") # the fine tuned model save

# Function to tokenize text and obtain BERT embeddings
def tokenize_and_predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits
    return predictions

# Function to compute percentage
def compute_percentage(predictions):
    softmax_output = torch.softmax(predictions, dim=1)
    positive_percentage = softmax_output[:, 1].item() * 100
    negative_percentage = 100 - positive_percentage
    return positive_percentage, negative_percentage

In [4]:
# User input
user_text = ("🤯jobdesk")

# Tokenize user input and get predictions
user_predictions = tokenize_and_predict(user_text)
positive_percentage, negative_percentage = compute_percentage(user_predictions)

# Determine sentiment
sentiment = "Positive" if user_predictions.argmax() == 1 else "Negative"

print("Sentiment:", sentiment)
print("Positive sentiment percentage:", positive_percentage)
print("Negative sentiment percentage:", negative_percentage)

Sentiment: Negative
Positive sentiment percentage: 40.34416675567627
Negative sentiment percentage: 59.65583324432373
