<a href="https://colab.research.google.com/github/Desertfeng/Sentiment_Analysis-Based-on-BERT/blob/main/In_Depth_Exploration_of_Sentiment_Analysis_Based_on_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
# Use Model
# Test Model
# Train Model

import pandas as pd
import torch


In [70]:
# Use Model
# Test Model
# Train Model

!pip install accelerate
!pip install transformers[torch]
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset



In [20]:
# Train Model
from google.colab import drive

# Mount Google Drive inside the Colab filesystem at /content/gdrive.
drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [63]:
# Test Model
# Train Model
class CustomDataset(Dataset):
    """Dataset that pairs per-sample encodings with their labels.

    ``encodings`` is a mapping from feature name to a sequence indexed by
    sample position; ``labels`` is a sequence holding one label per sample.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``'labels'`` entry."""
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [57]:
# Test Model
# Train Model

# A Drive ".../file/d/<id>/view" link serves an HTML preview page rather than
# the CSV bytes, so build a direct-download URL from the file id instead.
# NOTE(review): Drive may still answer with an HTML confirmation page for
# large files - if read_csv fails here, download the file first (e.g. with
# gdown) and point data_path at the local copy. TODO confirm in Colab.
data_file_id = '1GNfku-xK9oPqokbTYoD0uYjzVx0cGxwP'
data_path = f'https://drive.usercontent.google.com/download?id={data_file_id}&export=download&confirm=t'
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv(data_path, encoding='ISO-8859-1', names=column_names)

# Keep only label + text and remap the raw labels (0 -> 0, 4 -> 1).
# The chain builds a new frame instead of assigning into a column subset,
# which avoids SettingWithCopyWarning. Any other raw label maps to NaN; those
# rows are dropped and the column is cast back to int so the labels stay
# integer class ids rather than silently becoming floats.
df = (
    df[['target', 'text']]
    .assign(target=lambda frame: frame['target'].map({0: 0, 4: 1}))
    .dropna(subset=['target'])
    .astype({'target': int})
)
df_sampled = df.sample(frac=0.2, random_state=42)

# Replace the original dataset with this 20% subset.
df = df_sampled.reset_index(drop=True)

In [66]:
# Test Model
# Train Model
# Load the tokenizer published with the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)


In [35]:
# Test Model
# Train Model

def tokenize_function(examples):
    """Tokenize the ``'text'`` column of ``examples`` with the global tokenizer.

    Every text is truncated and padded to exactly 128 tokens.
    """
    texts = examples['text'].tolist()
    return tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding='max_length',
    )


# Tokenize the whole (sampled) frame in one call.
tokenized_datasets = tokenize_function(df)

In [None]:
# Test Model
# Train Model
labels = df['target'].tolist()
num_samples = len(df)

# Positional 90/10 split: the first 90% of rows train, the rest validate.
split_idx = int(0.9 * num_samples)

train_encodings = {name: column[:split_idx] for name, column in tokenized_datasets.items()}
val_encodings = {name: column[split_idx:] for name, column in tokenized_datasets.items()}

train_labels = labels[:split_idx]
val_labels = labels[split_idx:]

# Wrap each half so it can be indexed sample by sample.
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)


In [36]:
# Train Model
# 4. Initialize the model: 'bert-base-uncased' with a two-label classification head.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
    num_labels=2,
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# Train Model
# 5. Configure the training arguments and start training.
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    # Optimisation settings
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save on the same 5000-step cadence; log every 500 steps
    evaluation_strategy="steps",
    eval_steps=5000,
    save_steps=5000,
    logging_steps=500,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Step,Training Loss,Validation Loss
5000,0.3983,0.381106
10000,0.3997,0.362351
15000,0.3751,0.388362
20000,0.3561,0.358346
25000,0.352,0.35522
30000,0.3562,0.345483
35000,0.3534,0.369384
40000,0.3062,0.484612
45000,0.3104,0.379376
50000,0.3179,0.425256


TrainOutput(global_step=72000, training_loss=0.34102473576863607, metrics={'train_runtime': 7273.8754, 'train_samples_per_second': 79.187, 'train_steps_per_second': 9.898, 'total_flos': 3.788799197184e+16, 'train_loss': 0.34102473576863607, 'epoch': 2.0})

In [39]:
# Train Model
# Save under the absolute Drive mount point (/content/gdrive) rather than a
# path relative to the current working directory: the relative form only lands
# on Google Drive while the cwd happens to be /content, and otherwise writes
# to a local folder that is not on Drive.
save_dir = '/content/gdrive/MyDrive/sentiment_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('gdrive/MyDrive/sentiment_model/tokenizer_config.json',
 'gdrive/MyDrive/sentiment_model/special_tokens_map.json',
 'gdrive/MyDrive/sentiment_model/vocab.txt',
 'gdrive/MyDrive/sentiment_model/added_tokens.json')

In [73]:
# Use Model
# Test Model



!pip install gdown




In [75]:
# Use Model
# Test Model



!mkdir -p /content/sentiment_model
!gdown --id "1-uD1pyDGjkDQRWUotfQM58vKQnG9TnMd" -O /content/sentiment_model/pytorch_model.bin
!gdown --id "1-xxz9d1IeDpwhCJuGtkgkNtxFKthiN5U" -O /content/sentiment_model/vocab.txt
!gdown --id "100L1lMCFAcXhi9wzRO2v1UDEk_dwR1Cq" -O /content/sentiment_model/added_tokens.json
!gdown --id "1-goKv1Ar_OGZS_I3aX5rirH4MhbhbCff" -O /content/sentiment_model/config.json
!gdown --id "1055TIepfB61jsnzer11p6ZBZCHcnZEFy" -O /content/sentiment_model/special_tokens_map.json
!gdown --id "10AKjdHpeGlxqI8tcu0o4RAaV30ryjiw5" -O /content/sentiment_model/tokenizer_config.json


Downloading...
From: https://drive.google.com/uc?id=1-uD1pyDGjkDQRWUotfQM58vKQnG9TnMd
To: /content/sentiment_model/pytorch_model.bin
100% 438M/438M [00:02<00:00, 167MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-xxz9d1IeDpwhCJuGtkgkNtxFKthiN5U
To: /content/sentiment_model/vocab.txt
100% 232k/232k [00:00<00:00, 21.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=100L1lMCFAcXhi9wzRO2v1UDEk_dwR1Cq
To: /content/sentiment_model/added_tokens.json
100% 82.0/82.0 [00:00<00:00, 242kB/s]
Downloading...
From: https://drive.google.com/uc?id=1-goKv1Ar_OGZS_I3aX5rirH4MhbhbCff
To: /content/sentiment_model/config.json
100% 727/727 [00:00<00:00, 1.98MB/s]
Downloading...
From: https://drive.google.com/uc?id=1055TIepfB61jsnzer11p6ZBZCHcnZEFy
To: /content/sentiment_model/special_tokens_map.json
100% 125/125 [00:00<00:00, 322kB/s]
Downloading...
From: https://drive.google.com/uc?id=10AKjdHpeGlxqI8tcu0o4RAaV30ryjiw5
To: /content/sentiment_model/tokenizer_config.json
100% 1.43k/1.43k [

In [76]:
# Use Model
# Test Model



from transformers import BertForSequenceClassification, BertTokenizer

# Load the model and its tokenizer from the same local directory.
model_dir = '/content/sentiment_model'
model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(model_dir)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [58]:
# Use Model

def sentiment_analysis(sentence):
    """Classify ``sentence`` with the global ``model`` and ``tokenizer``.

    Args:
        sentence: Text to classify; it is truncated to at most 128 tokens.

    Returns:
        ``"Negative Sentiment"`` when the highest-scoring class index is 0,
        otherwise ``"Positive Sentiment"``.
    """
    # Tokenize the sentence.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Put the inputs on the same device as the model. Without this the forward
    # pass fails with a device mismatch once the model lives on a GPU (for
    # example after it has been handed to a Trainer).
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Make sure dropout is disabled: a model that has just been trained is
    # still in training mode and would give non-deterministic predictions.
    model.eval()

    # Run the model without tracking gradients.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Pick the highest-scoring class.
    pred = torch.argmax(logits, dim=1).item()

    # Map the class index to a sentiment string.
    if pred == 0:
        return "Negative Sentiment"
    return "Positive Sentiment"

# Usage: read one sentence interactively, classify it, and print the result.
# input() blocks until the user submits a line of text.
sentence = input("Please enter a sentence for sentiment analysis: ")
result = sentiment_analysis(sentence)
print(f"The sentiment of the sentence is: {result}")


KeyboardInterrupt: ignored

In [59]:
# Test Model



!pip install transformers[torch]
!pip install sklearn


Collecting sklearn
  Downloading sklearn-0.0.post10.tar.gz (3.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [60]:
# Test Model

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

# Load the model and tokenizer from the local model directory.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='/content/sentiment_model',
)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='/content/sentiment_model',
)


def _placeholder_metrics(eval_pred):
    """Return a constant stand-in metric; the real metrics are computed manually later."""
    return {"dummy_metric": 0}


# Create a Trainer instance used for evaluation.
trainer = Trainer(
    tokenizer=tokenizer,
    model=model,
    compute_metrics=_placeholder_metrics,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [61]:
# Test Model

# Get the predictions for the validation set.
predictions = trainer.predict(val_dataset)
# Predicted class of each sample = index of its largest score.
preds = predictions.predictions.argmax(axis=-1)

# Compute the metrics against the validation labels.
accuracy = accuracy_score(y_true=val_labels, y_pred=preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=val_labels,
    y_pred=preds,
    average='binary',
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

NameError: ignored