<a href="https://colab.research.google.com/github/Desertfeng/Sentiment_Analysis-Based-on-BERT/blob/main/In_Depth_Exploration_of_Sentiment_Analysis_Based_on_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import pandas as pd
import torch


In [None]:
!pip install accelerate
!pip install transformers[torch]
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [31]:
# Mount Google Drive at /content/gdrive; the dataset path used later in this
# notebook lives under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [32]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection
    (one entry per sample); ``labels`` is an indexable collection of the
    same length. Each item is a dict of tensors: one per encoding field,
    plus a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of freshly built tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Location of the sentiment140 CSV on the mounted Google Drive.
data_path = '/content/gdrive/MyDrive/MLA2/sentiment140.csv'

In [None]:
data_path = '/content/gdrive/MyDrive/MLA2/sentiment140.csv' # If Google Drive is not accessible, use a local path instead

In [33]:
data_path = '/content/gdrive/MyDrive/MLA2/sentiment140.csv'
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
# Column names are supplied explicitly, so the first CSV row is read as data.
df = pd.read_csv(data_path, encoding='ISO-8859-1', names=column_names)

# Keep only the label and the text. The explicit .copy() makes the frame
# independent, so the column assignment below cannot trigger a
# SettingWithCopyWarning.
df = df[['target', 'text']].copy()

# Remap the raw labels 0 / 4 to class ids 0 / 1. Series.map() turns any value
# missing from the mapping into NaN (and the column into floats), so fail fast
# on unexpected labels instead of silently corrupting the training targets.
label_map = {0: 0, 4: 1}
unexpected_labels = set(df['target'].unique()) - set(label_map)
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'target' column: {sorted(unexpected_labels, key=str)}"
    )
df['target'] = df['target'].map(label_map)

# Work on a reproducible 20% random sample of the data.
df_sampled = df.sample(frac=0.2, random_state=42)

# Replace the original dataset with this subset (with a fresh 0..n-1 index).
df = df_sampled.reset_index(drop=True)

In [34]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint
# (the same checkpoint name is used when the model is initialized).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [35]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` column of ``examples`` with the module-level tokenizer.

    Args:
        examples: Object whose ``'text'`` entry supports ``.tolist()``
            (a DataFrame is passed in this notebook).

    Returns:
        The tokenizer output for all texts, requested with
        ``padding='max_length'``, ``truncation=True`` and ``max_length=128``.
    """
    texts = examples['text'].tolist()
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )


# Tokenize the whole (sampled) frame in a single call.
tokenized_datasets = tokenize_function(df)

In [36]:
# Positional 90/10 split: the first 90% of rows are used for training and the
# remaining rows for validation.
labels = df['target'].tolist()
num_samples = len(df)
split_idx = int(0.9 * num_samples)

train_encodings, val_encodings = {}, {}
for key, value in tokenized_datasets.items():
    train_encodings[key] = value[:split_idx]
    val_encodings[key] = value[split_idx:]

train_labels, val_labels = labels[:split_idx], labels[split_idx:]

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

In [37]:
# 4. Initialize the model: pretrained 'bert-base-uncased' with a
# sequence-classification head configured for two labels.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# 5. Configure the training arguments and start training
# NOTE(review): the transformers install in this notebook is unpinned; newer
# releases reportedly rename `evaluation_strategy` to `eval_strategy` — confirm
# against the installed version.
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    evaluation_strategy="steps",  # evaluate on a step schedule (see eval_steps)
    save_steps=5000,  # same interval as eval_steps
    eval_steps=5000,
    logging_steps=500,
    learning_rate=2e-5,
    output_dir="./results",
    do_train=True,
    do_eval=True,
)

# Trainer wired to the train/validation datasets built earlier in the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Long-running cell: the recorded output reports a train_runtime of ~7274 s.
trainer.train()

Step,Training Loss,Validation Loss
5000,0.3983,0.381106
10000,0.3997,0.362351
15000,0.3751,0.388362
20000,0.3561,0.358346
25000,0.352,0.35522
30000,0.3562,0.345483
35000,0.3534,0.369384
40000,0.3062,0.484612
45000,0.3104,0.379376
50000,0.3179,0.425256


TrainOutput(global_step=72000, training_loss=0.34102473576863607, metrics={'train_runtime': 7273.8754, 'train_samples_per_second': 79.187, 'train_steps_per_second': 9.898, 'total_flos': 3.788799197184e+16, 'train_loss': 0.34102473576863607, 'epoch': 2.0})

In [39]:
# Save the fine-tuned model and the tokenizer into the same directory so both
# can be reloaded later from that one path.
# NOTE(review): this is a relative path, while Drive is mounted at the absolute
# path '/content/gdrive' — it only lands on Drive if the working directory is
# /content; confirm.
model.save_pretrained('gdrive/MyDrive/sentiment_model')
tokenizer.save_pretrained('gdrive/MyDrive/sentiment_model')

('gdrive/MyDrive/sentiment_model/tokenizer_config.json',
 'gdrive/MyDrive/sentiment_model/special_tokens_map.json',
 'gdrive/MyDrive/sentiment_model/vocab.txt',
 'gdrive/MyDrive/sentiment_model/added_tokens.json')

In [40]:
def sentiment_analysis(sentence):
    """Classify one sentence as negative or positive.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        sentence: The text to classify.

    Returns:
        ``"Negative Sentiment"`` when the predicted class index is 0,
        otherwise ``"Positive Sentiment"``.
    """
    # Put the model in inference mode. It may still be in training mode
    # (e.g. right after trainer.train()), in which case dropout would stay
    # active and make predictions non-deterministic.
    model.eval()

    # Find the device the model's parameters live on
    device = next(model.parameters()).device

    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move the input tensors to the same device as the model
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Run the model without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    # Index of the highest-scoring class
    pred = torch.argmax(logits, dim=1).item()

    # Translate the class index into a sentiment string
    if pred == 0:
        return "Negative Sentiment"
    else:
        return "Positive Sentiment"


# Interactive sentiment analysis demo

In [41]:
# Read one sentence from the user, classify it, and print the result.
sentence = input("Please enter a sentence for sentiment analysis: ")
result = sentiment_analysis(sentence)
print(f"The sentiment of the sentence is: {result}")

Please enter a sentence for sentiment analysis: 1
The sentiment of the sentence is: Negative Sentiment


In [42]:
# Reload the model and tokenizer from the directory they were saved to,
# replacing the in-memory `model` and `tokenizer`.
model = BertForSequenceClassification.from_pretrained('gdrive/MyDrive/sentiment_model')
tokenizer = BertTokenizer.from_pretrained('gdrive/MyDrive/sentiment_model')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
model = BertForSequenceClassification.from_pretrained('gdrive/MyDrive/sentiment_model')
tokenizer = BertTokenizer.from_pretrained('gdrive/MyDrive/sentiment_model') # If Google Drive is not accessible, use a local path instead

In [45]:
def sentiment_analysis(sentence):
    """Classify one sentence as negative or positive.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        sentence: The text to classify.

    Returns:
        ``"Negative Sentiment"`` when the predicted class index is 0,
        otherwise ``"Positive Sentiment"``.
    """
    # Make sure the model is in inference mode so dropout is disabled.
    model.eval()

    # The model may have been moved to an accelerator (for example by a
    # Trainer); look up where its parameters live so the inputs can follow.
    device = next(model.parameters()).device

    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move the input tensors to the same device as the model; feeding CPU
    # tensors to a model on another device raises a device-mismatch error.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Run the model without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    # Index of the highest-scoring class
    pred = torch.argmax(logits, dim=1).item()

    # Translate the class index into a sentiment string
    if pred == 0:
        return "Negative Sentiment"
    else:
        return "Positive Sentiment"

# Usage: read a sentence from the user, classify it, and print the result.
sentence = input("Please enter a sentence for sentiment analysis: ")
result = sentiment_analysis(sentence)
print(f"The sentiment of the sentence is: {result}")


KeyboardInterrupt: ignored

In [None]:
!pip install transformers[torch]
!pip install sklearn


In [46]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

# Reload the fine-tuned model and its tokenizer from the saved directory
model_path = 'gdrive/MyDrive/sentiment_model'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)


def _placeholder_metrics(eval_pred):
    """Return a constant dummy metric; the real metrics are computed manually later."""
    return {"dummy_metric": 0}


# Trainer instance used only for evaluation / prediction
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=_placeholder_metrics,
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [48]:
# Run prediction over the validation dataset
predictions = trainer.predict(val_dataset)
preds = predictions.predictions.argmax(-1)  # predicted class per sample (argmax over the last axis)

# Compute the metrics against the validation labels
accuracy = accuracy_score(val_labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(val_labels, preds, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.86571875
Precision: 0.8694940850742512
Recall: 0.8614176173555265
F1 Score: 0.8654370087370432
