In [None]:
pip install opencc-python-reimplemented


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting opencc-python-reimplemented
  Downloading opencc_python_reimplemented-0.1.7-py2.py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencc-python-reimplemented
Successfully installed opencc-python-reimplemented-0.1.7


In [None]:
pip install transformers torch sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
Collecting sklearn
  Downloading sklearn-0.0.post4.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ..

In [None]:
import pandas as pd
import numpy as np
import torch
from opencc import OpenCC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, GPT2Config
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset
from transformers import GPT2ForSequenceClassification, BertTokenizer, GPT2Config
# Load the data. header=None tells pandas the sheet has no header row, so the
# first row is kept as data and the columns are labelled 0, 1, 2, ...
df = pd.read_excel("Hotel2000.xlsx", header=None)

# Custom dataset type
class TextClassificationDataset(Dataset):
    """Map-style dataset serving one encoded example per index.

    Every item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``, read from the three parallel containers given at construction.
    """

    def __init__(self, input_ids, attention_masks, labels):
        """Keep references to the three index-aligned containers as given."""
        self.input_ids, self.attention_masks, self.labels = input_ids, attention_masks, labels

    def __len__(self):
        """Return the number of examples, measured on ``input_ids``."""
        example_count = len(self.input_ids)
        return example_count

    def __getitem__(self, idx):
        """Return the example at position ``idx`` as a feature dict."""
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=self.labels[idx],
        )

# Convert Simplified Chinese to Traditional Chinese (OpenCC 's2t' profile)
cc = OpenCC('s2t')
texts = [cc.convert(text) for text in df[0].tolist()]
labels = df[5].tolist()

# Count the classes while `labels` is still a plain Python list. After the
# conversion to a tensor below, set(labels) no longer deduplicates (tensor
# elements hash by identity), so len(set(labels)) would be the number of rows
# instead of the number of classes.
num_labels = len(set(labels))

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Encode the texts
input_encodings = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
input_ids = input_encodings["input_ids"]
attention_mask = input_encodings["attention_mask"]

# Convert the labels to a tensor
labels = torch.tensor(labels)

# Train/test split. All three arrays go through ONE call so ids, masks and
# labels are guaranteed to be split with the same row permutation.
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    input_ids, attention_mask, labels, random_state=42, test_size=0.3
)

# Model configuration.
# pad_token_id is set so the sequence-classification head reads the last
# NON-padding token of each (right-padded) row instead of whatever token sits
# in the final position.
config = GPT2Config.from_pretrained(
    "ckiplab/gpt2-base-chinese",
    num_labels=num_labels,
    pad_token_id=tokenizer.pad_token_id,
)
# Load the pretrained weights once. model.init_weights() is deliberately NOT
# called afterwards: re-running weight initialisation on a freshly loaded
# checkpoint is at best a no-op and, depending on the transformers version,
# can overwrite the pretrained weights.
model = GPT2ForSequenceClassification.from_pretrained("ckiplab/gpt2-base-chinese", config=config)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=1,  # training batch size of 1
    per_device_eval_batch_size=1,   # evaluation batch size of 1
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=500,
)

# Wrap the train and test tensors in the custom dataset type
train_dataset = TextClassificationDataset(train_inputs, train_masks, train_labels)
eval_dataset = TextClassificationDataset(test_inputs, test_masks, test_labels)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Put the model on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train
trainer.train()

# Evaluate on the held-out split
preds = trainer.predict(eval_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = test_labels.numpy()
print(classification_report(true_labels, pred_labels))



Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/421M [00:00<?, ?B/s]

Some weights of the model checkpoint at ckiplab/gpt2-base-chinese were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ckiplab/gpt2-base-chinese and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

Some weights of the model checkpoint at ckiplab/gpt2-base-chinese were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ckiplab/gpt2-base-chinese and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
500,1.9084,1.274496
1000,1.9642,0.859179
1500,1.2583,1.592898
2000,0.0013,0.882189
2500,0.6876,0.794981


              precision    recall  f1-score   support

           0       0.86      0.83      0.84       298
           1       0.83      0.86      0.85       302

    accuracy                           0.84       600
   macro avg       0.85      0.84      0.84       600
weighted avg       0.85      0.84      0.84       600



In [None]:
import pandas as pd
import numpy as np
import torch
from opencc import OpenCC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, GPT2Config
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset
from transformers import GPT2ForSequenceClassification, BertTokenizer, GPT2Config
# Load the data. header=None tells pandas the sheet has no header row, so the
# first row is kept as data and the columns are labelled 0, 1, 2, ...
df = pd.read_excel("New Hotel2000.xlsx", header=None)

# Custom dataset type
class TextClassificationDataset(Dataset):
    """Map-style dataset serving one encoded example per index.

    Every item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``, read from the three parallel containers given at construction.
    """

    def __init__(self, input_ids, attention_masks, labels):
        """Keep references to the three index-aligned containers as given."""
        self.input_ids, self.attention_masks, self.labels = input_ids, attention_masks, labels

    def __len__(self):
        """Return the number of examples, measured on ``input_ids``."""
        example_count = len(self.input_ids)
        return example_count

    def __getitem__(self, idx):
        """Return the example at position ``idx`` as a feature dict."""
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=self.labels[idx],
        )

# Convert Simplified Chinese to Traditional Chinese (OpenCC 's2t' profile)
cc = OpenCC('s2t')
texts = [cc.convert(text) for text in df[0].tolist()]
labels = df[5].tolist()

# Count the classes while `labels` is still a plain Python list. After the
# conversion to a tensor below, set(labels) no longer deduplicates (tensor
# elements hash by identity), so len(set(labels)) would be the number of rows
# instead of the number of classes.
num_labels = len(set(labels))

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Encode the texts
input_encodings = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
input_ids = input_encodings["input_ids"]
attention_mask = input_encodings["attention_mask"]

# Convert the labels to a tensor
labels = torch.tensor(labels)

# Train/test split. All three arrays go through ONE call so ids, masks and
# labels are guaranteed to be split with the same row permutation.
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    input_ids, attention_mask, labels, random_state=42, test_size=0.3
)

# Model configuration.
# pad_token_id is set so the sequence-classification head reads the last
# NON-padding token of each (right-padded) row instead of whatever token sits
# in the final position.
config = GPT2Config.from_pretrained(
    "ckiplab/gpt2-base-chinese",
    num_labels=num_labels,
    pad_token_id=tokenizer.pad_token_id,
)
# Load the pretrained weights once. model.init_weights() is deliberately NOT
# called afterwards: re-running weight initialisation on a freshly loaded
# checkpoint is at best a no-op and, depending on the transformers version,
# can overwrite the pretrained weights.
model = GPT2ForSequenceClassification.from_pretrained("ckiplab/gpt2-base-chinese", config=config)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=1,  # training batch size of 1
    per_device_eval_batch_size=1,   # evaluation batch size of 1
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=500,
)

# Wrap the train and test tensors in the custom dataset type
train_dataset = TextClassificationDataset(train_inputs, train_masks, train_labels)
eval_dataset = TextClassificationDataset(test_inputs, test_masks, test_labels)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Put the model on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train
trainer.train()

# Evaluate on the held-out split
preds = trainer.predict(eval_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = test_labels.numpy()
print(classification_report(true_labels, pred_labels))

Some weights of the model checkpoint at ckiplab/gpt2-base-chinese were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ckiplab/gpt2-base-chinese and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at ckiplab/gpt2-base-chinese were not used wh

Step,Training Loss,Validation Loss
500,1.7501,1.113163
1000,1.96,0.896299
1500,1.4853,1.346057
2000,0.0013,0.895885
2500,1.0659,0.867765
3000,0.7846,0.950962
3500,0.5465,0.86241
4000,0.9367,0.773918
4500,0.0002,0.825678
5000,0.0003,0.857032


              precision    recall  f1-score   support

           0       0.88      0.91      0.89       298
           1       0.90      0.88      0.89       302

    accuracy                           0.89       600
   macro avg       0.89      0.89      0.89       600
weighted avg       0.89      0.89      0.89       600



In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import GPT2ForSequenceClassification, GPT2Config
from transformers import BertTokenizer
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding

# Load the data. header=None tells pandas the sheet has no header row, so the
# first row is kept as data and the columns are labelled 0, 1, 2, ...
df = pd.read_excel("New Hotel2000.xlsx", header=None)

# Custom dataset type
class TextClassificationDataset(Dataset):
    """Map-style dataset serving one encoded example per index.

    Every item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``, read from the three parallel containers given at construction.
    """

    def __init__(self, input_ids, attention_masks, labels):
        """Keep references to the three index-aligned containers as given."""
        self.input_ids, self.attention_masks, self.labels = input_ids, attention_masks, labels

    def __len__(self):
        """Return the number of examples, measured on ``input_ids``."""
        example_count = len(self.input_ids)
        return example_count

    def __getitem__(self, idx):
        """Return the example at position ``idx`` as a feature dict."""
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=self.labels[idx],
        )

# Read the texts and labels
texts = df[0].tolist()
labels = df[5].tolist()

# Count the classes while `labels` is still a plain Python list. After the
# conversion to a tensor below, set(labels) no longer deduplicates (tensor
# elements hash by identity), so len(set(labels)) would be the number of rows
# instead of the number of classes.
num_labels = len(set(labels))

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

input_encodings = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
input_ids = input_encodings["input_ids"]
attention_mask = input_encodings["attention_mask"]

labels = torch.tensor(labels)

# All three arrays go through ONE call so ids, masks and labels are guaranteed
# to be split with the same row permutation.
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    input_ids, attention_mask, labels, random_state=42, test_size=0.3
)

# Randomly initialised GPT-2 (no pretrained checkpoint) sized to the tokenizer's
# vocabulary. pad_token_id is set so the sequence-classification head reads the
# last NON-padding token of each (right-padded) row instead of whatever token
# sits in the final position.
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    num_labels=num_labels,
    pad_token_id=tokenizer.pad_token_id,
)
model = GPT2ForSequenceClassification(config=config)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=500,
)

train_dataset = TextClassificationDataset(train_inputs, train_masks, train_labels)
eval_dataset = TextClassificationDataset(test_inputs, test_masks, test_labels)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Use the same device selection as the other training cells in this notebook.
# The previous hard-coded torch.device("cpu") moved the model to the CPU after
# the Trainer had been built, which on a GPU runtime can leave the model and
# the batches the Trainer feeds it on different devices. On a CPU-only runtime
# this still resolves to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

trainer.train()

# Evaluate on the held-out split
preds = trainer.predict(eval_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = test_labels.numpy()
print(classification_report(true_labels, pred_labels))




Step,Training Loss,Validation Loss
500,1.3973,1.132341
1000,2.5182,1.094998
1500,0.4926,0.870587
2000,0.0045,0.774936
2500,1.1279,0.744082


              precision    recall  f1-score   support

           0       0.87      0.85      0.86       298
           1       0.85      0.87      0.86       302

    accuracy                           0.86       600
   macro avg       0.86      0.86      0.86       600
weighted avg       0.86      0.86      0.86       600



In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset

# Load the data. header=None tells pandas the sheet has no header row, so the
# first row is kept as data and the columns are labelled 0, 1, 2, ...
df = pd.read_excel("New Hotel2000.xlsx", header=None)

class TextClassificationDataset(Dataset):
    """Map-style dataset serving one encoded example per index.

    Every item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``, read from the three parallel containers given at construction.
    """

    def __init__(self, input_ids, attention_masks, labels):
        """Keep references to the three index-aligned containers as given."""
        self.input_ids, self.attention_masks, self.labels = input_ids, attention_masks, labels

    def __len__(self):
        """Return the number of examples, measured on ``input_ids``."""
        example_count = len(self.input_ids)
        return example_count

    def __getitem__(self, idx):
        """Return the example at position ``idx`` as a feature dict."""
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=self.labels[idx],
        )

# Read the texts and labels
texts = df[0].tolist()
labels = df[5].tolist()

# Count the classes while `labels` is still a plain Python list. After the
# conversion to a tensor below, set(labels) no longer deduplicates (tensor
# elements hash by identity), so len(set(labels)) would be the number of rows
# instead of the number of classes.
num_labels = len(set(labels))

tokenizer = BertTokenizer.from_pretrained("chinese_wwm_L-12_H-768_A-12")

input_encodings = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
input_ids = input_encodings["input_ids"]
attention_mask = input_encodings["attention_mask"]

labels = torch.tensor(labels)

# All three arrays go through ONE call so ids, masks and labels are guaranteed
# to be split with the same row permutation.
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    input_ids, attention_mask, labels, random_state=42, test_size=0.3
)

# Load the checkpoint once, with the classification head sized to the real
# number of classes (the model was previously loaded twice, and the second,
# effective load took its label count from the tensor).
config = BertConfig.from_pretrained("chinese_wwm_L-12_H-768_A-12", num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained("chinese_wwm_L-12_H-768_A-12", config=config)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=500,
)

train_dataset = TextClassificationDataset(train_inputs, train_masks, train_labels)
eval_dataset = TextClassificationDataset(test_inputs, test_masks, test_labels)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Put the model on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

trainer.train()

# Evaluate on the held-out split
preds = trainer.predict(eval_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = test_labels.numpy()
print(classification_report(true_labels, pred_labels))


Some weights of the model checkpoint at hfl/chinese-bert-wwm were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint

Step,Training Loss,Validation Loss
500,1.4724,1.139272
1000,2.2084,0.924554
1500,1.9686,0.863151
2000,0.002,0.690316
2500,0.0015,0.725391


              precision    recall  f1-score   support

           0       0.84      0.91      0.87       298
           1       0.90      0.82      0.86       302

    accuracy                           0.87       600
   macro avg       0.87      0.87      0.87       600
weighted avg       0.87      0.87      0.87       600

