In [5]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Step 1: load the raw alarm records from the Excel export.
data_path = 'data.xlsx'
raw_df = pd.read_excel(data_path)
# NOTE(review): this preview prints raw report text, which includes caller
# names and partially masked phone/ID numbers. Consider clearing the cell
# output before sharing the notebook.
print(raw_df.head())

# Step 2: preprocessing.
# Keep only the report text and the two category columns, give them readable
# names, and drop rows where any of the three is missing. The chain builds a
# new, independent frame instead of mutating a slice of the raw one, so later
# column assignments on `df` do not trigger chained-assignment warnings and
# `raw_df` stays available for inspection.
df = (
    raw_df[['bjnr', 'bjlbmc', 'bjlxmc']]
    .rename(columns={'bjnr': '报警内容', 'bjlbmc': '警情粗类', 'bjlxmc': '警情细类'})
    .dropna(subset=['报警内容', '警情粗类', '警情细类'])
    .copy()
)

# Extract the alarm description with a regular expression.
def extract_alarm_text(text):
    """Return the free-text part of an alarm record.

    Everything after the first ``报警：`` marker is returned with surrounding
    whitespace removed. When the marker is absent, the whole input is
    returned stripped.

    Args:
        text: The raw record. Non-string values (for example numeric Excel
            cells) are converted with ``str()`` first instead of raising.

    Returns:
        str: The extracted description.
    """
    if not isinstance(text, str):
        text = str(text)
    # DOTALL lets `.` span line breaks so a multi-line description is kept
    # whole instead of being cut at the first newline after the marker.
    match = re.search(r'报警：(.*)', text, flags=re.DOTALL)
    if match:
        return match.group(1).strip()
    return text.strip()

import pickle

# Reduce every record to the description that follows the alarm marker.
df['报警内容'] = df['报警内容'].map(extract_alarm_text)

# Remove rows that repeat the same text with the same pair of categories,
# keeping the first occurrence.
dedup_columns = ['报警内容', '警情粗类', '警情细类']
df = df.drop_duplicates(subset=dedup_columns, keep='first')

# Fit one encoder per label granularity and store the integer codes next to
# the original category names.
label_encoder_coarse = LabelEncoder().fit(df['警情粗类'])
df['警情粗类编码'] = label_encoder_coarse.transform(df['警情粗类'])

label_encoder_fine = LabelEncoder().fit(df['警情细类'])
df['警情细类编码'] = label_encoder_fine.transform(df['警情细类'])

# Persist both fitted encoders so predictions can be mapped back to names.
for pkl_name, fitted_encoder in (
    ('label_encoder_coarse.pkl', label_encoder_coarse),
    ('label_encoder_fine.pkl', label_encoder_fine),
):
    with open(pkl_name, 'wb') as fh:
        pickle.dump(fitted_encoder, fh)

# Report how many rows survived preprocessing.
print(f"处理后数据总数: {len(df)}")

# Optional: keep a copy of the processed table on disk.
df.to_excel('processed_data.xlsx', index=False)



                    bjsj                                               bjnr  \
0  2024-04-11 21:49:00.0  2024年4月11日 21时48分55秒 薛一铭( 180****5228 ，142427*...   
1  2024-04-11 21:43:25.0  2024年4月11日 21时43分22秒 郭女士( 139****6828 ) 报警：沙XX...   
2  2024-04-11 21:16:36.0  2024年4月11日 21时16分35秒 牛女士( 151****7579 ，142329*...   
3  2024-04-11 21:09:29.0  2024年4月11日 21时9分28秒 王先生( 151****8799、140111***...   
4  2024-04-11 21:01:51.0  2024年4月11日 10时44分48秒 黄志明( 151****3088 、350524*...   

   bjlbdm  bjlxdm    bjxldm bjlbmc bjlxmc  bjxlmc  
0      10  100100  100120.0   刑事案件     盗窃     NaN  
1      10  100100  100199.0   刑事案件     盗窃     NaN  
2      10  100100  100120.0   刑事案件     盗窃     NaN  
3      10  100100  100120.0   刑事案件     盗窃     NaN  
4      10  100100  100199.0   刑事案件     盗窃     NaN  
处理后数据总数: 793


In [6]:
# Step 3: text feature extraction.
# NOTE(review): the alarm text is Chinese while 'bert-base-uncased' is an
# English checkpoint. Confirm whether a Chinese checkpoint was intended. If
# it is changed, the checkpoints loaded for both classifiers must change
# with it so the vocabulary and embeddings stay aligned.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both label granularities are split with the same test fraction and seed.
split_kwargs = dict(test_size=0.2, random_state=205392)
alarm_texts = df['报警内容']

X_train_coarse, X_test_coarse, y_train_coarse, y_test_coarse = train_test_split(
    alarm_texts, df['警情粗类编码'], **split_kwargs
)
X_train_fine, X_test_fine, y_train_fine, y_test_fine = train_test_split(
    alarm_texts, df['警情细类编码'], **split_kwargs
)

def encode_data(texts, labels, max_length=128):
    """Tokenize texts and attach their integer class labels.

    Uses the notebook-level ``tokenizer``. Every sequence is padded or
    truncated to exactly ``max_length`` tokens.

    Args:
        texts: Object with ``.tolist()`` yielding the input strings
            (a pandas Series in this notebook).
        labels: Object with ``.values`` yielding the integer class codes,
            aligned with ``texts``.
        max_length: Fixed token length of each encoded sequence.

    Returns:
        The tokenizer output (PyTorch tensors) with an extra ``'labels'``
        entry holding the class codes as an int64 tensor.
    """
    inputs = tokenizer(
        texts.tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # The classification loss needs int64 targets. Force the dtype instead of
    # inheriting whatever integer width the label array happens to have.
    inputs['labels'] = torch.tensor(labels.values, dtype=torch.long)
    return inputs

# Encode the train/test partitions for the coarse-category task ...
train_coarse_encodings, test_coarse_encodings = (
    encode_data(split_texts, split_labels)
    for split_texts, split_labels in (
        (X_train_coarse, y_train_coarse),
        (X_test_coarse, y_test_coarse),
    )
)
# ... and for the fine-category task.
train_fine_encodings, test_fine_encodings = (
    encode_data(split_texts, split_labels)
    for split_texts, split_labels in (
        (X_train_fine, y_train_fine),
        (X_test_fine, y_test_fine),
    )
)


In [7]:
# Step 4: model training
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of equally long, pre-tokenized tensors.

    Args:
        encodings: Mapping from field name to a tensor whose first dimension
            indexes the samples. It must contain an ``'input_ids'`` entry,
            which defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with one slice per field."""
        # Tensors are returned on the device they are stored on. Moving each
        # item to the accelerator here would tie the class to a notebook
        # global and conflict with DataLoader pinning and worker processes;
        # the training loop moves whole batches to its device instead.
        return {key: tensor[idx] for key, tensor in self.encodings.items()}

    def __len__(self):
        """Return the number of samples."""
        return len(self.encodings['input_ids'])

# Wrap the tokenized tensors so the Trainer can index them sample by sample.
train_coarse_dataset = Dataset(train_coarse_encodings)
test_coarse_dataset = Dataset(test_coarse_encodings)
train_fine_dataset = Dataset(train_fine_encodings)
test_fine_dataset = Dataset(test_fine_encodings)

# Training and checkpointing configuration for the coarse-category model.
# Evaluation and checkpointing both run once per epoch. `save_steps` is
# deliberately not set: it only applies with save_strategy="steps", so the
# earlier "checkpoint every 500 steps" setting had no effect.
training_args_coarse = TrainingArguments(
    output_dir='./results_coarse',
    num_train_epochs=10,  # 10 training epochs
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # keep at most two checkpoints on disk
    remove_unused_columns=False,
    load_best_model_at_end=True,  # reload the best checkpoint after training
    logging_dir='./logs_coarse',  # log directory
    logging_steps=100,  # log the training loss every 100 steps
    logging_first_step=True,
    learning_rate=4e-5,  # initial learning rate
)

# Build the coarse-category classifier with one output per encoded class.
model_coarse = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder_coarse.classes_))
model_coarse.to(device)

trainer_coarse = Trainer(
    model=model_coarse,
    args=training_args_coarse,
    train_dataset=train_coarse_dataset,
    eval_dataset=test_coarse_dataset,
    tokenizer=tokenizer,
)

# Train the coarse-category classifier.
trainer_coarse.train()

# Save the model held by the trainer after training.
trainer_coarse.save_model('./checkpoint/coarse_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


  0%|          | 0/800 [00:00<?, ?it/s]

{'loss': 2.1058, 'learning_rate': 3.995e-05, 'epoch': 0.01}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.897748589515686, 'eval_runtime': 1.2668, 'eval_samples_per_second': 125.509, 'eval_steps_per_second': 15.787, 'epoch': 1.0}
{'loss': 1.9071, 'learning_rate': 3.5000000000000004e-05, 'epoch': 1.25}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.7327489852905273, 'eval_runtime': 1.285, 'eval_samples_per_second': 123.734, 'eval_steps_per_second': 15.564, 'epoch': 2.0}
{'loss': 1.5944, 'learning_rate': 3.0000000000000004e-05, 'epoch': 2.5}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.5316656827926636, 'eval_runtime': 1.311, 'eval_samples_per_second': 121.285, 'eval_steps_per_second': 15.256, 'epoch': 3.0}
{'loss': 1.3797, 'learning_rate': 2.5e-05, 'epoch': 3.75}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.4068586826324463, 'eval_runtime': 1.3029, 'eval_samples_per_second': 122.038, 'eval_steps_per_second': 15.351, 'epoch': 4.0}
{'loss': 1.031, 'learning_rate': 2e-05, 'epoch': 5.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.4349815845489502, 'eval_runtime': 1.2744, 'eval_samples_per_second': 124.761, 'eval_steps_per_second': 15.693, 'epoch': 5.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.3194950819015503, 'eval_runtime': 1.3157, 'eval_samples_per_second': 120.847, 'eval_steps_per_second': 15.201, 'epoch': 6.0}
{'loss': 0.6914, 'learning_rate': 1.5000000000000002e-05, 'epoch': 6.25}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.38361656665802, 'eval_runtime': 1.3144, 'eval_samples_per_second': 120.965, 'eval_steps_per_second': 15.216, 'epoch': 7.0}
{'loss': 0.5057, 'learning_rate': 1e-05, 'epoch': 7.5}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.461667776107788, 'eval_runtime': 1.2564, 'eval_samples_per_second': 126.549, 'eval_steps_per_second': 15.918, 'epoch': 8.0}
{'loss': 0.3472, 'learning_rate': 5e-06, 'epoch': 8.75}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.5069868564605713, 'eval_runtime': 1.2569, 'eval_samples_per_second': 126.504, 'eval_steps_per_second': 15.912, 'epoch': 9.0}
{'loss': 0.2576, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 1.5308083295822144, 'eval_runtime': 1.2355, 'eval_samples_per_second': 128.698, 'eval_steps_per_second': 16.188, 'epoch': 10.0}
{'train_runtime': 202.7584, 'train_samples_per_second': 31.269, 'train_steps_per_second': 3.946, 'train_loss': 0.9645163357257843, 'epoch': 10.0}


In [10]:
# Training and checkpointing configuration for the fine-category model.
# Evaluation and checkpointing both run once per epoch. `save_steps` is
# deliberately not set: it only applies with save_strategy="steps", so the
# earlier "checkpoint every 500 steps" setting had no effect.
training_args_fine = TrainingArguments(
    output_dir='./results_fine',
    num_train_epochs=17,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # keep at most two checkpoints on disk
    remove_unused_columns=False,
    load_best_model_at_end=True,  # reload the best checkpoint after training
    logging_dir='./logs_fine',  # log directory
    logging_steps=100,  # log the training loss every 100 steps
    logging_first_step=True,
    learning_rate=4e-5,  # initial learning rate
)

# Build the fine-category classifier with one output per encoded class.
model_fine = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder_fine.classes_))
model_fine.to(device)

trainer_fine = Trainer(
    model=model_fine,
    args=training_args_fine,
    train_dataset=train_fine_dataset,
    eval_dataset=test_fine_dataset,
    tokenizer=tokenizer,
)

# Train the fine-category classifier.
trainer_fine.train()

# Save the model held by the trainer after training.
trainer_fine.save_model('./checkpoint/fine_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1360 [00:00<?, ?it/s]

{'loss': 4.2207, 'learning_rate': 3.997058823529412e-05, 'epoch': 0.01}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 3.9265012741088867, 'eval_runtime': 11.2258, 'eval_samples_per_second': 14.164, 'eval_steps_per_second': 1.782, 'epoch': 1.0}
{'loss': 4.0159, 'learning_rate': 3.705882352941177e-05, 'epoch': 1.25}


  0%|          | 0/20 [00:00<?, ?it/s]

{'eval_loss': 3.8476178646087646, 'eval_runtime': 11.9078, 'eval_samples_per_second': 13.353, 'eval_steps_per_second': 1.68, 'epoch': 2.0}
{'loss': 3.7703, 'learning_rate': 3.411764705882353e-05, 'epoch': 2.5}


In [None]:
# Step 5: model evaluation
# Each report lists every class known to the encoder, including classes that
# do not appear in the held-out split. zero_division=0 reports 0 for those
# undefined precision/recall values without emitting a warning per class.
preds_coarse = trainer_coarse.predict(test_coarse_dataset)
y_pred_coarse = preds_coarse.predictions.argmax(-1)  # most likely class per sample
print("粗类分类报告:\n", classification_report(
    y_test_coarse,
    y_pred_coarse,
    labels=range(len(label_encoder_coarse.classes_)),
    target_names=label_encoder_coarse.classes_,
    zero_division=0,
))

preds_fine = trainer_fine.predict(test_fine_dataset)
y_pred_fine = preds_fine.predictions.argmax(-1)  # most likely class per sample
print("细类分类报告:\n", classification_report(
    y_test_fine,
    y_pred_fine,
    labels=range(len(label_encoder_fine.classes_)),
    target_names=label_encoder_fine.classes_,
    zero_division=0,
))
