In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split  
import re
import random
import jieba.posseg as pseg
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import pickle
from transformers import TrainingArguments
from transformers import Trainer

In [4]:
# Pick the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Step 1: load the raw data
data_path = 'data.xlsx'
try:
    df = pd.read_excel(data_path)
except Exception as e:
    # Report which file failed, then re-raise so the cell stops right here
    # with the original traceback instead of carrying on without `df`.
    print(f"Error loading data from {data_path}: {e}")
    raise
print(df.head())

# Step 2: preprocessing
# Keep only the three columns used downstream. The explicit copy makes the
# result an independent frame, so the rename and later column assignments
# never act on a view of the raw frame.
df = df[['bjnr', 'bjlbmc', 'bjlxmc']].copy()
df.columns = ['报警内容', '警情粗类', '警情细类']
# Drop rows that lack the alarm text or either label.
df = df.dropna(subset=['报警内容', '警情粗类', '警情细类'])

# 正则表达式提取报警信息
def extract_alarm_text(text):
    """Return the alarm description that follows the "报警：" marker.

    Everything after the first "报警：" in ``text`` is returned with
    surrounding whitespace stripped. ``re.DOTALL`` lets the capture run
    across line breaks, so a multi-line description is kept whole rather
    than being cut at the first newline. When the marker is absent, the
    whole text is returned stripped.

    Args:
        text: The raw alarm record as a string.

    Returns:
        The stripped alarm description.
    """
    match = re.search(r'报警：(.*)', text, flags=re.DOTALL)
    if match:
        return match.group(1).strip()
    return text.strip()

# Run every alarm text through extract_alarm_text.
df['报警内容'] = df['报警内容'].map(extract_alarm_text)

# Synonym table consulted by the synonym-replacement augmentation.
synonyms_dict = {
    # Extend with further word -> synonyms entries as needed.
    '报警': ['警报', '求助', '求援'],
}

def synonym_replacement(text, n=1):
    """Return ``text`` with known words swapped for a random synonym.

    The text is segmented with ``jieba.posseg``. Every token that has a
    non-empty entry in ``synonyms_dict`` is replaced by a randomly chosen
    synonym; all other tokens are kept unchanged. The tokens are then
    joined back together without separators.

    The former "replace one random word if nothing matched" fallback was
    removed. It only ran when no token had a usable synonym, so it could
    only swap a token for itself, or raise ``IndexError`` when the chosen
    token mapped to an empty synonym list.

    Args:
        text: The sentence to augment.
        n: Accepted for backward compatibility; currently unused (every
            matching token is replaced).

    Returns:
        The augmented sentence; identical to the segmented-and-rejoined
        input when no token has a synonym.
    """
    new_words = []
    for word, _flag in pseg.cut(text):
        candidates = synonyms_dict.get(word)
        # An absent key and an empty synonym list both leave the word as is.
        new_words.append(random.choice(candidates) if candidates else word)
    return ''.join(new_words)

# Data augmentation: each record is kept and immediately followed by a copy
# whose alarm text went through synonym_replacement (labels unchanged).
augmented_data = []
for record in df.to_dict('records'):
    augmented_copy = {**record, '报警内容': synonym_replacement(record['报警内容'])}
    augmented_data.extend((record, augmented_copy))

# Build the frame and drop exact duplicates, keeping the first occurrence.
augmented_df = pd.DataFrame(augmented_data).drop_duplicates(
    subset=['报警内容', '警情粗类', '警情细类'],
    keep='first',
)

# Integer-encode the coarse and fine category labels.
label_encoder_coarse = LabelEncoder().fit(augmented_df['警情粗类'])
augmented_df['警情粗类编码'] = label_encoder_coarse.transform(augmented_df['警情粗类'])

label_encoder_fine = LabelEncoder().fit(augmented_df['警情细类'])
augmented_df['警情细类编码'] = label_encoder_fine.transform(augmented_df['警情细类'])

# Persist both fitted encoders so the integer codes can be mapped back later.
for pkl_path, encoder in (
    ('label_encoder_coarse.pkl', label_encoder_coarse),
    ('label_encoder_fine.pkl', label_encoder_fine),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(encoder, f)

# Report the size of the processed data set.
print(f"处理后数据总数: {len(augmented_df)}")

# Optional: save the processed data.
augmented_df.to_excel('processed_data.xlsx', index=False)

                    bjsj                                               bjnr  \
0  2024-04-11 21:49:00.0  2024年4月11日 21时48分55秒 薛一铭( 180****5228 ，142427*...   
1  2024-04-11 21:43:25.0  2024年4月11日 21时43分22秒 郭女士( 139****6828 ) 报警：沙XX...   
2  2024-04-11 21:16:36.0  2024年4月11日 21时16分35秒 牛女士( 151****7579 ，142329*...   
3  2024-04-11 21:09:29.0  2024年4月11日 21时9分28秒 王先生( 151****8799、140111***...   
4  2024-04-11 21:01:51.0  2024年4月11日 10时44分48秒 黄志明( 151****3088 、350524*...   

   bjlbdm  bjlxdm    bjxldm bjlbmc bjlxmc  bjxlmc  
0      10  100100  100120.0   刑事案件     盗窃     NaN  
1      10  100100  100199.0   刑事案件     盗窃     NaN  
2      10  100100  100120.0   刑事案件     盗窃     NaN  
3      10  100100  100120.0   刑事案件     盗窃     NaN  
4      10  100100  100199.0   刑事案件     盗窃     NaN  
处理后数据总数: 994


In [5]:
# Step 3: text feature extraction
# NOTE(review): 'bert-base-uncased' is an English checkpoint while the alarm
# text is Chinese; a Chinese checkpoint (e.g. 'bert-base-chinese') is probably
# a better fit, but the tokenizer and both classification models would have to
# be switched together - confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# A single call splits the texts and both label columns with the same
# shuffled indices (80% train / 20% test, fixed seed).
# NOTE(review): augmented_df holds each record next to its synonym-replaced
# copy, so near-duplicates of one record can land on both sides of this random
# split - confirm whether splitting should happen before augmentation.
(
    X_train_coarse, X_test_coarse,
    y_train_coarse, y_test_coarse,
    y_train_fine, y_test_fine,
) = train_test_split(
    augmented_df['报警内容'],
    augmented_df['警情粗类编码'],
    augmented_df['警情细类编码'],
    test_size=0.2,
    random_state=205392,
)
# The fine-category task uses the very same text split.
X_train_fine, X_test_fine = X_train_coarse, X_test_coarse

def encode_data(texts, labels, max_length=128):
    """Tokenize ``texts`` and attach ``labels`` to the result.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series) that
            yields the sentences to tokenize.
        labels: Object exposing ``.values`` (e.g. a pandas Series) holding
            one label per sentence.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer output (PyTorch tensors) with an additional
        ``'labels'`` entry built from ``labels.values``.
    """
    sentences = texts.tolist()
    encoded = tokenizer(
        sentences,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    encoded['labels'] = torch.tensor(labels.values)
    return encoded

# Encode the train/test splits for the coarse-category task ...
train_coarse_encodings, test_coarse_encodings = (
    encode_data(X_train_coarse, y_train_coarse),
    encode_data(X_test_coarse, y_test_coarse),
)
# ... and for the fine-category task.
train_fine_encodings, test_fine_encodings = (
    encode_data(X_train_fine, y_train_fine),
    encode_data(X_test_fine, y_test_fine),
)

NameError: name 'train_test_split' is not defined

In [None]:
# Step 4: 模型训练
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of equally sized tensors.

    ``encodings`` maps field names (it must contain ``'input_ids'``) to
    tensors whose first dimension indexes the samples.

    Samples are returned on whatever device the stored tensors live on.
    Device placement is deliberately left to the consumer: the Hugging Face
    ``Trainer`` moves each batch to its training device itself, and moving
    items to the GPU inside ``__getitem__`` can conflict with DataLoader
    memory pinning or worker processes.
    """

    def __init__(self, encodings):
        # Mapping: field name -> tensor with one row per sample.
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of field name -> tensor slice."""
        return {key: tensor[idx] for key, tensor in self.encodings.items()}

    def __len__(self):
        """Return the number of samples (rows of ``'input_ids'``)."""
        return len(self.encodings['input_ids'])

# Wrap the encoded splits so the Trainer can index them.
train_coarse_dataset, test_coarse_dataset = (
    Dataset(train_coarse_encodings),
    Dataset(test_coarse_encodings),
)
train_fine_dataset, test_fine_dataset = (
    Dataset(train_fine_encodings),
    Dataset(test_fine_encodings),
)

# Training and checkpointing configuration for the coarse-category model.
training_args_coarse = TrainingArguments(
    # Optimisation schedule: 10 epochs, batches of 8.
    num_train_epochs=10,
    learning_rate=4e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and load the best model when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): save_steps looks like a step-based setting; with
    # save_strategy="epoch" it presumably has no effect - confirm.
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Output and logging locations; log the first step, then every 100 steps.
    output_dir='./results_coarse',
    logging_dir='./logs_coarse',
    logging_steps=100,
    logging_first_step=True,
    remove_unused_columns=False,
)

# Classification model with one output per coarse category.
model_coarse = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder_coarse.classes_),
)
model_coarse.to(device)

# Trainer for the coarse-category model; the test split doubles as the
# evaluation set.
trainer_coarse = Trainer(
    model=model_coarse,
    args=training_args_coarse,
    tokenizer=tokenizer,
    train_dataset=train_coarse_dataset,
    eval_dataset=test_coarse_dataset,
)

# Train the coarse-category model.
trainer_coarse.train()

# Save the final coarse-category model.
trainer_coarse.save_model('./checkpoint/coarse_model')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


  0%|          | 0/1000 [00:00<?, ?it/s]

{'loss': 2.3312, 'learning_rate': 3.9960000000000004e-05, 'epoch': 0.01}
{'loss': 1.7974, 'learning_rate': 3.6e-05, 'epoch': 1.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 1.5270096063613892, 'eval_runtime': 1.4467, 'eval_samples_per_second': 137.558, 'eval_steps_per_second': 17.281, 'epoch': 1.0}
{'loss': 1.5574, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 1.4068944454193115, 'eval_runtime': 1.4928, 'eval_samples_per_second': 133.307, 'eval_steps_per_second': 16.747, 'epoch': 2.0}
{'loss': 1.3393, 'learning_rate': 2.8e-05, 'epoch': 3.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 1.2799484729766846, 'eval_runtime': 1.5786, 'eval_samples_per_second': 126.064, 'eval_steps_per_second': 15.837, 'epoch': 3.0}
{'loss': 1.1074, 'learning_rate': 2.4e-05, 'epoch': 4.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 1.2164697647094727, 'eval_runtime': 1.514, 'eval_samples_per_second': 131.444, 'eval_steps_per_second': 16.513, 'epoch': 4.0}
{'loss': 0.8549, 'learning_rate': 2e-05, 'epoch': 5.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 1.0581930875778198, 'eval_runtime': 1.5989, 'eval_samples_per_second': 124.458, 'eval_steps_per_second': 15.635, 'epoch': 5.0}
{'loss': 0.6389, 'learning_rate': 1.6000000000000003e-05, 'epoch': 6.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.9233143925666809, 'eval_runtime': 1.5566, 'eval_samples_per_second': 127.847, 'eval_steps_per_second': 16.061, 'epoch': 6.0}
{'loss': 0.4463, 'learning_rate': 1.2e-05, 'epoch': 7.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 1.026775598526001, 'eval_runtime': 1.5287, 'eval_samples_per_second': 130.18, 'eval_steps_per_second': 16.354, 'epoch': 7.0}
{'loss': 0.3355, 'learning_rate': 8.000000000000001e-06, 'epoch': 8.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 1.0527417659759521, 'eval_runtime': 1.5213, 'eval_samples_per_second': 130.808, 'eval_steps_per_second': 16.433, 'epoch': 8.0}
{'loss': 0.2479, 'learning_rate': 4.000000000000001e-06, 'epoch': 9.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 1.079898715019226, 'eval_runtime': 1.5117, 'eval_samples_per_second': 131.642, 'eval_steps_per_second': 16.538, 'epoch': 9.0}
{'loss': 0.185, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 1.1008565425872803, 'eval_runtime': 1.5095, 'eval_samples_per_second': 131.827, 'eval_steps_per_second': 16.561, 'epoch': 10.0}
{'train_runtime': 244.8695, 'train_samples_per_second': 32.507, 'train_steps_per_second': 4.084, 'train_loss': 0.8515378339290619, 'epoch': 10.0}


In [None]:
# Training and checkpointing configuration for the fine-category model.
training_args_fine = TrainingArguments(
    # Optimisation schedule: 15 epochs, batches of 8.
    num_train_epochs=15,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and load the best model when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): save_steps looks like a step-based setting; with
    # save_strategy="epoch" it presumably has no effect - confirm.
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Output and logging locations; log the first step, then every 100 steps.
    output_dir='./results_fine',
    logging_dir='./logs_fine',
    logging_steps=100,
    logging_first_step=True,
    remove_unused_columns=False,
)

# Classification model with one output per fine category.
model_fine = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder_fine.classes_),
)
model_fine.to(device)

# Trainer for the fine-category model; the test split doubles as the
# evaluation set.
trainer_fine = Trainer(
    model=model_fine,
    args=training_args_fine,
    tokenizer=tokenizer,
    train_dataset=train_fine_dataset,
    eval_dataset=test_fine_dataset,
)

# Train the fine-category model.
trainer_fine.train()

# Save the final fine-category model.
trainer_fine.save_model('./checkpoint/fine_model')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1500 [00:00<?, ?it/s]

{'loss': 4.0364, 'learning_rate': 4.996666666666667e-05, 'epoch': 0.01}


KeyboardInterrupt: 

In [None]:
# Step 5: model evaluation
# Coarse categories: predict on the test split, take the highest-scoring
# class per sample and print a per-class report.
preds_coarse = trainer_coarse.predict(test_coarse_dataset)
y_pred_coarse = preds_coarse.predictions.argmax(axis=-1)
print(
    "粗类分类报告:\n",
    classification_report(
        y_test_coarse,
        y_pred_coarse,
        labels=range(len(label_encoder_coarse.classes_)),
        target_names=label_encoder_coarse.classes_,
    ),
)

# Fine categories: same procedure with the fine-category trainer and labels.
preds_fine = trainer_fine.predict(test_fine_dataset)
y_pred_fine = preds_fine.predictions.argmax(axis=-1)
print(
    "细类分类报告:\n",
    classification_report(
        y_test_fine,
        y_pred_fine,
        labels=range(len(label_encoder_fine.classes_)),
        target_names=label_encoder_fine.classes_,
    ),
)


  0%|          | 0/25 [00:00<?, ?it/s]

NameError: name 'classification_report' is not defined