# B站用户隐私分析示例

本notebook展示如何分析和保护B站用户数据的隐私。

In [1]:
import sys
from pathlib import Path

# 添加项目根目录到Python路径
project_root = Path('.').resolve().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.task1_pii_detection import task_pii_detection, parse_args
from src.data_processing import process_user_comments
# from src.dp_processing import apply_differential_privacy



## 1. 数据加载与预处理

In [2]:
# 检查Python路径
print("Python路径:")
for path in sys.path:
    print(f"- {path}")

Python路径:
- d:\systems\SoftwarePath\Anaconda\envs\biliprivacy\python38.zip
- d:\systems\SoftwarePath\Anaconda\envs\biliprivacy\DLLs
- d:\systems\SoftwarePath\Anaconda\envs\biliprivacy\lib
- d:\systems\SoftwarePath\Anaconda\envs\biliprivacy
- 
- C:\Users\Dora\AppData\Roaming\Python\Python38\site-packages
- d:\systems\SoftwarePath\Anaconda\envs\biliprivacy\lib\site-packages
- d:\systems\SoftwarePath\Anaconda\envs\biliprivacy\lib\site-packages\win32
- d:\systems\SoftwarePath\Anaconda\envs\biliprivacy\lib\site-packages\win32\lib
- d:\systems\SoftwarePath\Anaconda\envs\biliprivacy\lib\site-packages\Pythonwin
- d:\systems\SoftwarePath\Anaconda\envs\biliprivacy\lib\site-packages\setuptools\_vendor
- D:\code\BiliPrivacy
- D:\code\BiliPrivacy\configs


In [2]:
# 加载示例数据
User_name = "bidao"  # 示例视频ID
max_comments = 700

comments_data,output_file = process_user_comments(User_name, max_comments=max_comments)

print(f"加载了 {len(comments_data)} 条评论")
comments_data
# output_file

2024-12-30 10:20:25,241 - src.data_processing - INFO - 开始处理用户 bidao 的数据
2024-12-30 10:20:25,746 - src.data_processing - INFO - 原始数据行数: 3116
2024-12-30 10:20:25,852 - src.data_processing - INFO - 处理后数据已保存到: D:\code\BiliPrivacy\data\processed_data\bidao.xlsx
2024-12-30 10:20:25,858 - src.data_processing - INFO - 文本数据已保存到: D:\code\BiliPrivacy\data\processed_data\bidao_test.txt


加载了 700 条评论


Unnamed: 0,rpid,message,rank,entropy
1,216712971968,坐在这个轮椅上的大爷：不然你以为我为啥坐轮椅？,1,4.175736
4,205146994208,看的时候恍惚就了一下自己究竟毕没毕业，吓了一跳,1,4.262692
5,204474768976,那一秒画面上写了，日本山荷叶,2,3.807355
6,204457961776,在？能不能黑幕我几个厉害的将领,2,3.773557
8,204354634592,我们民勤的土房子也是加稻草！,2,3.807355
...,...,...,...,...
3067,1173878981,可以考虑！有什么想看的都可以告诉我,2,3.852169
3070,1167829323,谁说的？就这么几个评论我早看完了,2,4.000000
3092,1120722365,就派你去调查一下事情的真相！,2,3.807355
3103,1119146689,→_→广告词你都想好了！那派你去推广我的视频吧！,2,4.251629


## 2. 大模型推理（以GPT-4o为例）

In [4]:
import argparse
# Task type: ['task1_pii_detection',task2_user_profiling', 'task3_fans_profiling']

task_type = 'task1_pii_detection'
model = 'gpt-4o'
args = parse_args()

args.task = task_type
args.model = model

task_pii_detection(args)


defense type: No defense
----- streaming request -----


2024-12-30 10:22:32,018 - openai._base_client - INFO - Retrying request to /chat/completions in 0.479370 seconds
2024-12-30 10:22:37,508 - openai._base_client - INFO - Retrying request to /chat/completions in 0.848167 seconds


APITimeoutError: Request timed out.

## 3. 差分隐私处理

In [None]:
# 应用差分隐私保护
epsilon = 1.0  # 隐私预算
privatized_data = apply_differential_privacy(comments_data, epsilon)

# 比较原始数据和处理后的数据
print("原始数据统计:")
print(comments_data.describe())
print("\n差分隐私处理后的数据统计:")
print(privatized_data.describe())

## 4. 数据可视化

In [None]:
# 设置绘图风格
plt.style.use('seaborn')

# 绘制PII分布
plt.figure(figsize=(10, 6))
sns.countplot(data=pii_types, x='pii_type')
plt.title('PII类型分布')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()