# Exploratory Data Analysis for Packet Loss Event Classification

本 notebook 用于对网络延迟数据进行探索性分析，理解延迟与丢包事件的关系。数据中 `delay_ms == -1` 的记录在下文中被视为丢包事件。

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the un-prefixed 'seaborn' name. Pick whichever this
# installation provides so the notebook does not fail with OSError on import.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette('husl')

## 读取配置文件

In [None]:
# Parse the project configuration that lives one directory above the notebook.
config_path = Path('../config.yaml')
with config_path.open('r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

## 加载数据样例

In [None]:
# Resolve the sample capture inside the directory named by the configuration.
capture_dir = Path(config['paths']['dataset']['first_capture'])
sample_file = capture_dir / 'cpe_a-cpe_b-fiber.csv'

# Read the CSV and convert the 'time' column to datetime in a single chain.
df = pd.read_csv(sample_file).assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)
df.head()

## 基本统计信息

In [None]:
# Summary statistics of the loaded frame.
# NOTE(review): rows with delay_ms == -1 (flagged as packet loss later in this
# notebook) are still present here, so the delay_ms statistics include that
# sentinel value.
df.describe()

## 丢包统计分析

In [None]:
# A delay of -1 is the marker for a lost packet; store it as a 0/1 flag column.
loss_mask = df['delay_ms'].eq(-1)
df['is_packet_loss'] = loss_mask.astype(int)

# Report the absolute number of loss events and their share of all rows.
total = df.shape[0]
packet_loss_count = df['is_packet_loss'].sum()
print(f'丢包事件数: {packet_loss_count}')
print(f'丢包率: {packet_loss_count/total*100:.2f}%')

## 延迟时序可视化（含丢包）

In [None]:
# Delay over time, with every packet-loss event marked in red on the y=0 line.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(df['time'], df['delay_ms'], label='Delay (ms)')

loss_times = df.loc[df['is_packet_loss'] == 1, 'time']
ax.scatter(loss_times, [0] * len(loss_times), color='red', label='Packet Loss', s=10)

ax.set_title('Delay and Packet Loss Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Delay (ms)')
ax.legend()
fig.tight_layout()
plt.show()

## 延迟分布直方图（去除丢包）

In [None]:
# Histogram (with KDE overlay) of the delays, leaving out the -1 loss rows.
fig, ax = plt.subplots(figsize=(10, 5))
valid_delays = df.loc[df['delay_ms'] != -1, 'delay_ms']
sns.histplot(valid_delays, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Delay (ms)')
ax.set_xlabel('Delay (ms)')
ax.set_ylabel('Count')
plt.show()

## 丢包事件的前后延迟分析

In [None]:
# Compare the delay distribution in the samples just before and just after
# each packet-loss event.

# Row positions of the loss events. np.flatnonzero yields positional offsets,
# so the slicing below stays correct even when df does not carry the default
# 0..n-1 index (index labels must not be mixed with positional slicing).
loss_flags = df['is_packet_loss'].to_numpy() == 1
delays = df['delay_ms'].to_numpy()
loss_indices = np.flatnonzero(loss_flags).tolist()

# Number of samples inspected on each side of a loss event.
window = 5
before_loss = []
after_loss = []
for idx in loss_indices:
    # Only complete windows are used; events closer than `window` rows to
    # either end of the capture contribute nothing on that side.
    if idx - window >= 0:
        lo, hi = idx - window, idx
        # Neighbouring loss rows carry the -1 sentinel, not a real delay, so
        # they are dropped instead of being counted as a "-1 ms" measurement.
        before_loss.extend(delays[lo:hi][~loss_flags[lo:hi]])
    if idx + 1 + window <= len(delays):
        lo, hi = idx + 1, idx + 1 + window
        after_loss.extend(delays[lo:hi][~loss_flags[lo:hi]])

fig, ax = plt.subplots(figsize=(10, 5))
for sample, color, label in (
    (before_loss, 'blue', 'Before Loss'),
    (after_loss, 'orange', 'After Loss'),
):
    # An empty sample (no loss events, or only loss rows nearby) has nothing
    # to draw; skip it rather than hand seaborn an empty series.
    if len(sample) > 0:
        sns.histplot(sample, color=color, label=label, kde=True, stat='density', ax=ax)
# Only build a legend when at least one histogram was actually drawn.
if ax.get_legend_handles_labels()[0]:
    ax.legend()
ax.set_title('Delay Distribution Before and After Packet Loss')
ax.set_xlabel('Delay (ms)')
plt.show()

## 延迟自相关分析（滞后1~3）

In [None]:
# Correlation between the delay series and its row-shifted copies (lags 1-3).
lags = (1, 2, 3)
for lag in lags:
    # Keep the raw shifted columns on df exactly as before, so any later cell
    # that reads delay_lag1..3 sees the same values.
    df[f'delay_lag{lag}'] = df['delay_ms'].shift(lag)

lag_columns = ['delay_ms'] + [f'delay_lag{lag}' for lag in lags]
lag_frame = df[lag_columns]
# -1 marks a lost packet, not a measured delay. Mask it to NaN for the
# correlation only; DataFrame.corr() ignores NaN pairwise, so loss rows no
# longer distort the coefficients.
corr = lag_frame.where(lag_frame != -1).corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Delay and Lags')
plt.show()

## 小结
- 延迟分布、丢包率、时序特征等为后续特征工程和建模提供了依据。
- 可以尝试用滑动窗口统计特征预测丢包事件。