# 导入库并加载数据

In [5]:
import pandas as pd
import numpy as np

# 加载 CSV 文件
df = pd.read_csv("./data/城市_20220101-20221231/china_cities_20220102.csv")

# 查看数据的基本信息
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Columns: 378 entries, date to 西咸新区
dtypes: float64(375), int64(2), object(1)
memory usage: 1.0+ MB


Unnamed: 0,date,hour,type,北京,天津,石家庄,唐山,秦皇岛,邯郸,保定,...,塔城地区,阿勒泰地区,石河子,五家渠,三沙,兰州新区,赣江新区,儋州,雄安新区,西咸新区
0,20220102,0,AQI,17.0,17.0,103.0,33.0,51.0,174.0,54.0,...,52.0,22.0,173.0,394.0,,116.0,84.0,51.0,50.0,235.0
1,20220102,0,PM2.5,6.0,5.0,77.0,10.0,18.0,132.0,18.0,...,29.0,10.0,131.0,292.0,,88.0,62.0,30.0,8.0,185.0
2,20220102,0,PM2.5_24h,24.0,71.0,61.0,67.0,69.0,114.0,53.0,...,17.0,7.0,151.0,265.0,,55.0,56.0,22.0,58.0,130.0
3,20220102,0,PM10,15.0,15.0,135.0,33.0,51.0,179.0,58.0,...,54.0,2.0,247.0,495.0,,147.0,113.0,52.0,50.0,181.0
4,20220102,0,PM10_24h,54.0,117.0,109.0,121.0,113.0,151.0,92.0,...,35.0,7.0,277.0,443.0,,95.0,118.0,43.0,121.0,126.0


# 检测异常值

常见的异常值检测方法包括：

IQR（四分位距）法：基于数据的四分位数计算，异常值定义为超出1.5倍IQR范围的点。
Z-score法：基于标准差，通常定义为与均值相差超过3个标准差的点为异常值。
IQR 方法的实现：

In [6]:
def detect_outliers_iqr(df, columns):
    outliers = pd.DataFrame(False, index=df.index, columns=columns)
    for col in columns:
        if df[col].dtype in ['float64', 'int64']:  # 只检测数值型列
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            # 标记异常值
            outliers[col] = (df[col] < lower_bound) | (df[col] > upper_bound)
    return outliers

# 应用异常值检测并标记

选择需要检测的数值列，并应用异常值检测方法，然后为数据集添加一个标记列。

In [7]:
# 选择数值列
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

# 检测异常值
outliers = detect_outliers_iqr(df, numeric_columns)

# 添加列标记异常值
df['has_outliers'] = outliers.any(axis=1)

# 查看标记后的数据
df.head()

Unnamed: 0,date,hour,type,北京,天津,石家庄,唐山,秦皇岛,邯郸,保定,...,阿勒泰地区,石河子,五家渠,三沙,兰州新区,赣江新区,儋州,雄安新区,西咸新区,has_outliers
0,20220102,0,AQI,17.0,17.0,103.0,33.0,51.0,174.0,54.0,...,22.0,173.0,394.0,,116.0,84.0,51.0,50.0,235.0,True
1,20220102,0,PM2.5,6.0,5.0,77.0,10.0,18.0,132.0,18.0,...,10.0,131.0,292.0,,88.0,62.0,30.0,8.0,185.0,False
2,20220102,0,PM2.5_24h,24.0,71.0,61.0,67.0,69.0,114.0,53.0,...,7.0,151.0,265.0,,55.0,56.0,22.0,58.0,130.0,True
3,20220102,0,PM10,15.0,15.0,135.0,33.0,51.0,179.0,58.0,...,2.0,247.0,495.0,,147.0,113.0,52.0,50.0,181.0,True
4,20220102,0,PM10_24h,54.0,117.0,109.0,121.0,113.0,151.0,92.0,...,7.0,277.0,443.0,,95.0,118.0,43.0,121.0,126.0,True


# 分析结果

可以统计异常值的数量，并进一步分析包含异常值的记录。

In [8]:
# 统计包含异常值的行数
outlier_count = df['has_outliers'].value_counts()
print(outlier_count)

# 显示部分包含异常值的记录
df_with_outliers = df[df['has_outliers']]
df_with_outliers.head()

has_outliers
False    221
True     139
Name: count, dtype: int64


Unnamed: 0,date,hour,type,北京,天津,石家庄,唐山,秦皇岛,邯郸,保定,...,阿勒泰地区,石河子,五家渠,三沙,兰州新区,赣江新区,儋州,雄安新区,西咸新区,has_outliers
0,20220102,0,AQI,17.0,17.0,103.0,33.0,51.0,174.0,54.0,...,22.0,173.0,394.0,,116.0,84.0,51.0,50.0,235.0,True
2,20220102,0,PM2.5_24h,24.0,71.0,61.0,67.0,69.0,114.0,53.0,...,7.0,151.0,265.0,,55.0,56.0,22.0,58.0,130.0,True
3,20220102,0,PM10,15.0,15.0,135.0,33.0,51.0,179.0,58.0,...,2.0,247.0,495.0,,147.0,113.0,52.0,50.0,181.0,True
4,20220102,0,PM10_24h,54.0,117.0,109.0,121.0,113.0,151.0,92.0,...,7.0,277.0,443.0,,95.0,118.0,43.0,121.0,126.0,True
5,20220102,0,SO2,2.0,5.0,12.0,3.0,4.0,22.0,5.0,...,1.0,6.0,2.0,,30.0,8.0,4.0,6.0,11.0,True


# 保存处理后的数据

最后，将处理后的数据保存到一个新的 CSV 文件中，以备后续使用。

In [9]:
# 保存标记了异常值的数据
df.to_csv("china_cities_with_outliers.csv", index=False)