# 导入库并加载数据

In [1]:
import pandas as pd
import numpy as np

# Load the raw CSV file.
df = pd.read_csv("./data/城市_20220101-20221231/china_cities_20220102.csv")
# The statement `df = df[]` that used to follow was an empty subscript, which is
# a SyntaxError and stopped the whole cell from running; it has been removed.
# NOTE(review): if a column subset was intended, reinstate it with an explicit
# list of labels, e.g. df = df[[...]].
# Transpose so the original CSV columns become the row index.
df = df.T
# Drop the first three transposed rows, keeping the per-city rows.
# NOTE(review): presumably these three are non-measurement metadata columns of
# the CSV -- confirm against the file header.
df = df.iloc[3:]
# Inspect the basic structure of the reshaped frame.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 375 entries, 北京 to 西咸新区
Columns: 360 entries, 0 to 359
dtypes: object(360)
memory usage: 1.0+ MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,350,351,352,353,354,355,356,357,358,359
北京,17.0,6.0,24.0,15.0,54.0,2.0,4.0,10.0,33.0,55.0,...,2.0,2.0,42.0,24.0,17.0,58.0,30.0,53.0,0.47,0.31
天津,17.0,5.0,71.0,15.0,117.0,5.0,13.0,15.0,56.0,54.0,...,9.0,8.0,65.0,34.0,5.0,60.0,19.0,55.0,0.84,0.65
石家庄,103.0,77.0,61.0,135.0,109.0,12.0,14.0,70.0,60.0,4.0,...,17.0,7.0,55.0,34.0,6.0,64.0,33.0,57.0,0.6,0.46
唐山,33.0,10.0,67.0,33.0,121.0,3.0,12.0,7.0,58.0,63.0,...,12.0,7.0,61.0,27.0,7.0,66.0,21.0,62.0,1.03,0.63
秦皇岛,51.0,18.0,69.0,51.0,113.0,4.0,17.0,11.0,56.0,64.0,...,18.0,8.0,57.0,28.0,11.0,68.0,34.0,64.0,1.62,0.64


# 检测异常值

常见的异常值检测方法包括：

IQR（四分位距）法：基于数据的四分位数计算，其中 IQR = Q3 − Q1；小于 Q1 − 1.5×IQR 或大于 Q3 + 1.5×IQR 的点被视为异常值。
Z-score法：基于均值和标准差，通常将与均值相差超过 3 个标准差的点视为异常值。
IQR 方法的实现：

In [2]:
def detect_outliers_iqr(df, columns):
    """Flag IQR outliers column by column.

    For every label in ``columns`` the first and third quartiles (Q1, Q3) of
    ``df[col]`` are computed, and a value is flagged when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` (``IQR = Q3 - Q1``).

    Each column is converted with ``pd.to_numeric(..., errors="coerce")``
    before the quartiles are taken, so object-dtype columns holding numbers
    are handled as numbers. Entries that cannot be parsed become NaN; NaN is
    ignored by the quantiles and is never flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the data to scan.
    columns : iterable of column labels
        Labels of the columns in ``df`` to check.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same index as ``df`` and one column per entry
        of ``columns``; ``True`` marks an outlier.
    """
    columns = list(columns)
    flags = {}
    for col in columns:
        # Coerce so the arithmetic below never depends on object-dtype math.
        values = pd.to_numeric(df[col], errors="coerce")
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Comparisons against NaN are False, so missing values stay unflagged.
        flags[col] = (values < lower_bound) | (values > upper_bound)
    return pd.DataFrame(flags, index=df.index, columns=columns)

# 应用异常值检测并标记

选择需要检测的数值列，并应用异常值检测方法，然后为数据集添加一个标记列。

In [3]:
# Columns to scan: every column except the flag column written below, so that
# re-running this cell does not feed `has_outliers` back into the detector.
value_columns = df.columns.drop('has_outliers', errors='ignore')

# Detect outliers in each scanned column.
outliers = detect_outliers_iqr(df, value_columns)

# Add a column marking rows that contain at least one outlier.
df['has_outliers'] = outliers.any(axis=1)

# Preview the flagged data.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,351,352,353,354,355,356,357,358,359,has_outliers
北京,17.0,6.0,24.0,15.0,54.0,2.0,4.0,10.0,33.0,55.0,...,2.0,42.0,24.0,17.0,58.0,30.0,53.0,0.47,0.31,False
天津,17.0,5.0,71.0,15.0,117.0,5.0,13.0,15.0,56.0,54.0,...,8.0,65.0,34.0,5.0,60.0,19.0,55.0,0.84,0.65,False
石家庄,103.0,77.0,61.0,135.0,109.0,12.0,14.0,70.0,60.0,4.0,...,7.0,55.0,34.0,6.0,64.0,33.0,57.0,0.6,0.46,False
唐山,33.0,10.0,67.0,33.0,121.0,3.0,12.0,7.0,58.0,63.0,...,7.0,61.0,27.0,7.0,66.0,21.0,62.0,1.03,0.63,False
秦皇岛,51.0,18.0,69.0,51.0,113.0,4.0,17.0,11.0,56.0,64.0,...,8.0,57.0,28.0,11.0,68.0,34.0,64.0,1.62,0.64,False


# 分析结果

可以统计异常值的数量，并进一步分析包含异常值的记录。

In [4]:
# Tally how many rows do / do not contain at least one flagged value.
outlier_count = df['has_outliers'].value_counts()
print(outlier_count)

# Preview a few of the rows that contain outliers.
df_with_outliers = df.loc[df['has_outliers']]
df_with_outliers.head()

has_outliers
False    193
True     182
Name: count, dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,351,352,353,354,355,356,357,358,359,has_outliers
邯郸,174.0,132.0,114.0,179.0,151.0,22.0,20.0,84.0,71.0,13.0,...,9.0,51.0,42.0,17.0,62.0,32.0,55.0,0.87,0.83,True
邢台,156.0,119.0,92.0,146.0,124.0,19.0,18.0,88.0,74.0,6.0,...,8.0,62.0,47.0,6.0,60.0,28.0,53.0,0.75,0.68,True
太原,58.0,28.0,63.0,66.0,115.0,10.0,32.0,50.0,61.0,23.0,...,19.0,78.0,56.0,3.0,48.0,15.0,36.0,1.75,1.14,True
呼和浩特,44.0,20.0,34.0,44.0,69.0,14.0,14.0,56.0,39.0,12.0,...,20.0,68.0,53.0,6.0,49.0,17.0,37.0,1.82,1.09,True
沈阳,46.0,14.0,44.0,46.0,81.0,14.0,21.0,21.0,34.0,48.0,...,19.0,57.0,32.0,8.0,56.0,24.0,52.0,1.64,0.75,True


# 保存处理后的数据

最后，将处理后的数据保存到一个新的 CSV 文件中，以备后续使用。

In [5]:
# Save the data together with its outlier flag.
# The row index holds the city names (see the df.info() output), so it must be
# written out; with index=False the saved rows could no longer be identified.
df.to_csv("china_cities_with_outliers.csv", index=True, index_label="city")