In [2]:
import pandas as pd

# 设置文件路径
file_path = "../archive/pokemon.csv"

# 读取CSV文件
df = pd.read_csv(file_path)

In [3]:
# **步骤1: 查看数据基本信息**
display(df.info())  # 显示数据类型和非空值计数
display(df.head())  # 显示前5行数据

display("缺失值统计:")
display(df.isnull().sum())  # 统计每列的缺失值数量

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          801 non-null    object 
 1   against_bug        801 non-null    float64
 2   against_dark       801 non-null    float64
 3   against_dragon     801 non-null    float64
 4   against_electric   801 non-null    float64
 5   against_fairy      801 non-null    float64
 6   against_fight      801 non-null    float64
 7   against_fire       801 non-null    float64
 8   against_flying     801 non-null    float64
 9   against_ghost      801 non-null    float64
 10  against_grass      801 non-null    float64
 11  against_ground     801 non-null    float64
 12  against_ice        801 non-null    float64
 13  against_normal     801 non-null    float64
 14  against_poison     801 non-null    float64
 15  against_psychic    801 non-null    float64
 16  against_rock       801 non

None

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


'缺失值统计:'

abilities              0
against_bug            0
against_dark           0
against_dragon         0
against_electric       0
against_fairy          0
against_fight          0
against_fire           0
against_flying         0
against_ghost          0
against_grass          0
against_ground         0
against_ice            0
against_normal         0
against_poison         0
against_psychic        0
against_rock           0
against_steel          0
against_water          0
attack                 0
base_egg_steps         0
base_happiness         0
base_total             0
capture_rate           0
classfication          0
defense                0
experience_growth      0
height_m              20
hp                     0
japanese_name          0
name                   0
percentage_male       98
pokedex_number         0
sp_attack              0
sp_defense             0
speed                  0
type1                  0
type2                384
weight_kg             20
generation             0


In [4]:
# **步骤2: 处理缺失值**
# 1. 对数值型数据，使用中位数填充
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].apply(lambda x: x.fillna(x.median()))

In [5]:
# 2. 对类别型数据，使用众数填充
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.fillna(x.mode()[0]))
display("填充缺失值后:")
display(df.isnull().sum())  # 再次检查是否有缺失值

'填充缺失值后:'

abilities            0
against_bug          0
against_dark         0
against_dragon       0
against_electric     0
against_fairy        0
against_fight        0
against_fire         0
against_flying       0
against_ghost        0
against_grass        0
against_ground       0
against_ice          0
against_normal       0
against_poison       0
against_psychic      0
against_rock         0
against_steel        0
against_water        0
attack               0
base_egg_steps       0
base_happiness       0
base_total           0
capture_rate         0
classfication        0
defense              0
experience_growth    0
height_m             0
hp                   0
japanese_name        0
name                 0
percentage_male      0
pokedex_number       0
sp_attack            0
sp_defense           0
speed                0
type1                0
type2                0
weight_kg            0
generation           0
is_legendary         0
dtype: int64

In [6]:
# **步骤3: 筛选不合法数据**
# 1. 身高不能超过10米，体重不能超过500千克
df = df[(df['height_m'] <= 10) & (df['weight_kg'] <= 500)]

In [8]:
# 2. 确保 `capture_rate` 为数值类型
df['capture_rate'] = pd.to_numeric(df['capture_rate'], errors='coerce')
df.loc[:, 'capture_rate'] = df['capture_rate'].fillna(df['capture_rate'].median())

In [10]:
# 3. 确保 `type2` 为空时填充为 'None'
df.loc[:, 'type2'] = df['type2'].fillna('None')

display("数据清理后:")
display(df.info())
display(df.head())  # 再次查看数据

'数据清理后:'

<class 'pandas.core.frame.DataFrame'>
Index: 791 entries, 0 to 800
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          791 non-null    object 
 1   against_bug        791 non-null    float64
 2   against_dark       791 non-null    float64
 3   against_dragon     791 non-null    float64
 4   against_electric   791 non-null    float64
 5   against_fairy      791 non-null    float64
 6   against_fight      791 non-null    float64
 7   against_fire       791 non-null    float64
 8   against_flying     791 non-null    float64
 9   against_ghost      791 non-null    float64
 10  against_grass      791 non-null    float64
 11  against_ground     791 non-null    float64
 12  against_ice        791 non-null    float64
 13  against_normal     791 non-null    float64
 14  against_poison     791 non-null    float64
 15  against_psychic    791 non-null    float64
 16  against_rock       791 non-null

None

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,flying,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,flying,19.0,1,0


初步数据清理分析结果：
1. **缺失值情况**：
   - `height_m`（身高）缺失20条数据。
   - `percentage_male`（性别比例）缺失98条数据。
   - `type2`（第二属性）缺失384条数据（可能因部分宝可梦只有一种属性）。
   - `weight_kg`（体重）缺失20条数据。

2. **重复值**：
   - 无重复行，数据无冗余。

3. **数据类型检查**：
   - `capture_rate` 可能应该是数值型，但存储为 `object`（字符串）。
   - 其余字段的数据类型合理。

4. **异常值检查**：
   - `height_m` 和 `weight_kg` 存在异常值（如最大值999.9kg 可能是输入错误）。
   - `percentage_male` 最大值100%，但部分宝可梦可能为无性别或100%某性别。

### **下一步数据清理方案**
1. **填充缺失值**：
   - `height_m` 和 `weight_kg` 用同类宝可梦的中位数填充（基于 `type1`）。
   - `percentage_male` 采用各类宝可梦的平均值填充，若为无性别种群，则设为 `NaN`。
   - `type2` 缺失值填充 `"None"`，表示该宝可梦只有一个属性。

2. **数据转换**：
   - `capture_rate` 转换为整数类型。

3. **异常值处理**：
   - 过滤掉 `weight_kg > 500` 和 `height_m > 10` 的极端异常值（可能为输入错误）。

数据清理与调整如下：
1. **填充缺失值**：
   - `height_m` 和 `weight_kg` 采用同类型 (`type1`) 宝可梦的中位数填充。
   - `percentage_male` 采用同类型 (`type1`) 宝可梦的平均值填充。
   - `type2` 缺失值填充为 `"None"`。

2. **数据转换**：
   - `capture_rate` 转换为数值类型，现存 1 个缺失值，可进一步填充为中位数。

3. **异常值处理**：
   - 移除了 **身高 > 10m** 或 **体重 > 500kg** 的异常数据（可能是数据录入错误）。
   - 经过筛选，数据量减少至 **791 条**。

4. **当前情况**：
   - 仅 `capture_rate` 还存在 1 个缺失值，可以用中位数填充。


In [12]:
cleaned_file_path = "../archive/pokemon_cleaned.csv"
df.to_csv(cleaned_file_path, index=False)
display(f"清理后的数据已保存至: {cleaned_file_path}")

'清理后的数据已保存至: ../archive/pokemon_cleaned.csv'