## import包

In [1]:
import pandas as pd
import numpy as np



## 构造数据

In [2]:
# Seed NumPy's global RNG so that "Restart Kernel -> Run All" rebuilds exactly
# the same demo frame every time (the np.random.* calls below all draw from it).
SEED = 42
np.random.seed(SEED)

# Ten-row demo frame:
#   feature_1 / feature_2 : random integers in 1..9 (upper bound 10 is exclusive)
#   feature_3 / feature_4 : uniform samples in [0, 1), rounded to 2 decimals
#   feature_5 / feature_6 : standard-normal samples
df = pd.DataFrame({
    "feature_1":np.random.randint(1,10,size=10),
    "feature_2":np.random.randint(1,10,size=10),
    "feature_3":np.random.random(10).round(2),
    "feature_4":np.random.random(10).round(2),
    "feature_5":np.random.randn(10),
    "feature_6":np.random.randn(10)
})

In [3]:
# feature_1 and feature_2 are integer columns, and integer dtypes cannot hold
# NaN. Cast them to float explicitly instead of relying on pandas silently
# upcasting during assignment (that upcast is deprecated since pandas 2.1).
df = df.astype({'feature_1': 'float64', 'feature_2': 'float64'})

# Knock out values to simulate missing data. .loc slices are label-based and
# inclusive on both ends: `:8` covers rows 0-8 and `5:` covers rows 5-9.
df.loc[:8,        'feature_1'] = np.nan
df.loc[[1, 5, 6], 'feature_2'] = np.nan
df.loc[[2, 3, 5], 'feature_3'] = np.nan
df.loc[[1, 6],    'feature_4'] = np.nan
df.loc[[5, 6],    'feature_5'] = np.nan
df.loc[5:,        'feature_6'] = np.nan

In [4]:
# Display the frame so the positions of the injected NaN values are visible.
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
0,,1.0,0.72,0.76,0.967159,0.520366
1,,,0.19,,-2.258241,-1.407333
2,,7.0,,0.2,-2.164111,-2.068565
3,,6.0,,0.74,-0.240079,-1.423015
4,,3.0,0.33,0.07,-0.48992,-0.414284
5,,,,0.98,,
6,,,0.45,,,
7,,2.0,0.19,0.35,0.730657,
8,,3.0,0.65,0.28,0.835582,
9,3.0,1.0,0.55,0.17,0.586271,


## 查看缺失值的比例

In [5]:
# Percentage of missing entries per column: NaN count divided by the number
# of rows, scaled to 0-100.
na_counts = df.isnull().sum()
na_counts.div(len(df)).mul(100)

feature_1    90.0
feature_2    30.0
feature_3    30.0
feature_4    20.0
feature_5    20.0
feature_6    50.0
dtype: float64

## 删除法

### 删除行

In [6]:
# Delete the rows labelled 5 and 6.
# The result is bound back to `df` rather than using inplace=True, and
# errors='ignore' makes a second run of this cell a no-op instead of a
# KeyError (the labels are already gone after the first run).
df = df.drop(index=[5, 6], errors='ignore')

In [7]:
# Display the frame after rows 5 and 6 have been removed.
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
0,,1.0,0.72,0.76,0.967159,0.520366
1,,,0.19,,-2.258241,-1.407333
2,,7.0,,0.2,-2.164111,-2.068565
3,,6.0,,0.74,-0.240079,-1.423015
4,,3.0,0.33,0.07,-0.48992,-0.414284
7,,2.0,0.19,0.35,0.730657,
8,,3.0,0.65,0.28,0.835582,
9,3.0,1.0,0.55,0.17,0.586271,


### 删除列

In [8]:
# Delete the feature_1 column.
# The result is bound back to `df` rather than using inplace=True, and
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['feature_1'], errors='ignore')

In [9]:
# Display the frame after the feature_1 column has been removed.
df

Unnamed: 0,feature_2,feature_3,feature_4,feature_5,feature_6
0,1.0,0.72,0.76,0.967159,0.520366
1,,0.19,,-2.258241,-1.407333
2,7.0,,0.2,-2.164111,-2.068565
3,6.0,,0.74,-0.240079,-1.423015
4,3.0,0.33,0.07,-0.48992,-0.414284
7,2.0,0.19,0.35,0.730657,
8,3.0,0.65,0.28,0.835582,
9,1.0,0.55,0.17,0.586271,


## 代表值填充

### 使用数据范围之外的数值进行填充

In [10]:
# Fill the gaps in feature_2 with a sentinel just below the observed minimum
# (min - 1), so imputed entries stay distinguishable from real values.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on df['feature_2'] is chained assignment, which
# warns in pandas 2.2 and no longer updates `df` under Copy-on-Write.
df['feature_2'] = df['feature_2'].fillna(df['feature_2'].min() - 1)

In [11]:
# Display the frame after the missing feature_2 values have been filled.
df

Unnamed: 0,feature_2,feature_3,feature_4,feature_5,feature_6
0,1.0,0.72,0.76,0.967159,0.520366
1,0.0,0.19,,-2.258241,-1.407333
2,7.0,,0.2,-2.164111,-2.068565
3,6.0,,0.74,-0.240079,-1.423015
4,3.0,0.33,0.07,-0.48992,-0.414284
7,2.0,0.19,0.35,0.730657,
8,3.0,0.65,0.28,0.835582,
9,1.0,0.55,0.17,0.586271,


### 使用统计值填充

In [12]:
# Fill the gaps in feature_3 with a summary statistic of the observed values.
# Assigning the result back avoids the chained `inplace=True` call, which
# does not modify `df` once pandas Copy-on-Write is active.
df['feature_3'] = df['feature_3'].fillna(df['feature_3'].mean())   # fill with the column mean
# df['feature_3'] = df['feature_3'].fillna(df['feature_3'].median()) # alternative: fill with the median

In [13]:
# Display the frame after the missing feature_3 values have been filled.
df

Unnamed: 0,feature_2,feature_3,feature_4,feature_5,feature_6
0,1.0,0.72,0.76,0.967159,0.520366
1,0.0,0.19,,-2.258241,-1.407333
2,7.0,0.438333,0.2,-2.164111,-2.068565
3,6.0,0.438333,0.74,-0.240079,-1.423015
4,3.0,0.33,0.07,-0.48992,-0.414284
7,2.0,0.19,0.35,0.730657,
8,3.0,0.65,0.28,0.835582,
9,1.0,0.55,0.17,0.586271,


### 使用相邻值填充

In [14]:
# Fill the gaps in feature_4 from a neighbouring row.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling, and
# the result is assigned back because a chained `inplace=True` call does not
# modify `df` once pandas Copy-on-Write is active.
# Note: ffill leaves a NaN in place when no earlier valid value exists.
df['feature_4'] = df['feature_4'].ffill()  # carry the previous valid observation forward
# df['feature_4'] = df['feature_4'].bfill()  # alternative: use the next valid observation

In [15]:
# Display the frame after the missing feature_4 values have been filled.
df

Unnamed: 0,feature_2,feature_3,feature_4,feature_5,feature_6
0,1.0,0.72,0.76,0.967159,0.520366
1,0.0,0.19,0.76,-2.258241,-1.407333
2,7.0,0.438333,0.2,-2.164111,-2.068565
3,6.0,0.438333,0.74,-0.240079,-1.423015
4,3.0,0.33,0.07,-0.48992,-0.414284
7,2.0,0.19,0.35,0.730657,
8,3.0,0.65,0.28,0.835582,
9,1.0,0.55,0.17,0.586271,


## 使用预测值填充

In [16]:
from sklearn.linear_model import LinearRegression

# Model-based imputation of feature_6: fit a linear regression on the rows
# where the target is present, then predict it for the rows where it is
# missing. LinearRegression rejects NaN inputs, so every other column must
# already be free of missing values when this cell runs.
target = 'feature_6'

# Compute the null mask once and reuse it for the split and the write-back,
# so all three steps are guaranteed to address the same rows.
missing_mask = df[target].isnull()

train = df.loc[~missing_mask]   # rows with a known target
test = df.loc[missing_mask]     # rows whose target has to be predicted

# Every column except the target serves as a predictor.
used_features = [x for x in train.columns if x != target]

lr = LinearRegression()
lr.fit(train[used_features], train[target])

if test.empty:
    # Nothing left to impute (e.g. the cell is being re-run); predicting on
    # zero rows would raise, so skip the write-back.
    pred = np.empty(0)
else:
    pred = lr.predict(test[used_features])
    df.loc[missing_mask, target] = pred

In [17]:
# Display the final frame, with feature_6 filled in by the regression model.
df

Unnamed: 0,feature_2,feature_3,feature_4,feature_5,feature_6
0,1.0,0.72,0.76,0.967159,0.520366
1,0.0,0.19,0.76,-2.258241,-1.407333
2,7.0,0.438333,0.2,-2.164111,-2.068565
3,6.0,0.438333,0.74,-0.240079,-1.423015
4,3.0,0.33,0.07,-0.48992,-0.414284
7,2.0,0.19,0.35,0.730657,-0.02643
8,3.0,0.65,0.28,0.835582,0.388862
9,1.0,0.55,0.17,0.586271,0.684211
