In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [17]:
# Toy product table; np.nan marks the missing entries in 'size', 'gender',
# 'price' and 'weight'.
# NOTE(review): 'fmale' and the column name 'brought' look like typos (the
# later re-definition of `data` uses 'female' / 'bought') - confirm before
# changing, since the rendered outputs were produced with these values.
data = dict(
    size=['XL', 'L', 'M', np.nan, 'M', 'M'],
    color=['red', 'green', 'blue', 'green', 'red', 'green'],
    gender=['female', 'male', np.nan, 'female', 'fmale', 'male'],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    brought=['yes', 'no', 'yes', 'no', 'yes', 'no'],
)

dict

In [4]:
# Build the working frame: one column per key of `data`.
df = pd.DataFrame(data=data)
df

Unnamed: 0,size,color,gender,price,weight,brought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,fmale,79.0,410.0,yes
5,M,green,male,89.0,,no


## 计算每列中空值的比例

In [5]:
# Fraction of missing values per column: the mean of the boolean NaN mask,
# rounded to two decimals.
df.isna().mean().round(2)

size       0.17
color      0.00
gender     0.17
price      0.17
weight     0.33
brought    0.00
dtype: float64

## 填充缺失值
使用 scikit-learn 的 `sklearn.impute` 模块(`SimpleImputer`)填充缺失值。

### 使用平均值填充

In [12]:
# Mean imputation: fit() learns the mean of the non-missing 'weight' values,
# transform() writes it into the NaN slots.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)
# Double brackets keep the selection 2-D, which is what the imputer expects.
df[['weight']] = imputer.fit_transform(df[['weight']])

In [13]:
# Show the frame after 'weight' has been mean-imputed; the other columns are untouched.
df

Unnamed: 0,size,color,gender,price,weight,brought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,fmale,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [26]:
# Single brackets with one column label return that column as a pandas Series.
df['weight']
# To confirm the type, evaluate: type(df['weight'])

0    500.0
1    450.0
2    300.0
3    415.0
4    410.0
5    415.0
Name: weight, dtype: float64

In [24]:
# Double brackets (a list of labels) return a DataFrame containing those columns.
df[['weight','brought']]
# To confirm the type, evaluate: type(df[['weight','brought']])

Unnamed: 0,weight,brought
0,500.0,yes
1,450.0,no
2,300.0,yes
3,415.0,no
4,410.0,yes
5,415.0,no


In [28]:
# statistics_ holds the fill value learned by fit() for each fitted column;
# index 0 is the only column that was passed to fit_transform.
imputer.statistics_[0]

415.0

### 使用固定值填充

In [29]:
# Constant imputation: every NaN is replaced by the fixed value 99.0.
imputer = SimpleImputer(strategy='constant', fill_value=99.0, missing_values=np.nan)

In [31]:
# Apply the currently bound imputer to 'price' (2-D selection in, 2-D array out)
# and display the updated frame.
df[['price']] = imputer.fit_transform(
    df[['price']]
)
df

Unnamed: 0,size,color,gender,price,weight,brought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,fmale,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [32]:
# Fill value recorded by the imputer for the first (and only) fitted column.
imputer.statistics_[0]

99.0

### 使用最频繁出现的值填充

In [34]:
# Mode imputation: NaN is replaced by the most frequent value of the column.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

In [36]:
# Apply the currently bound imputer to the categorical 'size' column and
# display the updated frame.
df[['size']] = imputer.fit_transform(
    df[['size']]
)
df

Unnamed: 0,size,color,gender,price,weight,brought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,M,green,female,129.0,415.0,no
4,M,red,fmale,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [37]:
# Fill value recorded by the imputer for the first (and only) fitted column.
imputer.statistics_[0]

'M'

### 统计

In [3]:
# Fresh copy of the toy product table (rebinds `data`); np.nan marks the
# missing entries in 'size', 'gender', 'price' and 'weight'.
data = dict(
    size=['XL', 'L', 'M', np.nan, 'M', 'M'],
    color=['red', 'green', 'blue', 'green', 'red', 'green'],
    gender=['female', 'male', np.nan, 'female', 'female', 'male'],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=['yes', 'no', 'yes', 'no', 'yes', 'no'],
)

In [4]:
# Rebuild the working frame from the freshly defined `data`.
df = pd.DataFrame(data=data)

In [5]:
# Column means of the float columns, computed only over rows whose 'weight'
# is present (so 'price' is averaged over that subset as well).
df.loc[df['weight'].notna()].select_dtypes(include=['float']).mean()

price     122.333333
weight    415.000000
dtype: float64

### 批量填充

In [6]:
# Batch imputation: fill every missing entry of all object-dtype (string)
# columns with the placeholder 'empty' in a single fit_transform call.
imputer = SimpleImputer(strategy='constant', fill_value='empty', missing_values=np.nan)

# Labels of the object-dtype columns; numeric columns are left untouched.
columns = df.select_dtypes(include=['object']).columns
df.loc[:, columns] = imputer.fit_transform(df[columns])
df

Unnamed: 0,size,color,gender,price,weight,brought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,empty,,300.0,yes
3,empty,green,female,129.0,,no
4,M,red,fmale,79.0,410.0,yes
5,M,green,male,89.0,,no


## 数值离散化

### 等宽区间

In [10]:
# Seven sample weights stored as float64; input for equal-width binning.
df = pd.DataFrame(
    {'weight': np.asarray([75, 78.5, 85, 91, 84.5, 83, 68], dtype=np.float64)}
)
df

Unnamed: 0,weight
0,75.0
1,78.5
2,85.0
3,91.0
4,84.5
5,83.0
6,68.0


In [13]:
# An integer `bins` asks pd.cut for that many equal-width intervals over the
# observed range of 'weight'.
df['weight_cut'] = pd.cut(x=df['weight'], bins=3)
df

Unnamed: 0,weight,weight_cut
0,75.0,"(67.977, 75.667]"
1,78.5,"(75.667, 83.333]"
2,85.0,"(83.333, 91.0]"
3,91.0,"(83.333, 91.0]"
4,84.5,"(83.333, 91.0]"
5,83.0,"(75.667, 83.333]"
6,68.0,"(67.977, 75.667]"


In [14]:
# Inspect dtypes and non-null counts of the frame, including the new 'weight_cut' column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   weight      7 non-null      float64 
 1   weight_cut  7 non-null      category
dtypes: category(1), float64(1)
memory usage: 347.0 bytes


### 指定区间

In [15]:
# Reset to the raw weights (float64) before binning with explicit edges.
df = pd.DataFrame(
    {'weight': np.asarray([75, 78.5, 85, 91, 84.5, 83, 68], dtype=np.float64)}
)
df

Unnamed: 0,weight
0,75.0
1,78.5
2,85.0
3,91.0
4,84.5
5,83.0
6,68.0


In [16]:
# Explicit bin edges: intervals (60, 75], (75, 80], (80, 95] - right-closed by default.
df['weight_cut'] = pd.cut(
    x=df['weight'],
    bins=[60, 75, 80, 95],
)
df

Unnamed: 0,weight,weight_cut
0,75.0,"(60, 75]"
1,78.5,"(75, 80]"
2,85.0,"(80, 95]"
3,91.0,"(80, 95]"
4,84.5,"(80, 95]"
5,83.0,"(80, 95]"
6,68.0,"(60, 75]"


In [17]:
# Inspect dtypes and non-null counts of the frame, including the new 'weight_cut' column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   weight      7 non-null      float64 
 1   weight_cut  7 non-null      category
dtypes: category(1), float64(1)
memory usage: 347.0 bytes


### 区间标签

In [18]:
# Reset to the raw weights (float64) before binning with named labels.
df = pd.DataFrame(
    {'weight': np.asarray([75, 78.5, 85, 91, 84.5, 83, 68], dtype=np.float64)}
)
df

Unnamed: 0,weight
0,75.0
1,78.5
2,85.0
3,91.0
4,84.5
5,83.0
6,68.0


In [19]:
# Same explicit edges as before, but each interval is given a readable label
# (one label per interval, in order).
df['weight_cut'] = pd.cut(
    x=df['weight'],
    bins=[60, 75, 80, 95],
    labels=['light', 'normal', 'heavy'],
)
df

Unnamed: 0,weight,weight_cut
0,75.0,light
1,78.5,normal
2,85.0,heavy
3,91.0,heavy
4,84.5,heavy
5,83.0,heavy
6,68.0,light


### 虚拟编码

In [20]:
# Reset to the raw weights (float64) before the dummy-encoding example.
df = pd.DataFrame(
    {'weight': np.asarray([75, 78.5, 85, 91, 84.5, 83, 68], dtype=np.float64)}
)
df

Unnamed: 0,weight
0,75.0
1,78.5
2,85.0
3,91.0
4,84.5
5,83.0
6,68.0


In [22]:
# Bin 'weight' into three labelled categories; this categorical column is
# what gets dummy-encoded next.
df['weight_cut'] = pd.cut(
    x=df['weight'],
    bins=[60, 75, 80, 95],
    labels=['light', 'normal', 'heavy'],
)
df

Unnamed: 0,weight,weight_cut
0,75.0,light
1,78.5,normal
2,85.0,heavy
3,91.0,heavy
4,84.5,heavy
5,83.0,heavy
6,68.0,light


In [24]:
# One-hot encode the categorical 'weight_cut' column; numeric columns pass
# through unchanged.
# dtype is pinned because pandas >= 2.0 returns bool (True/False) indicator
# columns by default, while the 0/1 output shown here comes from the older
# uint8 default. Pinning uint8 reproduces that output on every pandas version.
df = pd.get_dummies(df, dtype=np.uint8)
df

Unnamed: 0,weight,weight_cut_light,weight_cut_normal,weight_cut_heavy
0,75.0,1,0,0
1,78.5,0,1,0
2,85.0,0,0,1
3,91.0,0,0,1
4,84.5,0,0,1
5,83.0,0,0,1
6,68.0,1,0,0


## 特征提取

### 元素数量

In [25]:
# Each row holds a Python list of currency codes (possibly empty), so the
# resulting column has object dtype.
data_dict = dict(
    currency=[
        ['PLN', 'USD'],
        ['EUR', 'USD', 'PLN', 'CAD'],
        ['GBP'],
        ['JPY', 'CZK', 'HUP'],
        [],
    ]
)
df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,currency
0,"[PLN, USD]"
1,"[EUR, USD, PLN, CAD]"
2,[GBP]
3,"[JPY, CZK, HUP]"
4,[]


In [27]:
# Inspect dtypes and non-null counts; list-valued cells are stored with object dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   currency  5 non-null      object
dtypes: object(1)
memory usage: 168.0+ bytes


In [28]:
# Positional row access: the first row comes back as a Series keyed by column name.
df.iloc[0]

currency    [PLN, USD]
Name: 0, dtype: object

In [30]:
# Type of the value stored in the first row / first column.
# A single positional lookup is used on purpose: df.iloc[0][0] indexes the
# label-indexed row Series with an integer, which relies on the deprecated
# positional fallback of Series.__getitem__ and emits a FutureWarning on
# recent pandas.
type(df.iloc[0, 0])

list

In [32]:
# Feature: how many currency codes each row's list contains (0 for an empty list).
df['number'] = [len(codes) for codes in df['currency']]
df

Unnamed: 0,currency,number
0,"[PLN, USD]",2
1,"[EUR, USD, PLN, CAD]",4
2,[GBP],1
3,"[JPY, CZK, HUP]",3
4,[],0


`map()` 会把 Series 中的每个元素依次传给指定的函数(或映射),并用返回值组成一个新的 Series。

### 是否包含元素

In [34]:
# Binary feature: 1 when the row's currency list contains 'USD', otherwise 0.
df['USD_flag'] = df['currency'].map(lambda codes: int('USD' in codes))
df

Unnamed: 0,currency,number,USD_flag
0,"[PLN, USD]",2,1
1,"[EUR, USD, PLN, CAD]",4,1
2,[GBP],1,0
3,"[JPY, CZK, HUP]",3,0
4,[],0,0


### 从字符串提取标签

In [41]:
# Raw hashtag strings; note that the second entry does not start with '#'.
df = pd.DataFrame(
    data=dict(
        tags=['#good#vibes', 'hot#summer#holiday', '#street#food', '#workout']
    )
)
df

Unnamed: 0,tags
0,#good#vibes
1,hot#summer#holiday
2,#street#food
3,#workout


In [45]:
# Split every tag string on '#', one output column per piece. A leading '#'
# yields an empty first piece, so column 0 is '' for rows that start with '#'
# and holds a real tag ('hot') for the row that does not.
new_df = df['tags'].str.split('#', expand=True)
# The former `new_df.drop(columns=[0])` call was removed: its result was never
# assigned, so it had no effect. Column 0 has to stay anyway, both because it
# carries 'hot' and because three column names are assigned below.
new_df.columns = ['tag1', 'tag2', 'tag3']
new_df

Unnamed: 0,tag1,tag2,tag3
0,,good,vibes
1,hot,summer,holiday
2,,street,food
3,,workout,


In [50]:
# Treat the empty string (not NaN) as the missing marker in 'tag1' and replace
# it with the literal string 'None'.
imputer = SimpleImputer(
    strategy='constant',
    fill_value='None',
    missing_values='',
)
new_df[['tag1']] = imputer.fit_transform(new_df[['tag1']])
new_df

Unnamed: 0,tag1,tag2,tag3
0,,good,vibes
1,hot,summer,holiday
2,,street,food
3,,workout,
