# 数据预处理

常见的预处理方法:
- 缺失值处理：真实的数据往往因为各种原因存在缺失值，需要用删除法或填补法来得到一个完整的数据子集。
- 离群值检测和处理：检测并处理数据集中那些明显偏离其他样本的离群点，为数据分析提供高质量的数据。
- 标准化：数据分析及建模过程中，许多机器学习算法需要其输入特征为标准化形式；若样本的特征之间的量纲差异太大，样本之间相似度评估结果将存在偏差。
- 特征编码：模型输入的特征通常需要是数值型的，所以需要将非数值型特征转换为数值特征。
- 离散化：在数据信息损失尽量少的前提下，将连续型特征划分为若干区间，尽可能减少特征取值的个数（元数）。

![sklearn中的相关类](./img/img.png)


In [9]:
import numpy as np
import pandas as pd

# Imputer has been removed from sklearn.preprocessing; use sklearn.impute.SimpleImputer instead
# from sklearn.preprocessing import Imputer 

from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.cluster import KMeans 

“teenager_sns”是包含30000个样本的美国高中生社交网络信息数据集。每个样本包含40个变量，其中 gradyear、gender、age 和 friends 四个变量代表高中生的毕业年份、性别、年龄和好友数等基本信息；其余36个变量对应36个词语，反映高中生的5大类兴趣。
“accord_sedan_testing”是一个二手汽车数据集，包含二手汽车的价格、已行驶英里、上市年份、档次、引擎缸数、换挡方式等信息。

In [22]:
# Load the teenager SNS dataset; the path is relative to the working directory.
teenager_sns = pd.read_csv('../../dataset/teenager_sns.csv')
# Print the (rows, columns) shape, then preview the first 100 rows.
print(teenager_sns.shape)
teenager_sns.head(n=100)

(30000, 40)


Unnamed: 0,gradyear,gender,age,friends,basketball,football,soccer,softball,volleyball,swimming,...,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
0,2006,M,18.980,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2006,F,18.801,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2006,M,18.335,69,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2006,F,18.875,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2006,,18.995,10,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2006,F,18.396,69,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,2006,F,18.261,20,12,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,2006,F,,13,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,2006,M,18.730,52,0,0,0,0,4,0,...,0,0,0,0,1,1,0,0,0,0


In [32]:
# DataFrame.info() writes the column summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
teenager_sns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 40 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   gradyear      30000 non-null  int64  
 1   gender        27276 non-null  object 
 2   age           24914 non-null  float64
 3   friends       30000 non-null  int64  
 4   basketball    30000 non-null  int64  
 5   football      30000 non-null  int64  
 6   soccer        30000 non-null  int64  
 7   softball      30000 non-null  int64  
 8   volleyball    30000 non-null  int64  
 9   swimming      30000 non-null  int64  
 10  cheerleading  30000 non-null  int64  
 11  baseball      30000 non-null  int64  
 12  tennis        30000 non-null  int64  
 13  sports        30000 non-null  int64  
 14  cute          30000 non-null  int64  
 15  sex           30000 non-null  int64  
 16  sexy          30000 non-null  int64  
 17  hot           30000 non-null  int64  
 18  kissed        30000 non-nu

TypeError: 'method' object is not iterable

### 1.缺失值处理
查看数据集的基本信息
可以看到性别和年龄有缺失值
<br>
填充方法:
- mean
- median(中位数)
- most_frequent(众数)
- constant(常数)<br>
<br>
#### 使用sklearn中的SimpleImputer类，将数据集“teenager_sns”中“age”列利用均值“mean”进行填充

In [18]:
# Basic example: mean imputation.
# fit() learns the per-column means from the first dataset (the training data);
# transform() then fills the NaNs of another dataset (X) with those learned means.
import numpy as np
from sklearn.impute import SimpleImputer

# fit() returns the fitted imputer itself, so construction and fitting are chained.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean').fit(
    [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
)
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
print(imp_mean.transform(X))

[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   3.5  9. ]]


In [35]:
# Fill the missing values of the "age" column with the column mean.
# np.nan is the missing-value marker: the np.NaN alias was removed in NumPy 2.0,
# and the string form 'NaN' cannot be used here.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean', copy=False)
imp_mean.fit(teenager_sns[['age']])
# Store the result in a new column so the original "age" column stays available
# for comparison.
teenager_sns['age_imputed'] = imp_mean.transform(teenager_sns[['age']])
# Show the rows whose original age was missing, alongside the filled-in value.
teenager_sns[teenager_sns['age'].isnull()].head(100)

Unnamed: 0,gradyear,gender,age,friends,basketball,football,soccer,softball,volleyball,swimming,...,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs,age_imputed
5,2006,F,,142,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,17.993949
13,2006,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17.993949
15,2006,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17.993949
16,2006,,,135,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17.993949
26,2006,F,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17.993949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,2006,F,,44,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17.993949
776,2006,,,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17.993949
781,2006,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17.993949
800,2006,M,,13,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17.993949


In [51]:
# Missing-value handling for gender: fill with the most frequent category.
# np.nan is the missing-value marker: the np.NaN alias was removed in NumPy 2.0.
imp_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent', copy=False)
imp_most_frequent.fit(teenager_sns[['gender']])
# Store the result in a new column so the original "gender" column stays available.
teenager_sns['gender_imputed'] = imp_most_frequent.transform(teenager_sns[['gender']])

# Debug helpers (disabled): the null mask is a boolean pandas Series.
# print(type(teenager_sns['gender'].isnull())) #series
# print(teenager_sns['gender'].isnull())

# Show the rows whose original gender was missing, alongside the filled-in value.
teenager_sns[teenager_sns['gender'].isnull()].head(100)

<class 'pandas.core.series.Series'>
0        False
1        False
2        False
3        False
4         True
         ...  
29995    False
29996    False
29997    False
29998    False
29999    False
Name: gender, Length: 30000, dtype: bool


Unnamed: 0,gradyear,gender,age,friends,basketball,football,soccer,softball,volleyball,swimming,...,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs,age_imputed,gender_imputed
4,2006,,18.995,10,0,0,0,0,0,0,...,2,0,0,0,0,0,1,1,18.995000,F
13,2006,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,17.993949,F
15,2006,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,17.993949,F
16,2006,,,135,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,17.993949,F
41,2006,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,17.993949,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109,2006,,18.932,28,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,18.932000,F
1121,2006,,14.333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,14.333000,F
1142,2006,,,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,17.993949,F
1145,2006,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,17.993949,F


In [52]:
import os

# Print the current working directory; relative paths resolve against it.
current_dir = os.getcwd()
print(current_dir)

/Users/donga5/aCode/MachineLearning/main/DataProcessing
