In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# 注意读取csv文件的不同格式
data = pd.read_table('./used_car_train_20200313.csv', sep = ' ',index_col=0)  # index_col将SalesID转换为行索引
data.iloc[:,range(0,17)].head()
data.head()

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Data columns (total 30 columns):
name                 150000 non-null int64
regDate              150000 non-null int64
model                149999 non-null float64
brand                150000 non-null int64
bodyType             145494 non-null float64
fuelType             141320 non-null float64
gearbox              144019 non-null float64
power                150000 non-null int64
kilometer            150000 non-null float64
notRepairedDamage    150000 non-null object
regionCode           150000 non-null int64
seller               150000 non-null int64
offerType            150000 non-null int64
creatDate            150000 non-null int64
price                150000 non-null int64
v_0                  150000 non-null float64
v_1                  150000 non-null float64
v_2                  150000 non-null float64
v_3                  150000 non-null float64
v_4                  150000 non-null float64
v_5     

### 需要处理的特征数据：
（1）与结果无关：name
（2）算法难以理解：regDate,createDate
 (3) 缺失： model(1),bodyType,fuelType,gearbox
 
 特征类型:
 - 分类型变量：常为object，若碰到用数字int等类型表示的特征

- Field	Description
- SaleID	交易ID，唯一编码
- name	汽车交易名称，已脱敏
- regDate	汽车注册日期，例如20160101，2016年01月01日
- model	车型编码，已脱敏
- brand	汽车品牌，已脱敏
- bodyType	车身类型：豪华轿车：0，微型车：1，厢型车：2，大巴车：3，敞篷车：4，双门汽车：5，商务车：6，搅拌车：7- 
- fuelType	燃油类型：汽油：0，柴油：1，液化石油气：2，天然气：3，混合动力：4，其他：5，电动：6
- gearbox	变速箱：手动：0，自动：1
- power	发动机功率：范围 [ 0, 600 ]
- kilometer	汽车已行驶公里，单位万km
- notRepairedDamage	汽车有尚未修复的损坏：是：0，否：1
- regionCode	地区编码，已脱敏
- seller	销售方：个体：0，非个体：1
- offerType	报价类型：提供：0，请求：1
- creatDate	汽车上线时间，即开始售卖时间
- price	二手车交易价格（预测目标）
- v系列特征	匿名特征，包含v0-14在内15个匿名特征

In [5]:
# 人为筛选无用特征，交易名称,axis=1对列进行操作
data.drop(['name'], axis=1, inplace=True)

In [8]:
np.unique(data.creatDate).shape

(96,)

In [53]:
data.regionCode

SaleID
0         1046
1         4366
2         2806
3          434
4         6977
          ... 
149995    4576
149996    2826
149997    3302
149998    1877
149999     235
Name: regionCode, Length: 150000, dtype: int64

## 数据预处理-- 处理日期
（1）算法无法理解的特征数据：日期/时间->带有有效信息的值
 (2) 文字型转换为数字形：填补缺失值（连续型/分类型）
 (3）数据编码：OneHot
 
 显然int型的日期特征对价格有影响，但是上线日期和开始售卖的日期中，开始售卖的年份(也许还有月份)和上线的时长(即使用的时长)是比较值得关注的特征量
 

In [41]:
# data_time = data.loc[:,['regDate','creatDate']].copy()
# data_time['reg_year'] = data_time['regDate']//10000
# data_time['create_year'] = data_time['creatDate']//10000
# data_time['create_day'] = data_time['creatDate']%100
# data_time['create_mounth'] = ((data_time['creatDate']%10000 - data_time['create_day'])/100).astype(int)

In [48]:
# data_time['use_time'] = data_time['create_year'] - data_time['reg_year']
# np.unique(data_time['create_day'])
# data_time['create_mounth'].describe()

count    150000.000000
mean          3.161580
std           0.380709
min           1.000000
25%           3.000000
50%           3.000000
75%           3.000000
max          12.000000
Name: create_mounth, dtype: float64

In [37]:
# mounth = (data_time['creatDate']%10000 - data_time['creatDate']%100)/100
# # mounth = int(mounth)
# data_time['creatDate']%100
# (data_time['creatDate']%10000 - data_time['creatDate']%100)

SaleID
0         400
1         300
2         400
3         300
4         300
         ... 
149995    300
149996    300
149997    300
149998    400
149999    300
Name: creatDate, Length: 150000, dtype: int64

### 数据预处理--处理地点
对于超过25个类别的分类型变量，会被算法认为是连续型变量，而不是25个类

In [49]:
# data['regionCode'].value_counts().count()

7905

## 数据预处理-对缺失值的处理
- 文字型转换成数字型变量->que
缺失值->编码和哑变量
- 有噪声的特征notRepairedDamage:float '-'

存在缺失值的特征：
- model(1)，可忽略； df = df.dropna(subset=['model'])删除，且将行索引重排
- bodyType:车型(145494); 与brand，power, kilometers
- fuelType：燃油类型(141320); 与brand，power,bodyType有关
- gearbox：变速箱0/1(144019); 与bodyType，brand有关

In [197]:
# np.unique(data.notRepairedDamage)

array(['-', '0.0', '1.0'], dtype=object)

In [198]:
# test_d = np.array(data.notRepairedDamage).reshape(-1,1)
# tranform_d = SimpleImputer(missing_values='-', strategy='constant',fill_value='1.0').fit_transform(test_d)
# data.notRepairedDamage = pd.DataFrame(tranform_d).astype(float)

In [199]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Data columns (total 29 columns):
regDate              150000 non-null int64
model                149999 non-null float64
brand                150000 non-null int64
bodyType             145494 non-null float64
fuelType             141320 non-null float64
gearbox              144019 non-null float64
power                150000 non-null int64
kilometer            150000 non-null float64
notRepairedDamage    150000 non-null float64
regionCode           150000 non-null int64
seller               150000 non-null int64
offerType            150000 non-null int64
creatDate            150000 non-null int64
price                150000 non-null int64
v_0                  150000 non-null float64
v_1                  150000 non-null float64
v_2                  150000 non-null float64
v_3                  150000 non-null float64
v_4                  150000 non-null float64
v_5                  150000 non-null float64
v_6  

In [200]:
## for test
np.unique(data['fuelType'][data.fuelType.isnull().values == False])
data['fuelType'][data.fuelType.isnull().values == False]

SaleID
0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
149995    0.0
149996    0.0
149997    1.0
149998    1.0
149999    0.0
Name: fuelType, Length: 141320, dtype: float64

In [204]:
data = data.dropna(subset=['model'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149999 entries, 0 to 149999
Data columns (total 29 columns):
regDate              149999 non-null int64
model                149999 non-null float64
brand                149999 non-null int64
bodyType             145493 non-null float64
fuelType             141319 non-null float64
gearbox              144018 non-null float64
power                149999 non-null int64
kilometer            149999 non-null float64
notRepairedDamage    149999 non-null float64
regionCode           149999 non-null int64
seller               149999 non-null int64
offerType            149999 non-null int64
creatDate            149999 non-null int64
price                149999 non-null int64
v_0                  149999 non-null float64
v_1                  149999 non-null float64
v_2                  149999 non-null float64
v_3                  149999 non-null float64
v_4                  149999 non-null float64
v_5                  149999 non-null float64
v_6  

In [143]:
## for test 用变量接受dropna后的值，否则会只删除那一列的值
l = np.zeros((4,4))
l[2][1]=np.nan
df = pd.DataFrame(l)
df1 = df.dropna(subset=[1])

In [205]:
# 划分特征和标签
X = data.iloc[:, data.columns != 'price']
y = data.iloc[:, data.columns == 'price']
X.shape

(149999, 28)

In [206]:
# 对行索引进行整理
for x in [X,y]:
    x.index = range(x.shape[0])

In [208]:
X.shape

(149999, 28)

### 对bodyType、fuelType和gearbox进行填充，采用LR/RFC
- 首先查看是否有需要进行特征的编码或者无量纲化：

非匿名特征除了power和kilometer外均为数值型分类型变量，无需处理；

是否对power和kilometers进行无量纲化？

- 使用RF对上述进行填充，树模型算法，不需要无量纲化

### little tip：机器学习特征工程--标准化和归一化
- 标准化适用条件：

（1）不需要对特征进行归一化：基于数模型的方法，如RF/bagging/boosting/xgboosting

（2）需要标准化（基于距离的模型）：回归分析/LR/NN/SVM
- 相关定义及sklearn中的使用

In [56]:
# FOR test: 每一列的缺失值
X.isnull().sum(axis=0)

regDate                 0
model                   0
brand                   0
bodyType             4506
fuelType             8680
gearbox              5981
power                   0
kilometer               0
notRepairedDamage       0
regionCode              0
seller                  0
offerType               0
creatDate               0
v_0                     0
v_1                     0
v_2                     0
v_3                     0
v_4                     0
v_5                     0
v_6                     0
v_7                     0
v_8                     0
v_9                     0
v_10                    0
v_11                    0
v_12                    0
v_13                    0
v_14                    0
dtype: int64

In [209]:
# 使用随机森林填补缺失值
# 找出缺失向量数从小到大的的索引序列
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier as RFC
sortindex = np.argsort(X.isnull().sum(axis=0)).values
sortindex
ind = ['bodyType','gearbox','fuelType']

In [210]:
X.columns

Index(['regDate', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'regionCode', 'seller', 'offerType',
       'creatDate', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7',
       'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14'],
      dtype='object')

In [211]:
for i in ind:
    # 构建新的特征矩阵
    df = X
    fillc = df.loc[:,i]
    df = pd.concat((df.loc[:,df.columns!=i], pd.DataFrame(y)),axis=1)
    
    # 新的特征矩阵中的值进行填补
    df_0 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0).fit_transform(df)
    
    # 训练集和测试集
    Ytrain = fillc[fillc.notnull()]
    Ytest = fillc[fillc.isnull()] # 需要Ytest的原因，为了ta的索引取出Xtest
    Xtrain = df_0[Ytrain.index,:]
    Xtest = df_0[Ytest.index,:]
    
    # RFC填充
    rfc = RFC(n_estimators=100).fit(Xtrain,Ytrain)
    Y_predict = rfc.predict(Xtest)
    
    # 将填充值返回到原特征矩阵
    X.loc[X.loc[:,i].isnull(), i] = Y_predict
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [186]:
## for test:  loc/iloc的使用
# - 在
df = X
fillc = df.loc[:,'bodyType']
df = pd.concat((df.loc[:,df.columns!='bodyType'], pd.DataFrame(y)),axis=1)
    
    # 新的特征矩阵中的值进行填补
df_0 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0).fit_transform(df)
    
    # 训练集和测试集
Ytrain = fillc[fillc.notnull()]
Ytest = fillc[fillc.isnull()] # 需要Ytest的原因，为了ta的索引取出Xtest
Xtrain = df_0[Ytrain.index,:]
Xtest = df_0[Ytest.index,:]
    
    # RFC填充
rfc = RFC(n_estimators=100).fit(Xtrain,Ytrain)
Y_predict = rfc.predict(Xtest)
    
    # 将填充值返回到原特征矩阵
X_ = X
X_.loc[X_.loc[:,'bodyType'].isnull(), 'bodyType'] = Y_predict

In [183]:
# 另一种处理缺失值的方法 
# DataFrame的fillna方法
# data_ = data.copy()
# data_.loc[:,'bodyType'] = data_.loc[:,'bodyType'].fillna(data_.loc[:,'bodyType'].median()) #  中位数
# data_.loc[:,'bodyType'] = data_.loc[:,'bodyType'].fillna(data_.loc[:,'bodyType'].mode()[0]) # 众数
# data_.info()

In [4]:
np.unique(X.regDate).shape

NameError: name 'X' is not defined

## 特征工程--feature selection
预计采用xgboost 树模型进行回归预测
