In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve,validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.base import clone

from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor,LGBMClassifier,callback
from catboost import CatBoostRegressor

In [3]:
# Folder that holds the competition CSV files.
path = 'E:/大数据挖掘/比赛/天池-二手车交易价格预测/'

# Both files are space-delimited; load them in the order (train, test).
train_data, test_data = (
    pd.read_csv(path + file_name, sep=' ')
    for file_name in ('used_car_train_20200313.csv', 'used_car_testB_20200421.csv')
)

In [4]:
def tree_feature(train_data,test_data):
    """Build the combined train+test feature table for the tree models.

    The inputs are NOT modified: the log-transform of ``price`` and the
    ``train`` marker column are applied to copies, so calling this function
    (or re-running the notebook cell that calls it) repeatedly always starts
    from the same raw prices.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows; must contain a ``price`` column.
    test_data : pandas.DataFrame
        Rows to predict.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows (fresh 0..n-1 index) after
        ``data_preprocessing`` and ``feature_engineering``. The ``train``
        column is 1 for training rows and 0 for test rows; ``price`` holds
        ``log1p`` of the original training price.
    """
    # Tame the long-tailed target with log1p and tag each frame's origin.
    # ``assign`` returns new frames, leaving the caller's objects untouched.
    train_data = train_data.assign(price=np.log1p(train_data['price']), train=1)
    test_data = test_data.assign(train=0)

    # Stack both sets so cleaning and feature construction see the same rows.
    data = pd.concat([train_data,test_data],ignore_index=True)

    # Cleaning, then feature construction (which also needs the transformed
    # training frame for its target statistics).
    data = data_preprocessing(data)
    data = feature_engineering(data,train_data)

    # Feature selection is currently disabled (no select_feature step is run).
    return data


def data_preprocessing(data):
    """Clean the combined table and return it.

    The passed frame is modified in place (columns are deleted and
    overwritten) and the same object is returned.

    Steps
    -----
    * ``name`` is too sparse to use directly: it is replaced by
      ``name_count``, the number of rows sharing the same name.
    * ``seller`` and ``offerType`` are dropped (heavily skewed).
    * ``notRepairedDamage``: the placeholder ``'-'`` becomes NaN and the
      column is cast to float32.
    * Outliers are clipped: ``power`` to [1, 600], ``v_13`` to at most 6,
      ``v_14`` to at most 4.
    * Missing values in ``bodyType``, ``fuelType``, ``gearbox``,
      ``notRepairedDamage`` and ``model`` are filled with each column's most
      frequent value (the smallest one when several values tie).

    Parameters
    ----------
    data : pandas.DataFrame
        Combined train+test rows containing the columns named above plus
        ``SaleID``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Replace the sparse 'name' column by how often each name occurs.
    data['name_count'] = data.groupby(['name'])['SaleID'].transform('count')
    del data['name']

    # Heavily skewed columns carry no usable signal.
    del data['seller']
    del data['offerType']

    # '-' marks an unknown value; turn it into NaN before the numeric cast.
    damage = data['notRepairedDamage'].astype('str')
    data['notRepairedDamage'] = damage.where(damage != '-').astype('float32')

    # Clip outliers. Whole-column assignment is used instead of chained
    # indexing (data[col][mask] = value), which is not guaranteed to write
    # through to the frame and does nothing under pandas Copy-on-Write.
    data['power'] = data['power'].clip(lower=1, upper=600)
    data['v_13'] = data['v_13'].clip(upper=6)
    data['v_14'] = data['v_14'].clip(upper=4)

    # Fill missing values with the per-column mode. DataFrame.mode() lists
    # modes in ascending order, so row 0 is the smallest most-frequent value.
    null_col = ['bodyType','fuelType','gearbox','notRepairedDamage','model']
    most_frequent = data[null_col].mode().iloc[0]
    data[null_col] = data[null_col].fillna(most_frequent)

    return data


def _price_stats_by_group(frame, key):
    """Summarise ``price`` for every value of ``frame[key]``.

    Only rows with ``price > 0`` contribute (rows whose price is NaN are
    therefore ignored). The result has one row per group with the columns
    ``key``, ``<key>_amount``, ``<key>_price_max``, ``<key>_price_min``,
    ``<key>_price_median``, ``<key>_price_mean`` and ``<key>_price_std``.
    """
    all_infos = {}
    for group_value, rows in frame.groupby(key):
        rows = rows[rows['price']>0]
        all_infos[group_value] = {
            key + '_amount': len(rows),
            key + '_price_max': rows.price.max(),
            key + '_price_min': rows.price.min(),
            key + '_price_median': rows.price.median(),
            key + '_price_mean': rows.price.mean(),
            key + '_price_std': rows.price.std(),
        }
    return pd.DataFrame(all_infos).T.reset_index().rename(columns={'index':key})


def feature_engineering(data,train_data):
    """Construct model features on the combined table.

    Parameters
    ----------
    data : pandas.DataFrame
        Cleaned train+test rows. Must contain ``regDate``, ``creatDate``,
        ``regionCode``, ``SaleID``, ``power``, ``price``, ``brand``,
        ``model``, ``kilometer`` and ``v_0`` .. ``v_14``. The date columns
        are overwritten on the passed frame before it is rebuilt.
    train_data : pandas.DataFrame
        Training rows handed to ``feature_merge`` for the per-brand,
        per-model and per-kilometer price statistics.

    Returns
    -------
    pandas.DataFrame
        The feature table, passed through ``reduce_mem_usage``.
    """
    # --- Date features ('regDate', 'creatDate') ---------------------------
    data['creatDate'] = data['creatDate'].apply(date_process)
    data['regDate'] = data['regDate'].apply(date_process)
    data['reg_year'] = data['regDate'].dt.year
    data['days'] = (data['creatDate'] - data['regDate']).dt.days
    # Vehicle age in years, one decimal.
    data['years'] = (data['days'] / 365).round(1)
    # labels=False stores the bin position; ages outside (0, 30] become NaN.
    bins = [0,1,2,3,5,8,10,15,20,30]
    data['years_bin'] = pd.cut(data['years'],bins,labels=False)
    data = data.drop(columns=['creatDate','regDate','days'])

    # --- Region feature ('regionCode') ------------------------------------
    # 'city' is the first two characters of the region code's string form.
    data['city'] = data['regionCode'].apply(lambda x:str(x)[:2]).astype('int32')
    del data['SaleID']

    # --- Bucketed 'power' --------------------------------------------------
    # Edges 0, 10, ..., 600.
    bins = [i*10 for i in range(61)]
    data['power_bin'] = pd.cut(data['power'],bins,labels=False)

    # --- Price statistics per categorical feature -------------------------
    # Computed from the training frame.
    for feature in ('brand','model','kilometer'):
        data = feature_merge(data,train_data,feature)

    # --- Price statistics per derived bin ----------------------------------
    # Computed on the combined table. Each set of statistic columns is
    # prefixed with its own grouping column ('years_bin_*', 'power_bin_*') so
    # the two merges never produce clashing '_x' / '_y' column names.
    for key in ('years_bin','power_bin'):
        data = data.merge(_price_stats_by_group(data,key),how='left',on=key)

    # --- Pairwise combinations of anonymous features ----------------------
    # Column creation order is: all products, then all sums, then all
    # differences, and finally the products with 'years'.
    v_list = ['v_0','v_2', 'v_3', 'v_5', 'v_6',  'v_8',  'v_10', 'v_11', 'v_12']
    pairwise_ops = (
        ('*', lambda left, right: left * right),
        ('+', lambda left, right: left + right),
        ('-', lambda left, right: left - right),
    )
    for symbol, combine in pairwise_ops:
        for i in v_list:
            for j in range(15):
                data['new_' + i + symbol + str(j)] = combine(data[i], data['v_'+str(j)])
    for i in v_list:
        data['new_' + i + '*years'] = data[i] * data['years']

    # Shrink column dtypes to lower the memory footprint.
    data = reduce_mem_usage(data)

    return data


def date_process(date):
    """Convert a ``YYYYMMDD`` value (number or string) into a ``datetime``.

    The string form of ``date`` is sliced into year (first four characters),
    month (next two) and day (next two). A month below 1 is replaced by 1.
    """
    digits = str(date)
    year, month, day = (int(digits[start:stop]) for start, stop in ((0, 4), (4, 6), (6, 8)))
    # Month values below 1 (e.g. "00") are mapped to January.
    return datetime(year, max(month, 1), day)
    

def feature_merge(data,train_data,feature):
    """Attach per-group price statistics of ``train_data`` to ``data``.

    ``train_data`` is grouped by ``feature``; within each group only rows with
    ``price > 0`` are kept and summarised as row count, max, min, median, mean
    and standard deviation of ``price``. The summary is left-joined onto
    ``data`` on ``feature`` and the merged frame is returned. The new columns
    are named ``<feature>_amount`` and ``<feature>_price_<stat>``.
    """
    key = str(feature)

    stats_by_group = {}
    for group_value, rows in train_data.groupby(key):
        prices = rows[rows['price']>0].price
        stats_by_group[group_value] = {
            key + '_amount': len(prices),
            key + '_price_max': prices.max(),
            key + '_price_min': prices.min(),
            key + '_price_median': prices.median(),
            key + '_price_mean': prices.mean(),
            key + '_price_std': prices.std(),
        }

    # One row per group value; the group value itself goes into column `key`.
    stats = pd.DataFrame(stats_by_group).T.reset_index().rename(columns={'index':key})
    return data.merge(stats,how='left',on=key)

def reduce_mem_usage(data, use_float16=True, verbose=False):
    """Downcast column dtypes to reduce the frame's memory usage.

    Columns are converted on the passed frame, which is also returned.

    * object / string columns become ``category``;
    * columns whose dtype name starts with ``int`` are cast to the smallest
      of int8/int16/int32/int64 whose range strictly contains the column's
      min and max;
    * every other numeric column is cast to the smallest float type whose
      range strictly contains its min and max, falling back to float64;
    * columns that are already categorical, and other non-numeric columns,
      are left untouched, so the function can safely be applied to its own
      output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink.
    use_float16 : bool, default True
        When True, float16 is the smallest float candidate. float16 keeps
        only about three significant decimal digits, so pass False to use
        float32 as the smallest float type instead.
    verbose : bool, default False
        When True, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same frame with converted columns.
    """
    start_mem = data.memory_usage().sum() if verbose else None

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in data.columns:
        col_type = data[col].dtype

        # Already converted on an earlier pass; min()/max() would fail on an
        # unordered categorical.
        if isinstance(col_type, pd.CategoricalDtype):
            continue
        if col_type == object or pd.api.types.is_string_dtype(col_type):
            data[col] = data[col].astype('category')
            continue
        # e.g. datetime columns: cannot be compared with numeric limits.
        if not pd.api.types.is_numeric_dtype(col_type):
            continue

        c_min = data[col].min()
        c_max = data[col].max()
        if str(col_type)[:3] == 'int':
            # If no candidate fits, the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    data[col] = data[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    data[col] = data[col].astype(candidate)
                    break
            else:
                # Out of range for the smaller types, or min/max is NaN.
                data[col] = data[col].astype(np.float64)

    if verbose:
        end_mem = data.memory_usage().sum()
        print('memory usage: {} -> {} bytes'.format(start_mem, end_mem))
    return data


In [6]:
data = tree_feature(train_data,test_data)

# Split the combined table back into its parts using the 'train' marker
# (1 = training row, 0 = test row). Neither the target nor the marker is a
# model input.
is_train = data['train']==1
non_feature_cols = ['price','train']

test = data.loc[data['train']==0].drop(columns=non_feature_cols)
X = data.loc[is_train].drop(columns=non_feature_cols)
y = data.loc[is_train,'price'].astype('float64')

In [7]:
def k_folds(model,X,y,test,n_folds=5,random_state=88):
    """Cross-validate ``model`` and average its test-set predictions.

    For each shuffled K-fold split the model is fitted on the training part
    with the validation part as ``eval_set``, scored on the validation part,
    and used to predict ``test``. Targets and predictions are passed through
    ``expm1`` before scoring and before the test predictions are returned.

    Parameters
    ----------
    model : estimator
        Must accept ``fit(X, y, eval_set=[(X_val, y_val)], verbose=...)`` and
        provide ``predict``. The same object is refitted on every fold.
    X, y : pandas.DataFrame, pandas.Series
        Training features and target, indexed positionally via ``iloc``.
    test : pandas.DataFrame
        Rows to predict.
    n_folds : int, default 5
        Number of folds.
    random_state : int, default 88
        Seed for the fold shuffling.

    Returns
    -------
    (float, numpy.ndarray)
        Mean absolute error averaged over the folds (after ``expm1``), and
        ``expm1`` of the fold-averaged test predictions.
    """
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=random_state)

    pred_test = np.zeros(len(test))
    mae_val = 0

    for fold,(train_idx,val_idx) in enumerate(kf.split(X,y)):
        print('catb fold {}'.format(fold + 1))
        x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(x_train,y_train,
                 eval_set=[(x_val,y_val)],
                 verbose=1000)

        pred = model.predict(x_val)
        # sklearn convention: (y_true, y_pred).
        mae_val += mean_absolute_error(np.expm1(y_val),np.expm1(pred))/n_folds

        # Test predictions are averaged before the inverse transform.
        pred_test += model.predict(test)/n_folds

    return mae_val,np.expm1(pred_test)

In [9]:
# Reference run: n_folds = 5, learning_rate=0.03, max_depth=6 -> actual prediction score 437

# Keyword arguments for the CatBoost regressor, collected in one place.
catb_params = dict(
    iterations=20000,
    learning_rate=0.03,
    depth=6,
    task_type='CPU',
    loss_function='MAE',
    eval_metric='MAE',
    od_type='Iter',
    use_best_model=True,
    early_stopping_rounds=50,
    random_seed=88,
)
cat_clf = CatBoostRegressor(**catb_params)

# Cross-validated MAE and fold-averaged test predictions.
mae_cat,prediction = k_folds(cat_clf,X,y,test)

print('catboost mae:{:<8.8f}'.format(mae_cat))

In [13]:
# Submission table: one row per test SaleID with the predicted price cast to
# an integer.
result_catb = pd.DataFrame({
    'SaleID': test_data.SaleID,
    'price': prediction,
}).astype({'price': np.int64})

result_catb.to_csv('./result/catb_pred.csv',index=False)