# 二手车价格预测项目

## 项目概述
本项目使用机器学习方法预测二手车交易价格。通过数据探索、特征工程和模型训练，构建高精度的价格预测模型。

## 目录
1. [环境配置与数据加载](#1-环境配置与数据加载)
2. [探索性数据分析 (EDA)](#2-探索性数据分析-eda)
3. [数据预处理](#3-数据预处理)
4. [特征工程](#4-特征工程)
5. [模型训练与评估](#5-模型训练与评估)
6. [结果分析与总结](#6-结果分析与总结)

---
## 1. 环境配置与数据加载

In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

# Silence library warnings and let pandas print frames without truncating
# columns or wrapping at a fixed width.
warnings.filterwarnings('ignore')
for option in ('display.max_columns', 'display.width'):
    pd.set_option(option, None)

# Font able to render Chinese labels, and a minus sign that renders with it.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Notebook-wide constants: seed for every randomized step and CV fold count.
RANDOM_STATE = 42
N_FOLDS = 5

print("环境配置完成!")

In [None]:
# Read the raw, space-delimited competition files.
train_raw = pd.read_csv('used_car_train_20200313.csv', sep=' ')
test_raw = pd.read_csv('used_car_testB_20200421.csv', sep=' ')

# Report the shape of each split, then the training column names.
for label, frame in (("训练集", train_raw), ("测试集", test_raw)):
    n_rows, n_cols = frame.shape
    print(f"{label}: {n_rows:,} 行, {n_cols} 列")
print(f"\n特征列表: {train_raw.columns.tolist()}")

In [None]:
# Preview the first five rows of the training set (shown as the cell output).
train_raw.head()

---
## 2. 探索性数据分析 (EDA)

### 2.1 数据基本信息

In [None]:
# Per-column dtypes, followed by the column names split into numeric and
# object (string-like) groups.
print("数据类型:")
print(train_raw.dtypes)

numeric_names = train_raw.select_dtypes(include=[np.number]).columns.tolist()
object_names = train_raw.select_dtypes(include=['object']).columns.tolist()
print(f"\n数值型特征: {numeric_names}")
print(f"类别型特征: {object_names}")

In [None]:
# Descriptive statistics of the training set (shown as the cell output);
# the original note labels this as the summary of the numeric features.
train_raw.describe()

### 2.2 缺失值分析

In [None]:
# Number of nulls per column and their share of all training rows (percent).
missing = train_raw.isnull().sum()
missing_pct = (missing / len(train_raw) * 100).round(2)

# Keep only columns that actually contain nulls, largest share first.
missing_df = pd.DataFrame({'缺失数量': missing, '缺失比例(%)': missing_pct})
has_gaps = missing_df['缺失数量'] > 0
missing_df = missing_df[has_gaps].sort_values('缺失比例(%)', ascending=False)

print("缺失值统计:")
print(missing_df if len(missing_df) > 0 else "无缺失值")

# notRepairedDamage is inspected separately for the special '-' value,
# which isnull() does not count as missing.
print(f"\nnotRepairedDamage 值分布:")
print(train_raw['notRepairedDamage'].value_counts())

### 2.3 目标变量 Price 分析

In [None]:
# Summary statistics plus skewness and kurtosis of the target column.
price_series = train_raw['price']

print("Price 统计信息:")
print(price_series.describe())
print(f"\n偏度: {price_series.skew():.4f}")
print(f"峰度: {price_series.kurtosis():.4f}")

In [None]:
# Side-by-side histograms of the target: raw scale (left) and log1p scale
# (right, drawn in orange).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (train_raw['price'],
     'Price Distribution (Original)', 'Price', {}),
    (np.log1p(train_raw['price']),
     'Price Distribution (Log Transformed)', 'Log(Price+1)', {'color': 'orange'}),
]
for ax, (values, title, xlabel, extra) in zip(axes, panels):
    ax.hist(values, bins=100, edgecolor='black', alpha=0.7, **extra)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("结论: Price呈现右偏分布，需要进行log变换")

### 2.4 特征与Price相关性分析

In [None]:
# Correlation of every numeric column (excluding the row id and the target
# itself) with price, ordered by absolute strength.
numeric_cols = [
    c for c in train_raw.select_dtypes(include=[np.number]).columns
    if c not in ('SaleID', 'price')
]

price_corr = train_raw[numeric_cols + ['price']].corr()['price']
correlations = price_corr.drop('price').sort_values(key=abs, ascending=False)

print("特征与Price的相关系数 (按绝对值排序):")
print(correlations.round(4))

In [None]:
# Heatmap of pairwise correlations among the 15 features most correlated
# with price (by absolute value), plus price itself.
top_features = [*correlations.index[:15], 'price']
corr_matrix = train_raw[top_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
)
plt.title('Top Features Correlation Heatmap')
plt.tight_layout()
plt.show()

### 2.5 关键发现总结

**EDA关键发现:**

1. **数据规模**: 训练集150,000条，测试集50,000条，共31个特征

2. **目标变量Price**: 
   - 范围: 11 ~ 99,999
   - 呈现强右偏分布 (偏度3.35)
   - 需要进行log变换

3. **缺失值**:
   - fuelType: 5.79%
   - gearbox: 3.99%
   - bodyType: 3.00%
   - notRepairedDamage含有'-'特殊值 (16.22%)

4. **强相关特征** (|r| > 0.3):
   - v_3 (-0.73), v_12 (0.69), v_8 (0.69), v_0 (0.63)
   - regDate (0.61), kilometer (-0.44), gearbox (0.33)

5. **数据质量问题**:
   - power存在异常值 (0值和超大值)
   - notRepairedDamage的'-'值需要处理

---
## 3. 数据预处理

In [None]:
# Work on copies of the raw frames, each tagged with its origin
# (is_train: 1 = training row, 0 = test row).
train = train_raw.assign(is_train=1)
test = test_raw.assign(is_train=0)

# Stack both splits so every preprocessing step is applied uniformly.
data = pd.concat([train, test], axis=0, ignore_index=True)
n_rows, n_cols = data.shape
print(f"合并后数据: {n_rows:,} 行, {n_cols} 列")

### 3.1 处理 notRepairedDamage

In [None]:
# Treat the '-' placeholder as missing, then make the column numeric.
print("处理前:")
print(data['notRepairedDamage'].value_counts())

damage = data['notRepairedDamage'].replace('-', np.nan)
data['notRepairedDamage'] = damage.astype(float)

print("\n处理后:")
print(data['notRepairedDamage'].value_counts(dropna=False))

### 3.2 处理 power 异常值

In [None]:
# Report the two kinds of power outliers before touching them.
zero_power = data['power'] == 0

print("处理前:")
print(f"  power=0 数量: {zero_power.sum()}")
print(f"  power>600 数量: {(data['power'] > 600).sum()}")
print(f"  power范围: [{data['power'].min()}, {data['power'].max()}]")

# Median over all strictly positive power values (computed on the merged
# train+test frame, and still including the >600 values).
power_median = data.loc[data['power'] > 0, 'power'].median()
print(f"  正常power中位数: {power_median}")

# Zero power is replaced by that median; anything above 600 is capped at 600.
# The cap mask is evaluated after the zero fill, in that order.
data.loc[zero_power, 'power'] = power_median
data.loc[data['power'] > 600, 'power'] = 600

print("\n处理后:")
print(f"  power范围: [{data['power'].min()}, {data['power'].max()}]")

### 3.3 优化数据类型

In [None]:
def reduce_memory_usage(df):
    """Downcast numeric columns of ``df`` in place to smaller NumPy dtypes.

    Signed-integer columns are converted to the narrowest of uint8/16/32
    (when the column minimum is non-negative) or int8/16/32 (otherwise)
    whose range contains the column's min and max, bounds inclusive.
    Float columns wider than 32 bits are converted to float32 when their
    min and max lie strictly inside the float32 range; note that float32
    keeps only about 7 significant decimal digits.

    Columns that are not plain NumPy signed-int or float dtypes (object,
    bool, datetime, categorical, already-unsigned ints, pandas nullable
    types) are left untouched, as are columns whose min/max is NaN
    (empty or all-missing columns).

    Args:
        df: DataFrame to optimize. It is modified in place.

    Returns:
        The same ``df`` object, after downcasting. A one-line before/after
        memory report is printed as a side effect.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        # Skip extension dtypes (e.g. categorical, whose min() can raise)
        # and every NumPy kind other than signed int / float.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if col_type.kind == 'i':
            if c_min >= 0:
                candidates = (np.uint8, np.uint16, np.uint32)
            else:
                candidates = (np.int8, np.int16, np.int32)
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a max of exactly 255 still fits uint8.
                # NaN min/max (empty column) fails both comparisons.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.itemsize > 4:
            # Only shrink wide floats; never widen float16 to float32.
            limits = np.finfo(np.float32)
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    # Guard the percentage against a zero-byte starting footprint.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f"内存: {start_mem:.2f}MB -> {end_mem:.2f}MB (减少 {saved_pct:.1f}%)")
    return df

# Downcast the merged frame; the function modifies its argument in place and
# returns that same object, so this rebinding keeps `data` pointing at it.
data = reduce_memory_usage(data)

---
## 4. 特征工程

### 4.1 时间特征

In [None]:
def parse_date(date_int):
    """Convert a ``YYYYMMDD`` value into a pandas Timestamp.

    Args:
        date_int: Anything ``int()`` accepts (int, float, NumPy scalar,
            digit string) whose integer form has exactly eight characters.

    Returns:
        The parsed ``pd.Timestamp``, or ``pd.NaT`` when the value cannot be
        converted to an integer (None, NaN, infinity, non-numeric text),
        when its integer form is not eight characters long, or when the
        eight characters are not a valid calendar date (for example a month
        of ``00``) -- parsing uses ``errors='coerce'``.
    """
    try:
        date_str = str(int(date_int))
    except (TypeError, ValueError, OverflowError):
        # None -> TypeError, NaN / text -> ValueError, inf -> OverflowError.
        return pd.NaT

    if len(date_str) != 8:
        return pd.NaT
    return pd.to_datetime(date_str, format='%Y%m%d', errors='coerce')

# Parse both integer date columns row by row; unparseable values become NaT.
# The parsed series are kept as locals instead of temporary frame columns.
reg_dt = data['regDate'].apply(parse_date)
creat_dt = data['creatDate'].apply(parse_date)

# Calendar components of the registration date and of the listing date.
for part in ('year', 'month', 'weekday', 'quarter'):
    data[f'reg_{part}'] = getattr(reg_dt.dt, part)
for part in ('year', 'month', 'weekday'):
    data[f'creat_{part}'] = getattr(creat_dt.dt, part)

# Vehicle age at listing time, in days and in (365.25-day) years.
# Flagged as a key feature by the original author.
data['car_age_days'] = (creat_dt - reg_dt).dt.days
data['car_age_years'] = data['car_age_days'] / 365.25

print("时间特征创建完成!")
print(f"车龄范围: {data['car_age_years'].min():.1f} ~ {data['car_age_years'].max():.1f} 年")

### 4.2 V系列统计特征

In [None]:
# Names of the anonymous features v_0 ... v_14.
v_cols = ['v_' + str(i) for i in range(15)]

# Row-wise summary statistics across the 15 v_* columns.
v_frame = data[v_cols]
for stat in ('mean', 'std', 'max', 'min', 'median'):
    data[f'v_{stat}'] = getattr(v_frame, stat)(axis=1)
data['v_range'] = data['v_max'] - data['v_min']
data['v_sum'] = v_frame.sum(axis=1)

# Pairwise products of the v_* features the EDA found most correlated
# with price.
for left, right in ((3, 12), (0, 12), (0, 8), (3, 8)):
    data[f'v{left}_v{right}_interaction'] = data[f'v_{left}'] * data[f'v_{right}']

# Non-linear transforms of v_0 and v_3; the 1e-5 offset keeps the ratio
# finite when v_3 is exactly zero.
data['v0_squared'] = data['v_0'] ** 2
data['v0_v3_ratio'] = data['v_0'] / (data['v_3'] + 1e-5)
data['v3_squared'] = data['v_3'] ** 2
data['v3_abs'] = data['v_3'].abs()

# Hand-weighted score: +0.3 for each of v_0, v_8, v_12 and -0.3 for v_3.
data['value_score'] = data['v_0']*0.3 + data['v_8']*0.3 + data['v_12']*0.3 - data['v_3']*0.3

print("V系列特征创建完成!")

### 4.3 类别交叉特征

In [None]:
# String keys that pair brand with each other categorical column
# ("<brand>_<other>"); missing values take part as their string form.
brand_text = data['brand'].astype(str)
for partner in ('model', 'bodyType', 'fuelType', 'gearbox'):
    data[f'brand_{partner}'] = brand_text + '_' + data[partner].astype(str)

print("类别交叉特征创建完成!")

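### 4.4 统计编码特征 (基于训练集计算，防止泄露)

> 注意: 统计量只使用训练集的 price 计算，因此不会用到测试集信息；但训练集中每一行的编码都包含该行自身的 price，第5节交叉验证中验证折的特征同样来自全量训练集统计，CV 分数可能偏乐观。如需严格评估，应在每一折内 (out-of-fold) 重新计算这些统计量。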

In [None]:
# Price statistics are computed from training rows only, so no test-set
# information enters the encodings.
# NOTE(review): every training row's encoding still includes that row's own
# price, and the later K-fold validation rows are encoded with statistics
# that saw their targets -- CV scores may be optimistic. Out-of-fold
# encoding would remove this; not changed here to keep features identical.
train_mask = data['is_train'] == 1
train_data = data[train_mask].copy()

# Global statistics, used as fallbacks for keys absent from the training rows
# (the std fallback also covers groups whose std is NaN).
global_mean = train_data['price'].mean()
global_median = train_data['price'].median()
global_std = train_data['price'].std()

_STAT_FALLBACK = {
    'mean': global_mean,
    'median': global_median,
    'std': global_std,
    'count': 1,
}


def _merge_price_stats(frame, source, key, aggs):
    """Left-merge per-``key`` price aggregates from ``source`` into ``frame``.

    New columns are named ``<key>_price_<agg>`` (``<key>_count`` for the
    count). Missing values after the merge are filled from
    ``_STAT_FALLBACK``. Returns ``(merged_frame, stats_table)``.
    """
    stats = source.groupby(key)['price'].agg(aggs)
    stats.columns = [
        f'{key}_count' if agg == 'count' else f'{key}_price_{agg}'
        for agg in aggs
    ]
    frame = frame.merge(stats, on=key, how='left')
    for agg, col in zip(aggs, stats.columns):
        # Assign the result back: `frame[col].fillna(..., inplace=True)` is
        # chained assignment and does not update the frame under pandas
        # Copy-on-Write.
        frame[col] = frame[col].fillna(_STAT_FALLBACK[agg])
    return frame, stats


# Encodings keyed by brand, by model, and by the brand_model combination.
data, brand_stats = _merge_price_stats(
    data, train_data, 'brand', ['mean', 'median', 'std', 'count'])
data, model_stats = _merge_price_stats(
    data, train_data, 'model', ['mean', 'median', 'std', 'count'])
data, brand_model_stats = _merge_price_stats(
    data, train_data, 'brand_model', ['mean', 'count'])

print("统计编码特征创建完成!")

### 4.5 其他衍生特征

In [None]:
# Shared pieces: vehicle age shifted by 0.1 so the ratios stay finite at
# age 0, and the integer labels used for both five-bucket binnings.
age_offset = data['car_age_years'] + 0.1
five_levels = [0, 1, 2, 3, 4]

# Power relative to age, and power bucketed at 75/110/150/200/600.
# pd.cut intervals are right-closed by default, so values outside (0, 600]
# get NaN.
data['power_per_age'] = data['power'] / age_offset
data['power_bin'] = pd.cut(
    data['power'], bins=[0, 75, 110, 150, 200, 600], labels=five_levels
).astype(float)

# Mileage relative to age, and mileage bucketed at 3/6/10/12.5/15.
data['km_per_year'] = data['kilometer'] / age_offset
data['km_bin'] = pd.cut(
    data['kilometer'], bins=[0, 3, 6, 10, 12.5, 15], labels=five_levels
).astype(float)

# Decile bins of the raw `name` value itself (despite "count" in the column
# name, no frequency is computed here); duplicate bin edges are dropped.
data['name_count_bin'] = pd.qcut(data['name'], q=10, labels=False, duplicates='drop')

print("其他衍生特征创建完成!")

### 4.6 缺失值填充

In [None]:
# Fill values are derived from training rows only.
train_data = data[data['is_train'] == 1]

# Numeric columns: fill with the training median.
# Results are assigned back to the frame: `data[col].fillna(..., inplace=True)`
# is chained assignment and does not modify `data` under pandas Copy-on-Write.
# NOTE(review): `price` is numeric too, so if the test rows carry NaN there
# they receive the training median; only training-row prices are read later
# in this notebook -- confirm that is intended.
numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    if data[col].isnull().any():
        data[col] = data[col].fillna(train_data[col].median())

# Object columns: fill with the most frequent training value, when the
# training rows have at least one non-null value for the column.
cat_cols = data.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    if data[col].isnull().any():
        mode_val = train_data[col].mode()
        if len(mode_val) > 0:
            data[col] = data[col].fillna(mode_val.iloc[0])

print(f"缺失值填充完成! 剩余缺失值: {data.isnull().sum().sum()}")

In [None]:
# Remove the temporary string-valued cross columns before modelling.
# In the cells visible in this notebook only brand_model feeds the price
# statistics; the other three are created and dropped without being used.
cross_cols = ['brand_model', 'brand_bodyType', 'brand_fuelType', 'brand_gearbox']
data.drop(columns=cross_cols, inplace=True, errors='ignore')

final_columns = data.columns.tolist()
print(f"最终特征数量: {data.shape[1]}")
print(f"\n特征列表: {final_columns}")

---
## 5. 模型训练与评估

### 5.1 准备训练数据

In [None]:
# Split the merged frame back into its two parts and drop the origin flag.
train_rows = data[data['is_train'] == 1]
test_rows = data[data['is_train'] == 0]
train_df = train_rows.drop(columns='is_train').reset_index(drop=True)
test_df = test_rows.drop(columns='is_train').reset_index(drop=True)

# Every column except the target and the row id is a model input.
target_col = 'price'
id_col = 'SaleID'
excluded = {target_col, id_col}
feature_cols = [c for c in train_df.columns if c not in excluded]

# Columns the models treat as categorical, restricted to those that are
# actually present among the features.
candidate_cats = [
    'brand', 'model', 'bodyType', 'fuelType', 'gearbox',
    'notRepairedDamage', 'regionCode', 'reg_month', 'reg_weekday',
    'reg_quarter', 'creat_month', 'creat_weekday', 'power_bin',
    'km_bin', 'name_count_bin', 'reg_year',
]
available = set(feature_cols)
cat_features = [name for name in candidate_cats if name in available]

print(f"训练集: {train_df.shape}")
print(f"测试集: {test_df.shape}")
print(f"特征数: {len(feature_cols)}")
print(f"类别特征数: {len(cat_features)}")

In [None]:
# Model inputs for both splits, the raw target, and the ids for submission.
X, X_test = train_df[feature_cols].copy(), test_df[feature_cols].copy()
y = train_df[target_col].to_numpy()
test_ids = test_df[id_col].to_numpy()

# Models are trained on log1p(price); predictions are mapped back with expm1.
y_log = np.log1p(y)

print(f"目标变换: log1p")
print(f"原始范围: [{y.min()}, {y.max()}]")
print(f"变换后范围: [{y_log.min():.2f}, {y_log.max():.2f}]")

### 5.2 CatBoost模型训练

In [None]:
from catboost import CatBoostRegressor, Pool

# CatBoost receives its categorical columns as strings; the remaining
# columns keep their dtype.
as_text = dict.fromkeys(cat_features, str)
X_cat = X.astype(as_text)
X_test_cat = X_test.astype(as_text)

# Positions of the categorical columns within the feature list.
cat_indices = [feature_cols.index(name) for name in cat_features]

# Hyper-parameters: trained with RMSE loss on the log target, monitored and
# early-stopped on MAE.
catboost_params = dict(
    iterations=10000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=10,
    min_data_in_leaf=50,
    random_strength=0.5,
    bagging_temperature=0.2,
    border_count=254,
    random_seed=RANDOM_STATE,
    verbose=500,
    early_stopping_rounds=200,
    loss_function='RMSE',
    eval_metric='MAE',
)

print("CatBoost参数配置完成!")

In [None]:
# K-fold cross-validation of CatBoost on the log-transformed target.
# Collects out-of-fold predictions, the fold-averaged test prediction and the
# per-fold MAE, all expressed on the original price scale.
oof_pred_cat = np.zeros(len(X))
test_pred_cat = np.zeros(len(X_test))
cat_scores = []

# Shuffled, seeded splitter; the LightGBM cell below reuses this same object.
kfold = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

for fold, (train_idx, valid_idx) in enumerate(kfold.split(X_cat)):
    print(f"\n{'='*50}")
    print(f"Fold {fold+1}/{N_FOLDS}")
    print(f"{'='*50}")
    
    X_train, X_valid = X_cat.iloc[train_idx], X_cat.iloc[valid_idx]
    y_train, y_valid = y_log[train_idx], y_log[valid_idx]
    
    train_pool = Pool(X_train, y_train, cat_features=cat_indices)
    valid_pool = Pool(X_valid, y_valid, cat_features=cat_indices)
    
    # Fit with the validation fold as eval set and keep the best iteration.
    model = CatBoostRegressor(**catboost_params)
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    
    # Predict in log space, invert with expm1; validation predictions are
    # clipped at 0 here, the test average is clipped once after the loop.
    valid_pred = np.clip(np.expm1(model.predict(X_valid)), 0, None)
    test_pred_cat += np.expm1(model.predict(X_test_cat)) / N_FOLDS
    
    # Fold MAE on the original price scale.
    fold_mae = mean_absolute_error(np.expm1(y_valid), valid_pred)
    cat_scores.append(fold_mae)
    oof_pred_cat[valid_idx] = valid_pred
    
    print(f"Fold {fold+1} MAE: {fold_mae:.2f}")

# The averaged test prediction is clipped at 0 as well.
test_pred_cat = np.clip(test_pred_cat, 0, None)

print(f"\n{'='*50}")
print(f"CatBoost 5折CV结果")
print(f"{'='*50}")
print(f"各折MAE: {[f'{s:.2f}' for s in cat_scores]}")
print(f"平均MAE: {np.mean(cat_scores):.2f} (+/- {np.std(cat_scores):.2f})")
print(f"OOF MAE: {mean_absolute_error(y, oof_pred_cat):.2f}")

### 5.3 LightGBM模型训练

In [None]:
import lightgbm as lgb

# LightGBM gets integer codes for the categorical columns. Each encoder is
# fitted on the string form of train and test values together so that both
# frames share one code book.
X_lgb = X.copy()
X_test_lgb = X_test.copy()

for col in cat_features:
    train_labels = X_lgb[col].astype(str)
    test_labels = X_test_lgb[col].astype(str)
    le = LabelEncoder().fit(pd.concat([train_labels, test_labels], axis=0))
    X_lgb[col] = le.transform(train_labels)
    X_test_lgb[col] = le.transform(test_labels)

# Hyper-parameters: regression objective on the log target, MAE as the
# evaluation metric.
lgb_params = dict(
    objective='regression',
    metric='mae',
    boosting_type='gbdt',
    num_leaves=255,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    min_child_samples=50,
    lambda_l1=0.1,
    lambda_l2=10,
    random_state=RANDOM_STATE,
    verbose=-1,
    n_jobs=-1,
)

print("LightGBM参数配置完成!")

In [None]:
# K-fold cross-validation of LightGBM on the log-transformed target, using
# the `kfold` splitter created in the CatBoost cell.
# Collects out-of-fold predictions, the fold-averaged test prediction and the
# per-fold MAE, all expressed on the original price scale.
oof_pred_lgb = np.zeros(len(X))
test_pred_lgb = np.zeros(len(X_test))
lgb_scores = []

for fold, (train_idx, valid_idx) in enumerate(kfold.split(X_lgb)):
    print(f"\n{'='*50}")
    print(f"Fold {fold+1}/{N_FOLDS}")
    print(f"{'='*50}")
    
    X_train, X_valid = X_lgb.iloc[train_idx], X_lgb.iloc[valid_idx]
    y_train, y_valid = y_log[train_idx], y_log[valid_idx]
    
    # NOTE: this rebinds `train_data`, which earlier cells used for the
    # training-rows DataFrame; from here on it is a lgb.Dataset.
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
    valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_features)
    
    # Up to 10000 rounds with early stopping (patience 200); metrics are
    # logged every 500 rounds for both the train and the validation set.
    model = lgb.train(
        lgb_params, train_data, num_boost_round=10000,
        valid_sets=[train_data, valid_data], valid_names=['train', 'valid'],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)]
    )
    
    # Predict in log space, invert with expm1; validation predictions are
    # clipped at 0 here, the test average is clipped once after the loop.
    valid_pred = np.clip(np.expm1(model.predict(X_valid)), 0, None)
    test_pred_lgb += np.expm1(model.predict(X_test_lgb)) / N_FOLDS
    
    # Fold MAE on the original price scale.
    fold_mae = mean_absolute_error(np.expm1(y_valid), valid_pred)
    lgb_scores.append(fold_mae)
    oof_pred_lgb[valid_idx] = valid_pred
    
    print(f"Fold {fold+1} MAE: {fold_mae:.2f}")

# The averaged test prediction is clipped at 0 as well.
test_pred_lgb = np.clip(test_pred_lgb, 0, None)

print(f"\n{'='*50}")
print(f"LightGBM 5折CV结果")
print(f"{'='*50}")
print(f"各折MAE: {[f'{s:.2f}' for s in lgb_scores]}")
print(f"平均MAE: {np.mean(lgb_scores):.2f} (+/- {np.std(lgb_scores):.2f})")
print(f"OOF MAE: {mean_absolute_error(y, oof_pred_lgb):.2f}")

---
## 6. 结果分析与总结

### 6.1 模型对比

In [None]:
# One row per model: mean and spread of the fold MAEs, plus the MAE of the
# stitched out-of-fold predictions.
runs = [
    ('CatBoost', cat_scores, oof_pred_cat),
    ('LightGBM', lgb_scores, oof_pred_lgb),
]
results = pd.DataFrame({
    'Model': [name for name, _, _ in runs],
    'Mean MAE': [np.mean(scores) for _, scores, _ in runs],
    'Std MAE': [np.std(scores) for _, scores, _ in runs],
    'OOF MAE': [mean_absolute_error(y, oof) for _, _, oof in runs],
})

print("模型性能对比:")
print(results.to_string(index=False))

### 6.2 生成提交文件

In [None]:
# Pick the model with the lower out-of-fold MAE instead of hard-coding the
# winner of a past run; CatBoost is kept on a tie, matching the previous
# behaviour whenever it is at least as good.
oof_mae_cat = mean_absolute_error(y, oof_pred_cat)
oof_mae_lgb = mean_absolute_error(y, oof_pred_lgb)
if oof_mae_cat <= oof_mae_lgb:
    best_name, best_pred = 'CatBoost', test_pred_cat
else:
    best_name, best_pred = 'LightGBM', test_pred_lgb
print(f"选用模型: {best_name} (OOF MAE {min(oof_mae_cat, oof_mae_lgb):.2f})")

# Submission frame: one predicted price per test-set SaleID.
submission = pd.DataFrame({
    'SaleID': test_ids.astype(int),
    'price': best_pred
})

submission.to_csv('submission.csv', index=False)
print("提交文件已保存: submission.csv")
print(f"预测价格范围: [{submission['price'].min():.0f}, {submission['price'].max():.0f}]")
print(f"预测价格均值: {submission['price'].mean():.0f}")

# Show the first rows as the cell output.
submission.head(10)

### 6.3 项目总结

**项目成果:**

1. **数据探索**: 发现价格分布右偏、V系列特征与价格强相关、存在缺失值和异常值

2. **特征工程**: 构建了40个新特征 (另有4个类别交叉字符串列仅作为中间变量，建模前已删除)
   - 时间特征: 车龄、注册月份等
   - V系列统计特征: 均值、标准差、交互特征等
   - 统计编码特征: 基于brand/model的价格统计

3. **模型训练**: 
   - CatBoost: MAE ≈ 476
   - LightGBM: MAE ≈ 516

4. **重要特征**: value_score, v3_v12_interaction, v3_squared, v3_abs, regDate

**改进方向:**
- 尝试更多特征交互
- 模型融合 (Stacking/Blending)
- 超参数调优 (Optuna)
- 处理高杠杆点和异常值