In [98]:
import pandas as pd

# 数据读取

## 车辆于各省的销售数据

In [99]:
train_sales_data = pd.read_csv('train/train_sales_data.csv')
print(train_sales_data.shape)
train_sales_data.head()

(31680, 7)


Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610


## 车辆于各省的搜索数据

In [100]:
train_search_data = pd.read_csv('train/train_search_data.csv')
print(train_search_data.shape)
train_search_data.head()

(31680, 6)


Unnamed: 0,province,adcode,model,regYear,regMonth,popularity
0,河南,410000,17bc272c93f19d56,2016,1,19036
1,河南,410000,17bc272c93f19d56,2016,2,17856
2,河南,410000,17bc272c93f19d56,2016,3,12517
3,河南,410000,17bc272c93f19d56,2016,4,9700
4,河南,410000,17bc272c93f19d56,2016,5,12780


## 评论、评价数据数据

In [101]:
train_user_reply_data = pd.read_csv('train/train_user_reply_data.csv')
print(train_user_reply_data.shape)
train_user_reply_data.head(24)

(1440, 5)


Unnamed: 0,model,regYear,regMonth,carCommentVolum,newsReplyVolum
0,02aab221aabc03b9,2016,1,132,399
1,02aab221aabc03b9,2016,2,160,3043
2,02aab221aabc03b9,2016,3,357,798
3,02aab221aabc03b9,2016,4,243,3821
4,02aab221aabc03b9,2016,5,283,933
5,02aab221aabc03b9,2016,6,252,143
6,02aab221aabc03b9,2016,7,557,623
7,02aab221aabc03b9,2016,8,652,1809
8,02aab221aabc03b9,2016,9,506,1033
9,02aab221aabc03b9,2016,10,649,4699


# 公共方法

In [102]:
def genDate(train_data):
    """
    将年月数据拼接起来，组成datetime类型
    """
    # 拼接时间数据
    train_data['Date'] = train_data['regYear'].apply(str) + '-' + train_data['regMonth'].apply(str)

    # 删除旧的年月数据
    del( train_data['regYear'] )
    del( train_data['regMonth'] )


    train_data['Date'] = pd.to_datetime(train_data['Date'])

    train_data = train_data.set_index('Date')
    return train_data

In [103]:
def get_model_province_data(train_data):
    """
    按照类型、省的层次构造数据。
    """
    car_models = []

    for model_num, model in train_data:
        provinces = model.groupby('province')

        model_province_data = []
        for province_name, province_data in provinces:
            model_province_data.append({'province_name' : province_name, 'province_data': province_data})

        car_models.append({'model_num' :  model_num,  'model_province_data' : model_province_data })
    
    return car_models

# 数据处理

## 销售数据与搜索数据处理

In [104]:
# 拼接销售数据与检索数据
sales_search_data = pd.merge(train_sales_data, train_search_data)
print(sales_search_data.shape)
sales_search_data.head()

(31680, 8)


Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume,popularity
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292,1479
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466,1594
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257,1479
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408,2370
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610,3562


In [105]:
sales_search_data.shape

(31680, 8)

In [106]:
train_user_reply_data.head()

Unnamed: 0,model,regYear,regMonth,carCommentVolum,newsReplyVolum
0,02aab221aabc03b9,2016,1,132,399
1,02aab221aabc03b9,2016,2,160,3043
2,02aab221aabc03b9,2016,3,357,798
3,02aab221aabc03b9,2016,4,243,3821
4,02aab221aabc03b9,2016,5,283,933


## 加入评论数据

In [107]:
total_data = pd.merge(sales_search_data, train_user_reply_data)

In [108]:
print(total_data.shape)
total_data.head()

(31680, 10)


Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume,popularity,carCommentVolum,newsReplyVolum
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292,1479,11,106
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466,1594,11,106
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257,1479,11,106
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408,2370,11,106
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610,3562,11,106


In [109]:
#  构建日期数据

# # 1.将日期数据设置为index
total_data = genDate(total_data)

# 2.不设置为index但是构造新的一个ds列
# total_data['ds'] = total_data['regYear'].apply(str) + '-' + total_data['regMonth'].apply(str)
# del( total_data['regYear'] )
# del( total_data['regMonth'] )

# total_data.rename(columns = {'salesVolume' : 'y'}, inplace = True)

total_data.head()

Unnamed: 0_level_0,province,adcode,model,bodyType,salesVolume,popularity,carCommentVolum,newsReplyVolum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-01,上海,310000,3c974920a76ac9c1,SUV,292,1479,11,106
2016-01-01,云南,530000,3c974920a76ac9c1,SUV,466,1594,11,106
2016-01-01,内蒙古,150000,3c974920a76ac9c1,SUV,257,1479,11,106
2016-01-01,北京,110000,3c974920a76ac9c1,SUV,408,2370,11,106
2016-01-01,四川,510000,3c974920a76ac9c1,SUV,610,3562,11,106


In [110]:
# 按照种类、省市划分
total_data = total_data.groupby('model')
len(total_data)

60

In [111]:
total_data = get_model_province_data(total_data)

## 测试数据


In [112]:
evaluation_data = pd.read_csv('evaluation_public.csv')
print(evaluation_data.shape)
evaluation_data.head()

(5280, 7)


Unnamed: 0,id,province,adcode,model,regYear,regMonth,forecastVolum
0,1,上海,310000,3c974920a76ac9c1,2018,1,
1,2,云南,530000,3c974920a76ac9c1,2018,1,
2,3,内蒙古,150000,3c974920a76ac9c1,2018,1,
3,4,北京,110000,3c974920a76ac9c1,2018,1,
4,5,四川,510000,3c974920a76ac9c1,2018,1,


In [113]:
evaluation_data = genDate(evaluation_data)
evaluation_data = evaluation_data.groupby('model')
evaluation_data = get_model_province_data(evaluation_data)

# 输出测试

In [114]:
a = total_data[0]

In [115]:
a['model_num']

'02aab221aabc03b9'

In [116]:
b = a['model_province_data']
len(b)

22

In [117]:
b[0]['province_name']

'上海'

In [118]:
one_province_data = b[1]['province_data']
one_province_data

Unnamed: 0_level_0,province,adcode,model,bodyType,salesVolume,popularity,carCommentVolum,newsReplyVolum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-01,云南,530000,02aab221aabc03b9,Sedan,238,204,132,399
2016-02-01,云南,530000,02aab221aabc03b9,Sedan,87,184,160,3043
2016-03-01,云南,530000,02aab221aabc03b9,Sedan,174,195,357,798
2016-04-01,云南,530000,02aab221aabc03b9,Sedan,151,192,243,3821
2016-05-01,云南,530000,02aab221aabc03b9,Sedan,202,189,283,933
2016-06-01,云南,530000,02aab221aabc03b9,Sedan,185,208,252,143
2016-07-01,云南,530000,02aab221aabc03b9,Sedan,247,240,557,623
2016-08-01,云南,530000,02aab221aabc03b9,Sedan,209,236,652,1809
2016-09-01,云南,530000,02aab221aabc03b9,Sedan,201,279,506,1033
2016-10-01,云南,530000,02aab221aabc03b9,Sedan,174,252,649,4699


# 模型训练

## 基于使用fbprophet

In [119]:
# from fbprophet import Prophet
# model = Prophet()
# model.fit(one_province_data)

In [120]:
# future = model.make_future_dataframe(periods=4, freq='M')
# future.tail()

In [121]:
# # 预测数据集
# forecast = model.predict(future)
# forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [122]:
# # 展示预测结果
# model.plot(forecast);

In [123]:
# model = Prophet(seasonality_prior_scale=100,
#                 holidays_prior_scale=100,
#                 uncertainty_samples=30)
# model.fit(one_province_data)

In [124]:
# # 预测数据集
# forecast = model.predict(future)
# forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [125]:
# # 展示预测结果
# model.plot(forecast);

## lightgbm

## 划分数据

In [134]:
to_drop = ['province', 'adcode', 'model', 'bodyType']
model_data =  one_province_data
model_data = model_data.drop(to_drop, axis='columns')


train = model_data[0:20]

x_train = train.drop('salesVolume', axis = 'columns')
y_train = train['salesVolume']

test = model_data[20:]
x_test = test.drop('salesVolume', axis = 'columns')
y_test = test['salesVolume']

In [135]:
x_test

Unnamed: 0_level_0,popularity,carCommentVolum,newsReplyVolum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-09-01,1028,1058,378
2017-10-01,186,784,1947
2017-11-01,341,608,1113
2017-12-01,193,403,2037


In [136]:
y_test

Date
2017-09-01    174
2017-10-01    213
2017-11-01    221
2017-12-01    250
Name: salesVolume, dtype: int64

In [137]:
import lightgbm as lgb


lgb_train = lgb.Dataset(x_train, y_train)
lgb_test = lgb.Dataset(x_test, x_test, reference=lgb_train)

params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'l2', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
        }

model = lgb.train(params,
                  lgb_train,
                  num_boost_round=20,
                  valid_sets=lgb_test, 
                  early_stopping_rounds=5)

y_pred = model.predict(x_test, num_iteration=model.best_iteration)

ValueError: DataFrame for label cannot have multiple columns