In [1]:
import pandas as pd

# 数据读取

## 车辆于各省的销售数据

In [2]:
train_sales_data = pd.read_csv('train/train_sales_data.csv')
print(train_sales_data.shape)
train_sales_data.head()

(31680, 7)


Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610


## 车辆于各省的搜索数据

In [3]:
train_search_data = pd.read_csv('train/train_search_data.csv')
print(train_search_data.shape)
train_search_data.head()

(31680, 6)


Unnamed: 0,province,adcode,model,regYear,regMonth,popularity
0,河南,410000,17bc272c93f19d56,2016,1,19036
1,河南,410000,17bc272c93f19d56,2016,2,17856
2,河南,410000,17bc272c93f19d56,2016,3,12517
3,河南,410000,17bc272c93f19d56,2016,4,9700
4,河南,410000,17bc272c93f19d56,2016,5,12780


## 评论、评价数据数据

In [4]:
train_user_reply_data = pd.read_csv('train/train_user_reply_data.csv')
print(train_user_reply_data.shape)
train_user_reply_data.head(24)

(1440, 5)


Unnamed: 0,model,regYear,regMonth,carCommentVolum,newsReplyVolum
0,02aab221aabc03b9,2016,1,132,399
1,02aab221aabc03b9,2016,2,160,3043
2,02aab221aabc03b9,2016,3,357,798
3,02aab221aabc03b9,2016,4,243,3821
4,02aab221aabc03b9,2016,5,283,933
5,02aab221aabc03b9,2016,6,252,143
6,02aab221aabc03b9,2016,7,557,623
7,02aab221aabc03b9,2016,8,652,1809
8,02aab221aabc03b9,2016,9,506,1033
9,02aab221aabc03b9,2016,10,649,4699


# 公共方法

In [5]:
def genDate(train_data):
    """
    将年月数据拼接起来，组成datetime类型
    """
    # 拼接时间数据
    train_data['Date'] = train_data['regYear'].apply(str) + '-' + train_data['regMonth'].apply(str)

    # 删除旧的年月数据
    del( train_data['regYear'] )
    del( train_data['regMonth'] )


    train_data['Date'] = pd.to_datetime(train_data['Date'])

    train_data = train_data.set_index('Date')
    return train_data

In [6]:
def get_model_province_data(train_data):
    """
    按照类型、省的层次构造数据。
    """
    car_models = []

    for model_num, model in train_data:
        provinces = model.groupby('province')

        model_province_data = []
        for province_name, province_data in provinces:
            model_province_data.append({'province_name' : province_name, 'province_data': province_data})

        car_models.append({'model_num' :  model_num,  'model_province_data' : model_province_data })
    
    return car_models

# 数据处理

## 销售数据与搜索数据处理

In [7]:
# 拼接销售数据与检索数据
sales_search_data = pd.merge(train_sales_data, train_search_data)
print(sales_search_data.shape)
sales_search_data.head()

(31680, 8)


Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume,popularity
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292,1479
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466,1594
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257,1479
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408,2370
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610,3562


In [8]:
sales_search_data.shape

(31680, 8)

In [9]:
train_user_reply_data.head()

Unnamed: 0,model,regYear,regMonth,carCommentVolum,newsReplyVolum
0,02aab221aabc03b9,2016,1,132,399
1,02aab221aabc03b9,2016,2,160,3043
2,02aab221aabc03b9,2016,3,357,798
3,02aab221aabc03b9,2016,4,243,3821
4,02aab221aabc03b9,2016,5,283,933


## 加入评论数据

In [10]:
total_data = pd.merge(sales_search_data, train_user_reply_data)

In [11]:
print(total_data.shape)
total_data.head()

(31680, 10)


Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume,popularity,carCommentVolum,newsReplyVolum
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292,1479,11,106
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466,1594,11,106
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257,1479,11,106
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408,2370,11,106
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610,3562,11,106


In [12]:
#  构建日期数据

# # 1.将日期数据设置为index
# total_data = genDate(total_data)

# 2.不设置为index但是构造新的一个ds列
total_data['ds'] = total_data['regYear'].apply(str) + '-' + total_data['regMonth'].apply(str)
del( total_data['regYear'] )
del( total_data['regMonth'] )

total_data.rename(columns = {'salesVolume' : 'y'}, inplace = True)

# 删除地区码，因为他和省重复了
del( total_data['adcode'] )
total_data.head()

Unnamed: 0,province,model,bodyType,y,popularity,carCommentVolum,newsReplyVolum,ds
0,上海,3c974920a76ac9c1,SUV,292,1479,11,106,2016-1
1,云南,3c974920a76ac9c1,SUV,466,1594,11,106,2016-1
2,内蒙古,3c974920a76ac9c1,SUV,257,1479,11,106,2016-1
3,北京,3c974920a76ac9c1,SUV,408,2370,11,106,2016-1
4,四川,3c974920a76ac9c1,SUV,610,3562,11,106,2016-1


In [13]:
# 按照种类、省市划分
total_data = total_data.groupby('model')
len(total_data)

60

In [14]:
total_data = get_model_province_data(total_data)

## 测试数据


In [15]:
evaluation_data = pd.read_csv('evaluation_public.csv')
print(evaluation_data.shape)
evaluation_data.head()

(5280, 7)


Unnamed: 0,id,province,adcode,model,regYear,regMonth,forecastVolum
0,1,上海,310000,3c974920a76ac9c1,2018,1,
1,2,云南,530000,3c974920a76ac9c1,2018,1,
2,3,内蒙古,150000,3c974920a76ac9c1,2018,1,
3,4,北京,110000,3c974920a76ac9c1,2018,1,
4,5,四川,510000,3c974920a76ac9c1,2018,1,


In [16]:
evaluation_data = genDate(evaluation_data)
evaluation_data = evaluation_data.groupby('model')
evaluation_data = get_model_province_data(evaluation_data)

# 输出测试

In [17]:
a = total_data[0]

In [18]:
a['model_num']

'02aab221aabc03b9'

In [19]:
b = a['model_province_data']
len(b)

22

In [20]:
b[0]['province_name']

'上海'

In [21]:
one_province_data = b[0]['province_data']
one_province_data

Unnamed: 0,province,model,bodyType,y,popularity,carCommentVolum,newsReplyVolum,ds
506,上海,02aab221aabc03b9,Sedan,617,515,132,399,2016-1
1826,上海,02aab221aabc03b9,Sedan,279,508,160,3043,2016-2
3146,上海,02aab221aabc03b9,Sedan,554,603,357,798,2016-3
4466,上海,02aab221aabc03b9,Sedan,507,515,243,3821,2016-4
5786,上海,02aab221aabc03b9,Sedan,485,481,283,933,2016-5
7106,上海,02aab221aabc03b9,Sedan,527,501,252,143,2016-6
8426,上海,02aab221aabc03b9,Sedan,653,549,557,623,2016-7
9746,上海,02aab221aabc03b9,Sedan,687,582,652,1809,2016-8
11066,上海,02aab221aabc03b9,Sedan,651,597,506,1033,2016-9
12386,上海,02aab221aabc03b9,Sedan,489,593,649,4699,2016-10


# 模型训练

In [22]:
from fbprophet import Prophet
model = Prophet()
model.fit(one_province_data)

ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations.Using 18.


<fbprophet.forecaster.Prophet at 0x7f868b3db6d8>

In [25]:
future = model.make_future_dataframe(periods=5, freq='M')
future

Unnamed: 0,ds
0,2016-01-01
1,2016-02-01
2,2016-03-01
3,2016-04-01
4,2016-05-01
5,2016-06-01
6,2016-07-01
7,2016-08-01
8,2016-09-01
9,2016-10-01
