In [2]:
%pylab inline
import ast
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython import display
from sklearn.linear_model import Ridge, Lasso

Populating the interactive namespace from numpy and matplotlib


In [3]:
from ipywidgets import widgets
from ipywidgets.widgets import interact, interactive, fixed
from ipywidgets import *

Загрузим все данные:
- количество поездок из района
- количество поездок в район
- суммарная стоимость поездок из района за час
- суммарное количество пассажиров за час

In [5]:
%%time
# Region ids to keep: the first row of SignificantRegions.txt, as strings so
# they can be used as column labels of the hourly tables.
significant_regions = pd.read_csv('SignificantRegions.txt', sep=',', header=None)
reg_list = significant_regions.values.astype(str)[0]


def _load_hourly_table(cache_file, data_dir, prefix):
    """Return one hourly table restricted to the columns in ``reg_list``.

    If ``cache_file`` exists it is read and returned as is. Otherwise every
    file in ``data_dir`` whose name starts with ``prefix`` is read, the pieces
    are concatenated, the columns in ``reg_list`` are selected, and the result
    is written to ``cache_file`` before being returned.
    """
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file, index_col=0)

    # sorted() makes the concatenation order deterministic; os.listdir gives
    # no ordering guarantee.
    parts = [pd.read_csv(os.path.join(data_dir, fname), index_col=0)
             for fname in sorted(os.listdir(data_dir))
             if fname.startswith(prefix)]
    # A single concat instead of appending inside the loop (append re-copies
    # the accumulated frame on every iteration).
    table = pd.concat(parts)[reg_list]
    table.to_csv(cache_file)
    return table


# trips leaving each region
trips_from_region = _load_hourly_table('trips_from_region.csv', './trip_count', 'trip_count_')
# total fare of the trips leaving each region
amount = _load_hourly_table('amount.csv', './amount', 'amount_')
# total number of passengers
passengers = _load_hourly_table('passengers.csv', './passengers', 'passengers_count_')
# trips arriving in each region (the original cell referenced an undefined
# name, `trip_to_region`, when building this table from the raw files)
trips_to_region = _load_hourly_table('trips_to_region.csv', './trip_to_region', 'trip_to_region_count_')

Wall time: 2.21 s


Соберем все признаки, которые придумаем, в одну таблицу, а при обучении моделей будем исключать признаки, которые ухудшают качество.

In [6]:
def CreateFeatures(from_reg, to_reg, amt, psg, region):
    """Build the feature and target table for a single region.

    Parameters
    ----------
    from_reg, to_reg, amt, psg : pandas.DataFrame
        Tables with one row per time step and one column per region
        (trips from the region, trips to the region, total fare, passengers).
        The index must be parseable by ``pd.to_datetime``.
    region : column label
        Column to take from each of the four tables.

    Returns
    -------
    pandas.DataFrame
        Calendar/harmonic features, lagged and rolling features of the region
        and the six target columns ``y_t+1`` .. ``y_t+6``. Rows containing
        missing values are dropped. Columns collected in ``to_normalize`` are
        standardised with the mean/std of the returned rows; the one-hot
        columns, the holiday flags and the targets are left untouched.
    """
    hinw = 24 * 7     # hours in a week
    hiny = 24 * 365   # hours in a year
    n_obj = from_reg.shape[0]

    def _region_series(table):
        # Put every input on a DatetimeIndex so that the column assignments
        # below align on timestamps even when the tables carry a string index
        # (as they do after pd.read_csv(..., index_col=0)).
        series = table[region].copy()
        series.index = pd.to_datetime(series.index)
        return series

    trips_out = _region_series(from_reg)
    trips_in = _region_series(to_reg)
    fare = _region_series(amt)
    # Use the `psg` argument; the original body read the global `passengers`.
    riders = _region_series(psg)

    # names of the columns that get standardised at the end
    to_normalize = []

    common_features = pd.DataFrame(index=trips_out.index)
    common_features['trend'] = np.arange(n_obj)
    to_normalize.append('trend')

    # sine/cosine harmonics
    steps = np.arange(n_obj)
    for k in range(1, 50):
        # weekly seasonality
        common_features['ws' + str(k)] = np.sin(steps * (2.0 * np.pi * k / hinw))
        common_features['wc' + str(k)] = np.cos(steps * (2.0 * np.pi * k / hinw))
        to_normalize.extend(['ws' + str(k), 'wc' + str(k)])

    for k in range(1, 37):
        # yearly seasonality
        common_features['ys' + str(k)] = np.sin(steps * (2.0 * np.pi * k / hiny))
        common_features['yc' + str(k)] = np.cos(steps * (2.0 * np.pi * k / hiny))
        to_normalize.extend(['ys' + str(k), 'yc' + str(k)])

    # calendar attributes of every row, taken once from the index
    stamps = common_features.index
    month = np.asarray(stamps.month)
    day = np.asarray(stamps.day)
    weekday = np.asarray(stamps.dayofweek)
    hour = np.asarray(stamps.hour)

    # one-hot: month of year
    for i in range(1, 13):
        common_features['m' + str(i)] = (month == i).astype(int)

    # one-hot: day of month
    for i in range(1, 32):
        common_features['dom' + str(i)] = (day == i).astype(int)

    # one-hot: day of week
    for i in range(0, 7):
        common_features['dow' + str(i)] = (weekday == i).astype(int)

    # one-hot: hour of day
    for i in range(0, 24):
        common_features['h' + str(i)] = (hour == i).astype(int)

    # the same attributes as plain numeric columns
    common_features['m'] = month
    common_features['dom'] = day
    common_features['dow'] = weekday
    common_features['h'] = hour
    to_normalize.extend(['m', 'dom', 'dow', 'h'])

    # Halloween night: Oct 31 from 22:00 through Nov 1 03:00
    # (the column name keeps its original spelling; saved skip lists refer to it)
    halloween = ((month == 10) & (day == 31) & (hour >= 22)) | \
                ((month == 11) & (day == 1) & (hour <= 3))
    common_features['helloween'] = halloween.astype(int)

    # New Year's night: Dec 31 from 18:00 through Jan 1 06:00
    new_year = ((month == 12) & (day == 31) & (hour >= 18)) | \
               ((month == 1) & (day == 1) & (hour <= 6))
    common_features['new_year'] = new_year.astype(int)

    region_features = pd.DataFrame(index=common_features.index)

    # trips from the region in the current hour
    region_features['y_t'] = trips_out
    to_normalize.append('y_t')

    # lags 1..12, then 18..23 (i.e. 24 hours before the forecast hour: t-18 for
    # the 6-hour horizon, t-19 for the 5-hour horizon, ...), then 24 and 48
    out_lags = list(range(1, 13)) + list(range(18, 24)) + [24 * i for i in range(1, 3)]
    for lag in out_lags:
        region_features['y_t-' + str(lag)] = trips_out.shift(lag)
        to_normalize.append('y_t-' + str(lag))

    # rolling sums over the previous 3, 6, 12, 24 and 48 hours, each also taken
    # from the moments t-1 .. t-5
    windows = (3, 6, 12, 24, 48)
    for i in range(6):
        shifted = trips_out.shift(i)
        for w in windows:
            name = 'sum_%dh(t-%d)' % (w, i)
            region_features[name] = shifted.rolling(w, min_periods=w).sum()
            to_normalize.append(name)

    # rolling means over the same windows
    for i in range(6):
        for w in windows:
            name = 'mean_%dh(t-%d)' % (w, i)
            region_features[name] = region_features['sum_%dh(t-%d)' % (w, i)] / float(w)
            to_normalize.append(name)

    # mean fare per trip; hours where the ratio is undefined (NaN) are set to 0
    mean_fare = (fare / trips_out).fillna(0)
    for i in range(6):
        for lag in (i, i + 6, i + 18):
            region_features['mean_amount_t-' + str(lag)] = mean_fare.shift(lag)
            to_normalize.append('mean_amount_t-' + str(lag))

    # sum of the mean fare over the previous 3 hours
    region_features['mean_amount_3h'] = \
        region_features['mean_amount_t-0'].shift(1).rolling(3, min_periods=3).sum()
    to_normalize.append('mean_amount_3h')

    # trips into the region, current hour
    region_features['z_t'] = trips_in
    to_normalize.append('z_t')

    # lags 1..6 and 18..23 of the trips into the region
    for lag in list(range(1, 7)) + list(range(18, 24)):
        region_features['z_t-' + str(lag)] = trips_in.shift(lag)
        to_normalize.append('z_t-' + str(lag))

    # passengers in the current hour
    region_features['p_t-0'] = riders
    to_normalize.append('p_t-0')

    # rolling passenger means; min_periods is 3 for every window, as in the
    # original code (rows with an incomplete 48h history are dropped below anyway
    # because of the sum_48h features)
    for w in windows:
        name = 'p_mean_%dh' % w
        region_features[name] = riders.rolling(w, min_periods=3).sum() / float(w)
        to_normalize.append(name)

    # targets: trips from the region 1..6 hours ahead
    for i in range(1, 7):
        region_features['y_t+' + str(i)] = trips_out.shift(-1 * i)

    # one table with all features
    all_features = common_features.join(region_features)

    # drop the rows with gaps at the beginning and at the end of the sample
    all_features.dropna(inplace=True)

    # standardise; constant columns give 0/0 = NaN, which is replaced by 0
    to_scale = all_features[to_normalize]
    all_features[to_normalize] = (to_scale - to_scale.mean()) / to_scale.std()
    all_features.fillna(0, inplace=True)

    return all_features

In [7]:
# Split the table produced by CreateFeatures into features and targets
# for a given time range.
def PrepareData(f, start, end):
    """Return ``(X, y)`` for the rows of ``f`` between ``start`` and ``end``.

    ``f`` is a DataFrame created by CreateFeatures; ``start`` and ``end`` are
    converted with ``pd.to_datetime`` and used as a label slice on the index.
    ``y`` is a list of six arrays (``y_t+1`` .. ``y_t+6``); ``X`` is the
    sliced frame without those six columns.
    """
    target_names = ['y_t+' + str(step) for step in range(1, 7)]

    window = f.loc[pd.to_datetime(start):pd.to_datetime(end)]

    # target variables, one array per forecast horizon
    y = [window[name].values for name in target_names]

    # features: everything except the targets
    X = window.drop(target_names, axis=1)
    return X, y

In [8]:
# Print the model coefficients
def PrintCoefs(regressors_list, features_list):
    """Display and return the coefficients of several fitted models.

    Parameters
    ----------
    regressors_list : sequence
        Fitted estimators exposing a one-dimensional ``coef_`` of equal length.
    features_list : sequence
        Row labels of the resulting table, one per coefficient.

    Returns
    -------
    pandas.DataFrame
        One row per feature, one column ``reg_<i>`` per estimator.
    """
    col_names = ['reg_' + str(i) for i in range(len(regressors_list))]
    # one column of coefficients per model
    data = np.column_stack([reg.coef_ for reg in regressors_list])

    Coefs = pd.DataFrame(index=features_list, data=data, columns=col_names)

    # Show every row, then restore whatever display.max_rows was before
    # (the original reset it to a hard-coded 30).
    with pd.option_context('display.max_rows', Coefs.shape[0] + 1):
        display.display(Coefs)

    return Coefs

In [9]:
def PostProcValue(val):
    """Round a positive value to the nearest integer; map everything else to 0."""
    return int(round(val, 0)) if val > 0 else int(0)

In [10]:
%%time
# Look at the result: build the feature table for region '1285' and show its first rows
features = CreateFeatures(trips_from_region, trips_to_region, amount, passengers, '1285')
display.display(features.head())

Unnamed: 0,trend,ws1,wc1,ws2,wc2,ws3,wc3,ws4,wc4,ws5,...,p_mean_6h,p_mean_12h,p_mean_24h,p_mean_48h,y_t+1,y_t+2,y_t+3,y_t+4,y_t+5,y_t+6
2013-01-03 04:00:00,-1.731966,1.315816,-0.516329,-0.961168,-1.036624,-0.613832,1.273424,1.409932,0.106099,-0.416344,...,-1.518629,-0.790595,-0.375275,-1.100828,62.0,162.0,270.0,405.0,403.0,356.0
2013-01-03 05:00:00,-1.731853,1.295578,-0.565194,-1.035938,-0.961832,-0.467297,1.334106,1.409932,-0.105262,-0.660301,...,-1.678467,-1.094141,-0.35823,-1.127311,162.0,270.0,405.0,403.0,356.0,364.0
2013-01-03 06:00:00,-1.731739,1.273528,-0.613268,-1.104912,-0.88166,-0.314887,1.378003,1.378429,-0.314261,-0.881217,...,-1.632913,-1.307953,-0.350483,-1.094563,270.0,405.0,403.0,356.0,364.0,369.0
2013-01-03 07:00:00,-1.731626,1.249696,-0.660484,-1.167703,-0.796556,-0.15852,1.404565,1.316127,-0.516232,-1.071392,...,-1.401148,-1.387163,-0.356164,-1.006283,405.0,403.0,356.0,364.0,369.0,380.0
2013-01-03 08:00:00,-1.731513,1.224115,-0.706776,-1.22396,-0.706996,-0.000161,1.413456,1.224417,-0.706661,-1.224194,...,-0.971185,-1.344511,-0.388188,-0.871586,403.0,356.0,364.0,369.0,380.0,315.0


Wall time: 7.91 s


### Подбор признаков

In [19]:
# Create 6 models, one per forecast horizon (1..6 hours ahead)
alpha = 200
reg_1h, reg_2h, reg_3h, reg_4h, reg_5h, reg_6h = (
    Ridge(alpha = alpha, random_state = 30) for _ in range(6)
)

In [20]:
# Table that records the quality of every model on the test region:
# one row for the overall Q and one per model, all cells starting at 0.
result_rows = ['Q','reg_1h','reg_2h','reg_3h','reg_4h','reg_5h','reg_6h']
result_cols = ['current','prev','best','best_skipped_features']
results = pd.DataFrame(index = result_rows, columns = result_cols).fillna(0)

In [11]:
# Replace the `results` table with the one stored in "model_tuning results2.csv"
# (the same file the notebook writes further down with results.to_csv).
results = pd.read_csv("model_tuning results2.csv", index_col=0)

Соберем пульт для ручного подбора параметров

In [21]:
# Button click handler
def on_recalc_clicked(b):
    """Refit the six models on region '1285' with the selected features.

    Reads the state of the controls in the global ``skip_box``: every
    unchecked checkbox names a column to drop, and the two sliders give the
    number of weekly (``K wc/ws``) and yearly (``K yc/ys``) harmonics to keep.
    The models are trained on the rows up to 2016-04-30 22:00, evaluated on
    2016-04-30 22:00 .. 2016-05-31 17:00, and the global ``results`` table is
    updated in place (``prev``, ``current``, ``best`` and
    ``best_skipped_features``). The table and the coefficients are displayed.

    ``b`` is the argument supplied by the click callback; it is not used.
    """
    # denominator of the May error metric, as used throughout the notebook
    n_hours = 739

    models = [reg_1h, reg_2h, reg_3h, reg_4h, reg_5h, reg_6h]
    horizons = ['1h', '2h', '3h', '4h', '5h', '6h']

    skip_columns = []
    K_w = 0
    K_y = 0

    for item in skip_box.children:
        if isinstance(item, widgets.Checkbox):
            if not item.value:
                skip_columns.append(str(item.description))
        elif item.description.startswith('K wc'):
            K_w = item.value
        else:
            K_y = item.value

    # harmonics above the chosen order are skipped
    for i in range(K_w + 1, 50):
        skip_columns.append('ws' + str(i))
        skip_columns.append('wc' + str(i))

    for i in range(K_y + 1, 37):
        skip_columns.append('ys' + str(i))
        skip_columns.append('yc' + str(i))

    display.clear_output()
    print('Skipped columns:')
    print(skip_columns)
    print('\n')

    # build the training sample and fit 6 Ridge regressions on region 1285
    features = CreateFeatures(trips_from_region, trips_to_region, amount, passengers, '1285')
    X_train, y_train = PrepareData(features, '2013-01-03 00:00:00', '2016-04-30 22:00:00')
    X_train = X_train.drop(skip_columns, axis=1)
    features_set = list(X_train.columns.values)

    for model, target in zip(models, y_train):
        model.fit(X_train, target)

    # validation on May
    f_start = pd.to_datetime('2016-04-30 22:00:00')
    f_end = pd.to_datetime('2016-05-31 17:00:00')

    X_test, y_test = PrepareData(features, f_start, f_end)
    # keep the test matrix a DataFrame, like the training matrix
    X_test = X_test.drop(skip_columns, axis=1)

    forecast = pd.DataFrame(index = X_test.index)
    true_data = pd.DataFrame(index = X_test.index)
    for model, name, target in zip(models, horizons, y_test):
        forecast[name] = model.predict(X_test)
        true_data[name] = target

    # record the results
    abs_error = abs(forecast - true_data).sum()
    results['prev'] = results['current']
    # rows after the first ('Q') hold the per-model errors; select them by
    # label instead of passing a positional slice to .loc
    results.loc[results.index[1:], 'current'] = list(abs_error / n_hours)
    results.loc['Q', 'current'] = abs_error.sum() / 6 / n_hours

    # if a model (or the overall quality) is the best seen so far,
    # remember the feature configuration that produced it
    for row in results.index:
        best = results.loc[row, 'best']
        current = results.loc[row, 'current']
        if best == 0 or best > current:
            results.loc[row, 'best'] = current
            results.loc[row, 'best_skipped_features'] = str(skip_columns)

    display.display(results)
    # mean of the best per-model errors
    print("Sum of bests  %s" % (results['best'].iloc[1:].sum() / 6))

    PrintCoefs(models, features_set)

In [22]:
# Build the controls
def _harmonics_slider(label, upper):
    # integer slider from 0 to `upper`, initially at `upper`
    return widgets.IntSlider(
        value=upper,
        min=0,
        max=upper,
        step=1,
        description=label,
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='i')


# one checkbox per feature column, except the sine/cosine harmonics,
# which are controlled by the two sliders
feature_columns = features.drop(['y_t+1','y_t+2','y_t+3','y_t+4','y_t+5','y_t+6'], axis=1).columns
items = [widgets.Checkbox(value = True, description=col)
         for col in feature_columns
         if not col.startswith(('wc', 'ws', 'yc', 'ys'))]

items.append(_harmonics_slider('K wc/ws:', 49))
items.append(_harmonics_slider('K yc/ys:', 36))

recalc_button = widgets.Button(
    description=u'Пересчитать',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Recalculate',
    icon=''
)
recalc_button.on_click(on_recalc_clicked)

skip_box = widgets.Box(items)
skip_box.layout = Layout(width='100%', display='inline-flex', flex_flow='row wrap')

In [23]:
# Show the control panel: the box with the feature controls, then the recalculation button
display.display(skip_box)
display.display(recalc_button)

Skipped columns:
['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11', 'm12', 'dom1', 'dom2', 'dom3', 'dom4', 'dom5', 'dom6', 'dom7', 'dom8', 'dom9', 'dom10', 'dom11', 'dom12', 'dom13', 'dom14', 'dom15', 'dom16', 'dom17', 'dom18', 'dom19', 'dom20', 'dom21', 'dom22', 'dom23', 'dom24', 'dom25', 'dom26', 'dom27', 'dom28', 'dom29', 'dom30', 'dom31', 'y_t-3', 'y_t-4', 'y_t-5', 'y_t-6', 'y_t-7', 'y_t-8', 'y_t-9', 'y_t-10', 'y_t-11', 'y_t-12', 'sum_3h(t-0)', 'sum_6h(t-0)', 'sum_12h(t-0)', 'sum_24h(t-0)', 'sum_48h(t-0)', 'sum_3h(t-1)', 'sum_6h(t-1)', 'sum_12h(t-1)', 'sum_24h(t-1)', 'sum_48h(t-1)', 'sum_3h(t-2)', 'sum_6h(t-2)', 'sum_12h(t-2)', 'sum_24h(t-2)', 'sum_48h(t-2)', 'sum_3h(t-3)', 'sum_6h(t-3)', 'sum_12h(t-3)', 'sum_24h(t-3)', 'sum_48h(t-3)', 'sum_3h(t-4)', 'sum_6h(t-4)', 'sum_12h(t-4)', 'sum_24h(t-4)', 'sum_48h(t-4)', 'sum_3h(t-5)', 'sum_6h(t-5)', 'sum_12h(t-5)', 'sum_24h(t-5)', 'sum_48h(t-5)', 'mean_6h(t-0)', 'mean_12h(t-0)', 'mean_48h(t-0)', 'mean_6h(t-1)', 'mean_12h(

Unnamed: 0,current,prev,best,best_skipped_features
Q,26.21899,26.243167,26.121745,"['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8..."
reg_1h,22.106304,22.09553,21.99635,"['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8..."
reg_2h,25.081217,25.114669,24.952249,"['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8..."
reg_3h,26.798441,26.843907,26.722571,"['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8..."
reg_4h,27.565897,27.558933,27.398637,"['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8..."
reg_5h,27.764908,27.774127,27.652522,"['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8..."
reg_6h,27.997173,28.071835,27.95937,"['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8..."


Sum of bests  26.1136165585


Unnamed: 0,reg_0,reg_1,reg_2,reg_3,reg_4,reg_5
trend,1.938574,2.094021,1.503538,0.5727,-0.139565,-0.776481
ws1,-0.775673,-0.268521,3.273447,1.469978,1.162863,6.576954
wc1,-3.31424,-4.506834,-1.792849,-4.227684,-6.023882,-2.943194
ws2,4.115166,4.474158,4.970882,7.250531,7.408551,6.122346
wc2,0.713671,0.947906,4.241842,3.314676,3.008533,8.117843
ws3,0.890592,1.165896,-0.370662,-0.275854,-0.074531,-2.298342
wc3,3.106727,4.23232,5.090404,6.385366,7.150123,6.436361
ws4,-1.338894,-2.290943,-4.137242,-4.506495,-4.483268,-5.961826
wc4,0.629816,1.218104,1.244767,0.938744,0.439942,0.008661
ws5,0.890925,0.899089,1.32338,2.222204,2.663572,3.481674


In [32]:
# Persist the tuning table so that it can be reloaded with pd.read_csv later
results.to_csv("model_tuning results2.csv")

В результате многочисленных экспериментов можно заключить следующее:
- Бинарные признаки месяца и дня месяца лишь вносят дополнительный шум и их нужно исключить;
- Гармоники годовой сезонности также вносят искажения без видимой пользы;
- Признаки суммы поездок за n часов и среднее число поездок за n часов коррелируют между собой, поэтому нужно оставить что-нибудь одно. Я оставил среднее число поездок;
- Значения различных параметров, взятые за 24 часа до прогнозируемого времени, оказались сильными признаками для соответствующих моделей.

### Проверка на данных за май

In [None]:
%%time

f_start = pd.to_datetime('2016-04-30 22:00:00')
f_end = pd.to_datetime('2016-05-31 17:00:00')

dtindex = pd.date_range(f_start, f_end, freq='H')
forecast = pd.DataFrame(index = dtindex)
true_data = pd.DataFrame(index = dtindex)

# `best_skipped_features` stores str(list_of_column_names); literal_eval is the
# exact inverse (and turns "[]" into an empty list, which the previous
# replace/split parsing turned into ['']). Rows are, in order: 'Q', then one
# per model. Positional access goes through iteration, not Series[int].
(skip_columns, skip_cols1, skip_cols2, skip_cols3,
 skip_cols4, skip_cols5, skip_cols6) = [ast.literal_eval(text)
                                        for text in results['best_skipped_features']]

models = [reg_1h, reg_2h, reg_3h, reg_4h, reg_5h, reg_6h]
skip_lists = [skip_cols1, skip_cols2, skip_cols3, skip_cols4, skip_cols5, skip_cols6]
suffixes = ['_1h', '_2h', '_3h', '_4h', '_5h', '_6h']

for region in trips_from_region.columns:

    features = CreateFeatures(trips_from_region, trips_to_region, amount, passengers, region)

    # train on the data before May, separately for every region
    X_train, y_train = PrepareData(features, '2013-01-03 00:00:00', '2016-04-30 22:00:00')
    # forecast for May
    X_test, y_test = PrepareData(features, f_start, f_end)

    # every model uses its own best feature subset
    for model, skipped, suffix, train_target, test_target in zip(
            models, skip_lists, suffixes, y_train, y_test):
        model.fit(X_train.drop(skipped, axis=1).values, train_target)
        forecast[region + suffix] = model.predict(X_test.drop(skipped, axis=1).values)
        true_data[region + suffix] = test_target

    print("Region  %s  processed." % region)

In [29]:
# First two rows of the forecast table (one column per region and horizon)
forecast.head(2)

Unnamed: 0,1075_1h,1075_2h,1075_3h,1075_4h,1075_5h,1075_6h,1076_1h,1076_2h,1076_3h,1076_4h,...,2119_3h,2119_4h,2119_5h,2119_6h,2168_1h,2168_2h,2168_3h,2168_4h,2168_5h,2168_6h
2016-04-30 22:00:00,88.048367,75.038438,55.078211,34.172232,22.792969,17.509176,110.285597,94.323988,64.667307,45.782331,...,-0.143019,-7.351583,-0.007093,-3.613484,66.021406,55.177307,25.64916,12.739317,8.094899,8.290564
2016-04-30 23:00:00,79.320877,56.9634,37.426097,26.096974,21.299891,17.750756,107.499557,77.866823,55.694202,37.048212,...,-3.700916,1.309653,-2.569124,2.462731,50.077942,24.200308,10.909476,6.301513,6.992216,57.665754


In [30]:
# First two rows of the actual values, laid out like the forecast table
true_data.head(2)

Unnamed: 0,1075_1h,1075_2h,1075_3h,1075_4h,1075_5h,1075_6h,1076_1h,1076_2h,1076_3h,1076_4h,...,2119_3h,2119_4h,2119_5h,2119_6h,2168_1h,2168_2h,2168_3h,2168_4h,2168_5h,2168_6h
2016-04-30 22:00:00,92.0,71.0,37.0,10.0,14.0,6.0,118.0,64.0,39.0,46.0,...,0.0,0.0,0.0,0.0,39.0,85.0,35.0,0.0,0.0,1.0
2016-04-30 23:00:00,71.0,37.0,10.0,14.0,6.0,8.0,64.0,39.0,46.0,25.0,...,0.0,0.0,0.0,3.0,85.0,35.0,0.0,0.0,1.0,57.0


In [31]:
# Zero out negative values and round to integers.
# Series.map returns a Series on both Python 2 and 3; the built-in map() is a
# lazy iterator on Python 3 and cannot be assigned to a column.
for cl in forecast.columns:
    forecast[cl] = forecast[cl].map(PostProcValue)

# Compute Q: mean absolute error over regions, hours and horizons.
n_horizons = 6
n_regions = forecast.shape[1] // n_horizons   # six forecast columns per region
n_hours = 739                                 # denominator of the May metric used in this notebook
Q = abs(forecast - true_data).sum().sum()
Q = Q / float(n_regions) / n_hours / n_horizons
print(Q)

17.9362214439


Ошибка на 5 неделе составила 20,72. Результат удалось немного улучшить.
Построим прогноз для соревнования.

### Прогноз для kaggle на данных за июнь

In [33]:
%%time
f_start = pd.to_datetime('2016-05-31 23:00:00')
f_end = pd.to_datetime('2016-06-30 17:00:00')

models = [reg_1h, reg_2h, reg_3h, reg_4h, reg_5h, reg_6h]
skip_lists = [skip_cols1, skip_cols2, skip_cols3, skip_cols4, skip_cols5, skip_cols6]
suffixes = ['_1h', '_2h', '_3h', '_4h', '_5h', '_6h']

# The forecast index is taken from the first region's test rows inside the
# loop, so this cell no longer depends on a `features` table left over from an
# earlier cell.
forecast = None

for region in trips_from_region.columns:

    features = CreateFeatures(trips_from_region, trips_to_region, amount, passengers, region)

    # train on everything up to the end of May
    X_train, y_train = PrepareData(features, '2013-01-03 00:00:00', '2016-05-31 22:00:00')
    X_test, y_test = PrepareData(features, f_start, f_end)

    if forecast is None:
        forecast = pd.DataFrame(index = X_test.index)

    # every model uses its own best feature subset
    for model, skipped, suffix, train_target in zip(models, skip_lists, suffixes, y_train):
        model.fit(X_train.drop(skipped, axis=1), train_target)
        forecast[region + suffix] = model.predict(X_test.drop(skipped, axis=1))

# Zero out negative values and round to integers (Series.map instead of the
# built-in map(), which is a lazy iterator on Python 3).
for cl in forecast.columns:
    forecast[cl] = forecast[cl].map(PostProcValue)

Wall time: 16min 31s


In [34]:
# Submission ids: <region>_<date>_<hour>_<horizon>, ordered by time, then by
# region, then by horizon. This matches the row-major order of the forecast
# table, whose columns were created region by region with horizons 1..6.
ids = [region + '_' + str(time.date()) + '_' + str(time.hour) + '_' + str(x)
       for time in forecast.index
       for region in trips_from_region.columns
       for x in range(1, 7)]

# Flatten the table row by row in one step instead of growing an array with
# np.hstack inside a loop (which re-copies the array on every iteration).
vals = forecast.values.ravel().astype(int)

submission = pd.DataFrame(index = ids, columns = ['y'], data = vals)
submission.to_csv('Week6_submission.csv', index_label = 'id')

In [36]:
# Keep the post-processed June forecast table on disk
forecast.to_csv("forecast_june_2016.csv")

Ссылка на сабмишн: https://inclass.kaggle.com/c/yellowtaxi/leaderboard?submissionId=5074994

![title](Kaggle%20sub%2012.png)