In [None]:
!pip install catboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
from catboost import CatBoostRegressor
from sklearn.mixture import GaussianMixture as GMM

In [None]:
# Load the combined dataset; `date` is parsed to datetime so the date-based
# helpers below can read .year/.month/.day from it.
# NOTE(review): absolute /content/... paths - presumably a Google Colab
# session; confirm before running elsewhere.
data = pd.read_csv('/content/train_npf2.csv', parse_dates=['date'])
# Submission template; later cells use its `index` and `demand` columns.
sample = pd.read_csv('/content/sample_npf2.csv')

In [None]:
# Rows with a known target form the training set; rows with a missing target
# are the ones to predict. `.copy()` detaches both frames from `data`, so the
# column assignments made by later cells never write into a slice of `data`.
train = data[data.demand.notna()].copy()
test = data[data.demand.isna()].copy()

# Keep only the rows listed in the submission template (matched on `index`),
# in the template's row order. The template's own `demand` column is dropped
# first so the merged frame has a single `demand` column (the one from `test`).
test = sample.drop(columns=['demand']).merge(test, how='left', on='index')

In [None]:
def transform_date(data):
    """Add calendar features derived from the ``date`` column.

    Two columns are added:

    * ``month`` - calendar month (1-12);
    * ``number_of_week_in_month`` - 1-based week of the month, where days
      1-7 are week 1, days 8-14 are week 2, ... and days 29-31 are week 5.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column holding datetimes (or values that
        ``pd.to_datetime`` can parse).

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns added; ``data`` is not modified.
    """
    dates = pd.to_datetime(data['date'])
    return data.assign(
        month=dates.dt.month,
        # (day - 1) // 7 keeps day 7 in week 1; a plain day // 7 would push it
        # into week 2 and leave week 1 with only six days.
        number_of_week_in_month=(dates.dt.day - 1) // 7 + 1,
    )

def delete_holidays(data, drop_missing=True):
    """Remove rows whose ``date`` falls inside a holiday window.

    Two windows are removed for every year, both with exclusive bounds:

    * New Year: after 28 December and before 10 January of the next year;
    * May holidays: after 1 May and before 10 May.

    The windows are built for whatever years occur in ``data``, so dates
    outside any fixed list of years are handled as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column holding datetimes (or values that
        ``pd.to_datetime`` can parse).
    drop_missing : bool, default True
        When True, rows containing a missing value in *any* column are
        removed as well. This is the function's historical behaviour; pass
        False to remove holiday rows only.

    Returns
    -------
    pandas.DataFrame
        A new frame without the holiday rows; ``data`` is not modified.
    """
    # ((start month, start day), (end month, end day), years added to the end)
    windows = (
        ((12, 28), (1, 10), 1),  # New Year window crosses the year boundary
        ((5, 1), (5, 10), 0),    # May holidays
    )

    dates = pd.to_datetime(data['date'])
    years = dates.dt.year.dropna()
    is_holiday = np.zeros(len(data), dtype=bool)

    if not years.empty:
        # Start one year early: January dates belong to the New Year window
        # that opened in the previous December.
        first_year = int(years.min()) - 1
        last_year = int(years.max())
        for year in range(first_year, last_year + 1):
            for (start_month, start_day), (end_month, end_day), year_offset in windows:
                start = datetime(year, start_month, start_day)
                end = datetime(year + year_offset, end_month, end_day)
                is_holiday |= ((dates > start) & (dates < end)).to_numpy()

    result = data[~is_holiday]
    if drop_missing:
        result = result.dropna()
    return result.copy()

def drop_useless_cols(data):
    """Return ``data`` without the columns this notebook treats as useless.

    The removed columns are ``'Unnamed: 0'``, ``'index'``,
    ``'STORE_LOCATION_RK'`` and ``'PRODUCT_RK'``. All four must be present;
    pandas raises ``KeyError`` otherwise. ``data`` itself is left untouched.
    """
    return data.drop(
        labels=['Unnamed: 0', 'index', 'STORE_LOCATION_RK', 'PRODUCT_RK'],
        axis='columns',
    )

def promo_flag(data):
    """Add logical combinations of the two promo flags as 0/1 columns.

    Adds ``xor``, ``and`` and ``or``, computed from the truthiness of
    ``PROMO1_FLAG`` and ``PROMO2_FLAG``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``PROMO1_FLAG`` and ``PROMO2_FLAG`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns added; ``data`` is not modified.
    """
    # astype(bool) follows Python truthiness: any non-zero value counts as
    # set, including NaN in a float column.
    promo1 = data['PROMO1_FLAG'].astype(bool)
    promo2 = data['PROMO2_FLAG'].astype(bool)
    # 'and' / 'or' are Python keywords, so the new columns are passed as a dict.
    return data.assign(**{
        'xor': (promo1 ^ promo2).astype('int64'),
        'and': (promo1 & promo2).astype('int64'),
        'or': (promo1 | promo2).astype('int64'),
    })

def price(data):
    """Add price-comparison features.

    * ``ratio_price`` - ``PRICE_REGULAR`` divided by ``PRICE_AFTER_DISC``;
    * ``diff_price`` - ``PRICE_REGULAR`` minus ``PRICE_AFTER_DISC``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``PRICE_REGULAR`` and ``PRICE_AFTER_DISC`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns added; ``data`` is not modified.
    """
    regular = data['PRICE_REGULAR']
    discounted = data['PRICE_AFTER_DISC']
    return data.assign(
        # A zero discounted price yields inf (or NaN for 0/0), as pandas
        # division does; no special handling is applied here.
        ratio_price=regular / discounted,
        diff_price=regular - discounted,
    )

Удаление праздничных периодов (новогодние и майские праздники) и построение признаков

In [None]:
# Preprocess the training rows: remove the holiday windows, add the calendar
# features (month, week of month) and the price features (ratio, difference),
# then drop the columns listed in drop_useless_cols.
train = delete_holidays(train)
train = transform_date(train)
train = price(train)
train = drop_useless_cols(train)

In [None]:
# Remove every row that still has a missing value in any column.
train = train.dropna()

Черная пятница

In [None]:
def black_friday(x):
    """Return 1 if ``x`` lies in the black-friday window of its year, else 0.

    The window is strictly after 20 November and strictly before 1 December
    of ``x.year`` (both bounds at midnight).
    """
    window_start = datetime.strptime('{}-11-20'.format(x.year), '%Y-%m-%d')
    window_end = datetime.strptime('{}-12-01'.format(x.year), '%Y-%m-%d')
    return int(window_start < x and x < window_end)

In [None]:
# Flag the training rows that fall in the black-friday window (1) or not (0).
train['black_friday'] = train['date'].map(black_friday)

Удаляем date

In [None]:
# All date-derived columns have been added by now; drop the raw `date` so it
# is not passed to the models as a feature.
train = train.drop(columns=['date'])

Сумма demand по локациям



In [None]:
# Total demand per store location, largest total first.
store_loc_demand = (
    train.groupby('store_location_rk', as_index=False)['demand']
    .sum()
    .sort_values('demand', ascending=False)
)
# Location ids in the same order (highest total demand first).
sld_list = store_loc_demand['store_location_rk'].to_numpy()
store_loc_demand.head()

Unnamed: 0,store_location_rk,demand
33,1281,9080.102886
35,1326,7405.892372
34,1316,6731.323266
36,1328,5700.010883
7,525,5644.209778


In [None]:
# Bucket the locations by their rank in sld_list (sorted by total demand,
# descending). Each (start, stop) pair is a slice of that ranking: bucket 0
# holds the two highest-demand locations, bucket 5 everything from rank 32 on.
location_0, location_1, location_2, location_3, location_4, location_5 = (
    set(sld_list[start:stop])
    for start, stop in [(0, 2), (2, 7), (7, 16), (16, 24), (24, 32), (32, None)]
)

Ставим группу локации

In [None]:
def figure_location_group(x):
    """Return the demand bucket (0-5) of store location ``x``.

    Buckets 0-4 are looked up in the module-level sets ``location_0`` ...
    ``location_4``, checked in that order; any location found in none of
    them is assigned bucket 5.
    """
    ranked_buckets = (location_0, location_1, location_2, location_3, location_4)
    for bucket_id, members in enumerate(ranked_buckets):
        if x in members:
            return bucket_id
    return 5

In [None]:
# Attach the demand bucket of each row's store location.
train['location_group'] = train['store_location_rk'].map(figure_location_group)

Словарь: ключ — номер группы локаций, значение — модель, обученная на данных этой группы

In [None]:
location_models = dict()

for group_id in range(6):
    # Training rows of this location group, excluding the black-friday
    # window (those rows get their own model further down).
    group_rows = train[(train.location_group == group_id) & (train.black_friday == 0)]
    y = group_rows.demand
    # Features: everything except the target and the two helper columns.
    X = group_rows.drop(columns=['black_friday', 'location_group', 'demand'])
    model = CatBoostRegressor(iterations=200)
    model.fit(X, y)
    location_models[group_id] = model

In [None]:
# Display the fitted models, keyed by location group.
location_models

{0: <catboost.core.CatBoostRegressor at 0x7f4707d6e590>,
 1: <catboost.core.CatBoostRegressor at 0x7f4707dc6b10>,
 2: <catboost.core.CatBoostRegressor at 0x7f470b4b9150>,
 3: <catboost.core.CatBoostRegressor at 0x7f4707d6ff90>,
 4: <catboost.core.CatBoostRegressor at 0x7f4707dc6890>,
 5: <catboost.core.CatBoostRegressor at 0x7f4707d5a550>}

Сумма demand по продуктам

In [None]:
# Total demand per product, largest total first.
product_demand = (
    train.groupby('product_rk', as_index=False)['demand']
    .sum()
    .sort_values('demand', ascending=False)
)

In [None]:
# Bucket products by total demand. Every bucket includes its upper bound, so
# a product whose total is exactly 1000, 500, 250 or 100 lands in the bucket
# it borders instead of matching none of the sets (and then being sent to the
# lowest bucket by figure_product_group's fallback branch).
product_0 = set(product_demand[(product_demand.demand > 1000)].product_rk.values)
product_1 = set(product_demand[(product_demand.demand <= 1000) & (product_demand.demand > 500)].product_rk.values)
product_2 = set(product_demand[(product_demand.demand <= 500) & (product_demand.demand > 250)].product_rk.values)
product_3 = set(product_demand[(product_demand.demand <= 250) & (product_demand.demand > 100)].product_rk.values)
product_4 = set(product_demand[(product_demand.demand <= 100)].product_rk.values)

In [None]:
def figure_product_group(x):
    """Return the demand bucket (0-4) of product ``x``.

    Buckets 0-3 are looked up in the module-level sets ``product_0`` ...
    ``product_3``, checked in that order; any product found in none of them
    is assigned bucket 4.
    """
    ranked_buckets = (product_0, product_1, product_2, product_3)
    for bucket_id, members in enumerate(ranked_buckets):
        if x in members:
            return bucket_id
    return 4

In [None]:
# Attach the demand bucket of each row's product.
train['product_group'] = train['product_rk'].map(figure_product_group)

In [None]:
product_models = dict()

for group_id in range(5):
    # Training rows of this product group, excluding the black-friday window.
    group_rows = train[(train.product_group == group_id) & (train.black_friday == 0)]
    y = group_rows.demand
    # Features: everything except the target and the three helper columns.
    X = group_rows.drop(columns=['black_friday', 'location_group', 'product_group', 'demand'])
    model = CatBoostRegressor(iterations=200)
    model.fit(X, y)
    product_models[group_id] = model

In [None]:
# Display the fitted models, keyed by product group.
product_models

{0: <catboost.core.CatBoostRegressor at 0x7f4706c145d0>,
 1: <catboost.core.CatBoostRegressor at 0x7f4707d61710>,
 2: <catboost.core.CatBoostRegressor at 0x7f4707d32750>,
 3: <catboost.core.CatBoostRegressor at 0x7f470b54bd10>,
 4: <catboost.core.CatBoostRegressor at 0x7f4707d32490>}

Датафрейм со строками «чёрной пятницы», чтобы обучить отдельную модель и сделать предсказание именно для неё

In [None]:
# Training rows inside the black-friday window, without the helper columns
# (the flag itself and the two group labels).
black = train[train.black_friday == 1].drop(columns=['black_friday', 'location_group', 'product_group'])

In [None]:
# Split the black-friday rows into features and target.
black_X = black.drop(columns=['demand'])
black_y = black.demand

In [None]:
# Dedicated model for the black-friday window; same settings as the group models.
model_black = CatBoostRegressor(iterations=200)

In [None]:
# Fit on the black-friday training rows only.
model_black.fit(black_X, black_y)

Делаем то же самое для теста

Черная пятница для теста

In [None]:
# Flag the test rows that fall in the black-friday window (1) or not (0).
test['black_friday'] = test['date'].map(black_friday)

In [None]:
# Add the same calendar and price features that the training frame received.
# Holiday rows are not removed here, and `date` is not dropped from `test`.
test = transform_date(test)
test = price(test)

In [None]:
# Test rows inside the black-friday window; the flag column is no longer needed.
test_black = test[test.black_friday == 1].drop(columns=['black_friday'])

In [None]:
# Keep the submission ids before `index` is removed by drop_useless_cols.
indexes = test_black['index']

In [None]:
# Remove the same columns that were removed from the training frame.
test_black = drop_useless_cols(test_black)

In [None]:
# Feed the model exactly the columns it was fitted on, in training order:
# the training frame had `date` dropped before fitting, while the test frame
# still carries it, so the raw test columns do not match the model's features.
test_black_X = test_black.drop(columns=['demand'])[model_black.feature_names_]
# The target of the test rows; kept for symmetry with the training cells.
test_black_y = test_black.demand

preds = model_black.predict(test_black_X)

In [None]:
# Pair every black-friday prediction with its submission id for the later merge.
black_predictions = pd.DataFrame({'index': indexes, 'black_pred': preds})

Локация для теста

In [None]:
# Attach the location demand bucket (derived from the training data) to the test rows.
test['location_group'] = test['store_location_rk'].map(figure_location_group)

In [None]:
# Attach the product demand bucket (derived from the training data) to the test rows.
test['product_group'] = test['product_rk'].map(figure_product_group)

In [None]:
predicted_data = dict()

for i in [0, 1, 2, 3, 4, 5]:
    # Non-black-friday test rows that belong to location group i.
    X = test[(test.location_group == i) & (test.black_friday == 0)]
    indexes = X['index']
    # Select exactly the columns the group's model was fitted on, in training
    # order. This removes the helper/identifier columns and also `date`,
    # which the training frame dropped but the test frame still carries.
    X = X[location_models[i].feature_names_]
    preds = location_models[i].predict(X)

    # One prediction column per group, keyed by submission id for the merge.
    predicted_data[i] = pd.DataFrame({'index': indexes, 'location %d' % i: preds})

In [None]:
predicted_product = dict()

for i in [0, 1, 2, 3, 4]:
    # Non-black-friday test rows that belong to product group i.
    X = test[(test.product_group == i) & (test.black_friday == 0)]
    indexes = X['index']
    # Select exactly the columns the group's model was fitted on, in training
    # order. This removes the helper/identifier columns and also `date`,
    # which the training frame dropped but the test frame still carries.
    X = X[product_models[i].feature_names_]
    preds = product_models[i].predict(X)

    # One prediction column per group, keyed by submission id for the merge.
    predicted_product[i] = pd.DataFrame({'index': indexes, 'product %d' % i: preds})

Делаю мердж ответов с sample

In [None]:
# Left-join every per-group prediction frame onto the submission template:
# first the six location groups, then the five product groups. Each join adds
# one column that is NaN for rows outside that group.
for group_predictions, n_groups in ((predicted_data, 6), (predicted_product, 5)):
    for group_id in range(n_groups):
        sample = sample.merge(group_predictions[group_id], how='left', on='index')

Заменяю пропуски (NaN) нулями, чтобы затем корректно объединить предсказания

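P.S. До добавления предсказаний для «чёрной пятницы» в строке будет не более двух ненулевых чисел: одно от модели по группе локаций и одно от модели по группе продуктов.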

In [None]:
# Rows outside a group have NaN in that group's column; replace NaN with 0.
sample = sample.fillna(0)

In [None]:
# Inspect the merged per-group prediction columns.
sample

Unnamed: 0,index,demand,location 0,location 1,location 2,location 3,location 4,location 5,product 0,product 1,product 2,product 3,product 4
0,902,0.0,0.000000,0.0,1.663742,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,1.932516
1,1163,0.0,0.000000,0.0,0.000000,0.0,0.0,1.330226,0.0,0.0,0.0,1.547588,0.000000
2,1167,0.0,0.000000,0.0,0.000000,0.0,0.0,1.104621,0.0,0.0,0.0,1.003743,0.000000
3,1172,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
4,1202,0.0,1.738397,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,1.218632
...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,209593,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
524,209594,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
525,209595,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
526,209596,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000


In [None]:
# Add the black-friday predictions; rows outside the window get NaN, which is
# replaced with 0 like the other prediction columns.
sample = sample.merge(black_predictions, how='left', on='index')
sample = sample.fillna(0)

In [None]:
# Final demand = the largest value among the prediction columns. The first
# two columns (`index`, `demand`) are skipped by position; everything after
# them is a prediction column.
sample['demand'] = sample.iloc[:, 2:].max(axis=1)

In [None]:
# Keep only the submission columns, with `index` as the frame index so it is
# written as the first CSV column.
sample = sample[['index', 'demand']].set_index('index')

In [None]:
# Write the submission file to the current working directory.
sample.to_csv('black_&_location_&_product_&_max.csv')