In [93]:
import pandas as pd
import numpy as np
import os
import operator
import matplotlib.pyplot as plt
# Input path
prefix = r"C:\Users\Artem\Dropbox\Competition\git\input"

train_data = pd.read_csv(os.path.join(prefix, "sales_train_v2.csv"))
train_data['date'] = pd.to_datetime(train_data['date'], format='%d.%m.%Y')
train_data = train_data.sort_values(by='date').reset_index(drop=True)

In [94]:
# City
shops_df = pd.read_csv(os.path.join(prefix, "shops.csv"))

mapping = {}
for it, row in shops_df.iterrows():
    if row['shop_name'] not in mapping:
        mapping[row['shop_name']] = row['shop_name'].split(" ")[0].replace("!", "")
shops_df.set_index("shop_id", inplace=True)
shops_dict = shops_df.to_dict()['shop_name']

train_data['shop_city'] = train_data['shop_id'].apply(
    lambda x: mapping[shops_dict[x]]
)
print("City generated")
# Is big city
train_data['is_Moscow'] = train_data['shop_city'].apply(
    lambda x: x == "Москва"
)
print("Big city generated")
# Item category groups
categories_df = pd.read_csv(os.path.join(prefix, "item_categories.csv"))
items_df = pd.read_csv(os.path.join(prefix, "items.csv"))

category_mapping = {}
for it, row in categories_df.iterrows():
    if row['item_category_name'] not in category_mapping:
        category_mapping[row['item_category_name']] = row['item_category_name'].split(" ")[0]
        
categories_df.set_index("item_category_id", inplace=True)
categories_dict = categories_df.to_dict()['item_category_name']

items_df = items_df[["item_id", "item_category_id"]]
items_df.set_index("item_id", inplace=True)
items_dict = items_df.to_dict()['item_category_id']

train_data['item_category_group'] = train_data['item_id'].apply(
    lambda x: category_mapping[categories_dict[items_dict[x]]]
)
print("Item category generated")
# Number of weekends
start = train_data['date'].iloc[0].replace(day=1)
last = train_data['date'].iloc[-1]
weekdays = {}
while start <= last:
    begin = start.replace(day=1)
    end = (begin + pd.DateOffset(months=1)).replace(day=1)
    days_in_month = start.to_period("D").daysinmonth
    bus_days = np.busday_count(begin, end)
    weekdays[start] = days_in_month - bus_days
    start += pd.Timedelta(days=1)
train_data['number_of_weekends'] = train_data['date'].apply(lambda x: weekdays[x])
print("Weekends generated")
# total revenue of a shop in date_block_num
revenue = {}
for i in range(train_data['date_block_num'].max() + 1):
    month = train_data[train_data['date_block_num'] == i][['shop_id', 'item_price', 'item_cnt_day']]
    month['revenue'] = month['item_price'] * month['item_cnt_day']
    shop_revenue = month.groupby(['shop_id'])[['revenue']].sum().to_dict()['revenue']
    revenue[i] = shop_revenue
train_data['monthly_shop_revenue'] = train_data.apply(lambda x: revenue[x['date_block_num']][x['shop_id']], axis=1)
print("Revenue generated")
# is_item_category_the_most_popular_in_this_month? macro
train_data['is_item_category_group_most_popular_in_month'] = train_data.apply(
    lambda x: x['item_category_group'] == category_mapping[categories_dict[items_dict[max(revenue[x['date_block_num']].items(), key=operator.itemgetter(1))[0]]]], axis=1)
print("Popular item_category_group generated")

City generated
Big city generated
Item category generated
Weekends generated
Revenue generated
Popular item_category_group generated


In [95]:
train_data

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_city,is_Moscow,item_category_group,number_of_weekends,monthly_shop_revenue,is_item_category_group_most_popular_in_month
0,2013-01-01,0,18,5823,2500.0,1.0,Красноярск,False,Карты,8,2742707.00,False
1,2013-01-01,0,27,5573,849.0,1.0,Москва,True,Аксессуары,8,4728830.00,False
2,2013-01-01,0,7,1006,399.0,1.0,Воронеж,False,Подарки,8,1870467.00,False
3,2013-01-01,0,19,17707,899.0,1.0,Курск,False,Игры,8,2118666.55,False
4,2013-01-01,0,14,19548,149.0,1.0,Казань,False,Кино,8,1264646.00,True
5,2013-01-01,0,27,5574,699.0,2.0,Москва,True,Аксессуары,8,4728830.00,False
6,2013-01-01,0,8,16993,399.0,1.0,Воронеж,False,Кино,8,952881.00,True
7,2013-01-01,0,28,6468,449.0,1.0,Москва,True,Игры,8,4718191.23,False
8,2013-01-01,0,19,13071,499.0,1.0,Курск,False,Аксессуары,8,2118666.55,False
9,2013-01-01,0,51,6450,483.0,1.0,Тюмень,False,Игры,8,939712.00,False
