In [None]:
######################################
# Developed by Baptiste PICARD       #
# picard.baptiste@laposte.net        #
# Started the 21st of April 2020     #
# Updated the 26th of August 2023    #
# Summary:                           #
# Evolution of the prices.           # 
# Model to predict the evolutions    #
#                                    #
######################################

# Modules to import 

# Utils
import os
import json
import random
import subprocess
import time 
import datetime as dt
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pre-processing 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, max_error

# Models : Regression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor # Gradient Boosting, Random Forest


# Environment
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# removed the old alias, so use whichever name this installation provides.
SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(SEABORN_STYLE) # Change the style of the figures.
pd.set_option('display.max_columns', 50) # Show up to 50 columns when displaying a frame.
# NOTE(review): GRAFS and OUTLIERS are not read anywhere in the visible cells — confirm they are still needed.
GRAFS = True
OUTLIERS = True

# Constants : paths of the input .csv files.
train_set = './data/sales_train.csv'
test_set = './data/test.csv'
subs_set = './data/sample_submission.csv'
items = './data/items.csv'
categories = './data/item_categories.csv'
shops = './data/shops.csv'

# Path of a results file.
results = './data/results.txt'

In [None]:
# Date parser
# The sale dates are written day-first (see the format below). Without an explicit
# format pandas has to guess, which either swaps day and month on ambiguous values
# or leaves the column as plain strings, so the format is applied explicitly.
DATE_FORMAT = "%d.%m.%Y"
dateparse = lambda x: dt.datetime.strptime(x, DATE_FORMAT)

# Open the .csv file  
df_train_set = pd.read_csv(train_set) # Training
df_train_set["date"] = pd.to_datetime(df_train_set["date"], format=DATE_FORMAT)
df_test_set = pd.read_csv(test_set) # Test
df_subs_set = pd.read_csv(subs_set) # Sample submission
df_shops_set = pd.read_csv(shops) # Shops
df_items_set = pd.read_csv(items) # Items
df_categories_set = pd.read_csv(categories) # Categories

print(f"Initial shape: {df_train_set.shape}.")

In [None]:
# Preview the first rows of the raw training frame.
df_train_set.head()

In [None]:
# Preview the first rows of the raw test frame.
df_test_set.head()

In [None]:
# Preview the first rows of the sample submission frame.
df_subs_set.head()

In [None]:
# Creating the df_train
# Attach the shop, item and category descriptions to every sale, then drop the
# raw identifier columns that were only needed as join keys.
df_train_shop = pd.merge(df_train_set, df_shops_set, on='shop_id', how='inner')
df_train_shop_item = pd.merge(df_train_shop, df_items_set, on='item_id', how='inner')
df_train = pd.merge(df_train_shop_item, df_categories_set, on='item_category_id', how='inner')
df_train = df_train.drop(['shop_id', 'item_id', 'item_category_id'], axis=1)

# Creating the df_test / df_subs
# Same joins for the test rows; the 'ID' column is dropped as well.
df_test_shop = pd.merge(df_test_set, df_shops_set, on='shop_id', how='inner')
df_test_shop_items = pd.merge(df_test_shop, df_items_set, on='item_id', how='inner')
df_test = pd.merge(df_test_shop_items, df_categories_set, on='item_category_id', how='inner')
df_test = df_test.drop(['shop_id', 'item_id', 'item_category_id', 'ID'], axis=1)

# Train 
df_train_rows, df_train_columns = df_train.shape
# Test 
df_test_rows, df_test_columns = df_test.shape
print('Train set : \n train.csv : {} (rows/columns).'.format(df_train.shape))
print('Test set : \n test.csv : {} (rows/columns).'.format(df_test.shape))
print('Total nan values (train) : {}'.format(df_train.isnull().sum().sum()))
print('Total nan values (test) : {}'.format(df_test.isnull().sum().sum()))

# Tag the text columns as holding the original Russian labels.
RUSSIAN_COLUMN_NAMES = {
    'shop_name': 'shop_name (RUSSIA)',
    'item_name': 'item_name (RUSSIA)',
    'item_category_name': 'item_category_name (RUSSIA)',
}
# The training frame additionally loses 'date_block_num'.
df_train = (
    df_train
    .rename(columns=RUSSIAN_COLUMN_NAMES)
    .drop(columns=["date_block_num"])
)
df_test = df_test.rename(columns=RUSSIAN_COLUMN_NAMES)

# Report the merged frame: df_train_set is the untouched raw input, so printing
# it here would only repeat the "Initial shape" line.
print(f"Final shape: {df_train.shape}.")

In [None]:
# Preview the first rows of the merged training frame.
df_train.head()

In [None]:
# Preview the first rows of the merged test frame.
df_test.head()

In [None]:
# Show the dtype of every column of the merged training frame.
df_train.dtypes

### Pick an example item

In [None]:
# Fixed (shop, item) pair used as the running example in the plots below.
shop = 'Москва ТРК "Атриум"'
item = "ЯВЛЕНИЕ 2012 (BD)"

# Boolean masks selecting the rows of that shop and of that item.
shop_selection = df_train["shop_name (RUSSIA)"].eq(shop)
item_selection = df_train["item_name (RUSSIA)"].eq(item)

# Keep the rows matching both masks, in chronological order.
example_df = df_train.loc[shop_selection & item_selection].sort_values(by="date")
print(f"There are {example_df.shape} items for the example.")
example_df

In [None]:
# Price of the example item over time, drawn on the left panel of a 1x2 grid.
price_fig = plt.figure(figsize=(30, 12))
price_ax = price_fig.add_subplot(1, 2, 1)
price_ax.set_title(f"Visualization the price evolution for item: {item} for the shop; {shop}.")
price_ax.set_xlabel('Date')
price_ax.set_ylabel('Price')
sns.lineplot(data=example_df, x="date", y="item_price", ax=price_ax)

In [None]:
# Daily sold quantity (item_cnt_day) of the example item over time, drawn on the
# left panel of a 1x2 grid. The title names the plotted quantity instead of the
# price, which is what the previous figure shows.
plt.figure(figsize=(30,12))
plt.subplot(121)
plt.title(f"Visualization the item_cnt_day evolution for item: {item} for the shop; {shop}.")
plt.xlabel('Date')
plt.ylabel('item_cnt_day')
sns.lineplot(data=example_df, x="date", y="item_cnt_day")

### Create date column

In [None]:
# Expand the sale date into calendar features, one column per component.
sale_dates = df_train["date"]
for component in ("year", "month", "day", "weekday"):
    df_train[component] = getattr(sale_dates.dt, component)
# Week number, read from each timestamp individually.
df_train["week"] = sale_dates.map(lambda stamp: stamp.weekofyear)
df_train

### Item category

In [None]:
# Russian category label -> English 'field' (top-level family) and 'category'.
# The mapping has its own name so that it does not overwrite `items`, which holds
# the path of the items .csv file and is still needed whenever the loading cell
# is re-run.
item_category_mapping = {
    # Cinema
    'Кино - Blu-Ray' : {'field' : 'Cinema', 'category' : 'Blu-ray'},
    'Кино - DVD' : {'field' : 'Cinema', 'category' : 'DVD'},
    'Кино - Blu-Ray 3D' : {'field' : 'Cinema', 'category' : 'Blu-Ray 3D'},
    'Кино - Коллекционное' : {'field' : 'Cinema', 'category' : 'Collection'},
    'Кино - Blu-Ray 4K' : {'field' : 'Cinema', 'category' : 'Blu-Ray 4K'},

    # Accessories
    'Аксессуары - PSVita' : {'field' : 'Accessories', 'category' : 'PSVita'},
    'Аксессуары - PS3' : {'field' : 'Accessories', 'category' : 'PS3'},
    'Аксессуары - XBOX 360' : {'field' : 'Accessories', 'category' : 'XBOX 360'},
    'Аксессуары - PSP' : {'field' : 'Accessories', 'category' : 'PSP'},
    'Аксессуары - PS4' : {'field' : 'Accessories', 'category' : 'PS4'},
    'Аксессуары - XBOX ONE' : {'field' : 'Accessories', 'category' : 'XBOX ONE'},
    'PC - Гарнитуры/Наушники' : {'field' : 'Accessories', 'category' : 'Headphones'},
    'Аксессуары - PS2' : {'field' : 'Accessories', 'category' : 'PS2'},
    'Элементы питания' : {'field' : 'Accessories', 'category' : 'Battery'},

    # Music
    'Музыка - CD локального производства' : {'field' : 'Music', 'category' : 'CD'},
    'Музыка - Музыкальное видео' : {'field' : 'Music', 'category' : 'Clip video'},
    'Музыка - CD фирменного производства' : {'field' : 'Music', 'category' : 'CD'},
    'Музыка - MP3' : {'field' : 'Music', 'category' : 'MP3'},
    'Музыка - Подарочные издания' : {'field' : 'Music', 'category' : 'Gift Edition'},
    'Музыка - Винил' : {'field' : 'Music', 'category' : 'Vinyl'},

    # Games
    'Игры - PSVita' : {'field' : 'Games', 'category' : 'PSVita'},
    'Игры - XBOX 360' : {'field' : 'Games', 'category' : 'XBOX 360'},
    'Игры - PSP' : {'field' : 'Games', 'category' : 'PSP'},
    'Игры PC - Стандартные издания' : {'field' : 'Games', 'category' : 'Pc Standard Editions'},
    'Игры - PS3' : {'field' : 'Games', 'category' : 'PS3'},
    'Игры PC - Дополнительные издания' : {'field' : 'Games', 'category' : 'Pc DLC'},
    'Игры PC - Коллекционные издания' : {'field' : 'Games', 'category' : 'Pc Collector'},
    'Игры - Аксессуары для игр' : {'field' : 'Games', 'category' : 'Accessories'},
    'Игры - PS4' : {'field' : 'Games', 'category' : 'PS4'},
    'Игры - XBOX ONE' : {'field' : 'Games', 'category' : 'XBOX ONE'},
    'Игры - PS2' : {'field' : 'Games', 'category' : 'PS2'},
    'Игры PC - Цифра' : {'field' : 'Games', 'category' : 'Digit'},
    'Игры Android - Цифра' : {'field' : 'Games', 'category' : 'Android'},
    'Игры MAC - Цифра' : {'field' : 'Games', 'category' : 'MAC'},

    # Gifts
    'Подарки - Гаджеты, роботы, спорт' : {'field' : 'Gifts', 'category' : 'Gadjet'},
    'Подарки - Мягкие игрушки' : {'field' : 'Gifts', 'category' : 'Toys'},
    'Подарки - Сувениры' : {'field' : 'Gifts', 'category' : 'Souvenirs'},
    'Подарки - Настольные игры' : {'field' : 'Gifts', 'category' : 'Board games'},
    'Подарки - Настольные игры (компактные)' : {'field' : 'Gifts', 'category' : 'Board games'},
    'Подарки - Развитие' : {'field' : 'Gifts', 'category' : 'Développement'},
    'Подарки - Фигурки' : {'field' : 'Gifts', 'category' : 'Figurines'},
    'Подарки - Сувениры (в навеску)' : {'field' : 'Gifts', 'category' : 'Souvenirs'},
    'Подарки - Сумки, Альбомы, Коврики д/мыши' : {'field' : 'Gifts', 'category' : 'Bags, Albums, Mousepads'},
    'Подарки - Атрибутика' : {'field' : 'Gifts', 'category' : 'Attributes'},
    'Подарки - Открытки, наклейки' : {'field' : 'Gifts', 'category' : 'Postcards / Stickers'},
    'Подарки - Сертификаты, услуги' : {'field' : 'Gifts', 'category' : 'Certificates / Services'},

    # Payement Cards
    'Карты оплаты - PSN' : {'field' : 'Payement Cards', 'category' : 'PSN'},
    'Карты оплаты - Live!' : {'field' : 'Payement Cards', 'category' : 'Live!'},
    'Карты оплаты (Кино, Музыка, Игры)' : {'field' : 'Payement Cards', 'category' : 'Cinema / Music / Games'},
    'Карты оплаты - Live! (Цифра)' : {'field' : 'Payement Cards', 'category' : 'Live!'},
    'Карты оплаты - Windows (Цифра)' : {'field' : 'Payement Cards', 'category' : 'Windows'},

    # Programs
    'Программы - Для дома и офиса' : {'field' : 'Programs', 'category' : 'Home/Office'},
    'Программы - 1С:Предприятие 8' : {'field' : 'Programs', 'category' : 'Entreprise'},
    'Служебные' : {'field' : 'Programs', 'category' : 'Office'},
    'Программы - Обучающие' : {'field' : 'Programs', 'category' : 'Education'},
    'Программы - Для дома и офиса (Цифра)' : {'field' : 'Programs', 'category' : 'Home/Office'},
    'Программы - Обучающие (Цифра)' : {'field' : 'Programs', 'category' : 'Education'},
    'Программы - MAC (Цифра)' : {'field' : 'Programs', 'category' : 'MAC'},

    # Console
    'Игровые консоли - XBOX 360' : {'field' : 'Consoles', 'category' : 'XBOX 360'},
    'Игровые консоли - PSP' : {'field' : 'Consoles', 'category' : 'PSP'},
    'Игровые консоли - PS3' : {'field' : 'Consoles', 'category' : 'PS3'},
    'Игровые консоли - PSVita' : {'field' : 'Consoles', 'category' : 'PSVita'},
    'Игровые консоли - Прочие' : {'field' : 'Consoles', 'category' : 'Other'},
    'Игровые консоли - XBOX ONE' : {'field' : 'Consoles', 'category' : 'XBOX ONE'},
    'Игровые консоли - PS4' : {'field' : 'Consoles', 'category' : 'PS4'},
    'Игровые консоли - PS2' : {'field' : 'Consoles', 'category' : 'PS2'},

    # Books
    'Книги - Методические материалы 1С' : {'field' : 'Books', 'category' : 'Methods'},
    'Книги - Аудиокниги' : {'field' : 'Books', 'category' : 'Audiobooks'},
    'Книги - Аудиокниги 1С' : {'field' : 'Books', 'category' : 'Audiobooks'},
    'Книги - Комиксы, манга' : {'field' : 'Books', 'category' : 'Manga / Comics'},
    'Книги - Артбуки, энциклопедии' : {'field' : 'Books', 'category' : 'Artbooks / Encyclopedias'},
    'Книги - Бизнес литература' : {'field' : 'Books', 'category' : 'Business Literature'},
    'Книги - Путеводители' : {'field' : 'Books', 'category' : 'Guides'},
    'Книги - Художественная литература' : {'field' : 'Books', 'category' : 'Fiction'},
    'Книги - Открытки' : {'field' : 'Books', 'category' : 'Postcards'},
    'Книги - Компьютерная литература' : {'field' : 'Books', 'category' : 'Computer literature'},
    'Книги - Познавательная литература' : {'field' : 'Books', 'category' : 'Education'},
    'Книги - Цифра' : {'field' : 'Books', 'category' : 'Digit'},
    'Книги - Аудиокниги (Цифра)' : {'field' : 'Books', 'category' : 'Audiobooks'},

    # Services
    'Билеты (Цифра)' : {'field' : 'Services', 'category' : 'Tickets'},
    'Служебные - Билеты' : {'field' : 'Services', 'category' : 'Tickets'},
    'Доставка товара' : {'field' : 'Services', 'category' : 'Delivery'},

    # Media
    'Чистые носители (штучные)' : {'field' : 'Media', 'category' : 'Piece'},
    'Чистые носители (шпиль)' : {'field' : 'Media', 'category' : 'Spire'},
}

# Translate every row; a label missing from the mapping raises KeyError instead
# of silently producing an empty value.
category_labels = df_train["item_category_name (RUSSIA)"]
df_train["item_field"] = category_labels.apply(lambda label: item_category_mapping[label]["field"])
df_train["item_category"] = category_labels.apply(lambda label: item_category_mapping[label]["category"])
df_train

### Shops

In [None]:
# Russian city token (first word of the shop name) -> English city name.
city_translations = {
    'Ярославль': 'Yaroslavl',
    'Москва': 'Moscow',
    'Воронеж': 'Voronezh',
    '!Якутск': 'Yakutsk',  # Variant spelling with a leading '!', same city as 'Якутск'
    'Коломна': 'Kolomna',
    'Калуга': 'Kaluga',
    'Н.Новгород': 'Nizhny Novgorod',
    'Чехов': 'Chekhov',
    'Химки': 'Khimki',
    'Сургут': 'Surgut',
    'СПб': 'St. Petersburg',
    'Тюмень': 'Tyumen',
    'Омск': 'Omsk',
    'Самара': 'Samara',
    'Новосибирск': 'Novosibirsk',
    'Сергиев': 'Sergiyev Posad',
    'Якутск': 'Yakutsk',
    'Курск': 'Kursk',
    'Красноярск': 'Krasnoyarsk',
    'Волжский': 'Volzhsky',
    'Адыгея': 'Adygea',  # This is a region, not a city
    'Жуковский': 'Zhukovsky',
    'Казань': 'Kazan',
    'Интернет-магазин': 'Online Store',  # Not a city
    'Уфа': 'Ufa',
    'РостовНаДону': 'Rostov-on-Don',
    'Вологда': 'Vologda',
    'Томск': 'Tomsk',
    'Мытищи': 'Mytishchi',
    'Балашиха': 'Balashikha',
    'Выездная': 'Mobile',  # Not a city
    'Цифровой': 'Digital',  # Not a city
}
# Russian city token -> English name of the region it belongs to.
city_regions = {
    'Ярославль': 'Yaroslavl Oblast',
    'Москва': 'Moscow',
    'Воронеж': 'Voronezh Oblast',
    '!Якутск': 'Sakha Republic (Yakutia)',  # Variant spelling with a leading '!'
    'Коломна': 'Moscow Oblast',
    'Калуга': 'Kaluga Oblast',
    'Н.Новгород': 'Nizhny Novgorod Oblast',
    'Чехов': 'Moscow Oblast',
    'Химки': 'Moscow Oblast',
    'Сургут': 'Khanty-Mansi Autonomous Okrug',
    'СПб': 'St. Petersburg',
    'Тюмень': 'Tyumen Oblast',
    'Омск': 'Omsk Oblast',
    'Самара': 'Samara Oblast',
    'Новосибирск': 'Novosibirsk Oblast',
    'Сергиев': 'Moscow Oblast',
    'Якутск': 'Sakha Republic (Yakutia)',
    'Курск': 'Kursk Oblast',
    'Красноярск': 'Krasnoyarsk Krai',
    'Волжский': 'Volgograd Oblast',
    'Адыгея': 'Republic of Adygea',  # This is a region, not a city
    'Жуковский': 'Moscow Oblast',
    'Казань': 'Republic of Tatarstan',
    'Интернет-магазин': 'Online Store',  # Not a city
    'Уфа': 'Republic of Bashkortostan',
    'РостовНаДону': 'Rostov Oblast',
    'Вологда': 'Vologda Oblast',
    'Томск': 'Tomsk Oblast',
    'Мытищи': 'Moscow Oblast',
    'Балашиха': 'Moscow Oblast',
    'Выездная': 'Mobile',  # Not a city
    'Цифровой': 'Digital',  # Not a city
}
# Second word of the shop name -> English label. Some entries are not shop types
# but simply whatever word follows the city token in the shop name.
type_translations = {
    'ТЦ': 'Shopping Center',
    'ТРК': 'Shopping and Entertainment Complex',
    'ТК': 'Trade Center',
    'МТРЦ': 'Mega Trade and Recreation Center',
    'Плехановская': 'Plekhanovskaya',
    'ТРЦ': 'Shopping Mall',
    'Орджоникидзе': 'Ordzhonikidze',
    'Посад': 'Posad',
    'Магазин': 'Shop',
    'ул.': 'Street',
    'ЧС': 'Emergency',
    'Торговля': 'Trade',
    'Распродажа': 'Sale',
    'склад': 'Warehouse'
}
# Second word of the shop name -> coarse shop family.
mega_type = {
    'ТЦ': 'Shopping',
    'ТРК': 'Shopping',
    'ТК': 'Trade',
    'МТРЦ': 'Mega',
    'Плехановская': 'Other',
    'ТРЦ': 'Shopping',
    'Орджоникидзе': 'Other',
    'Посад': 'Other',
    'Магазин': 'Shopping',
    'ул.': 'Other',
    'ЧС': 'Other',
    'Торговля': 'Trade',
    'Распродажа': 'Sale',
    'склад': 'Other',
}

shop_names = df_train["shop_name (RUSSIA)"]

# City token: first space-separated word of the shop name.
df_train["city_rus"] = shop_names.apply(lambda x: x.split(" ")[0].strip())
# Both lookups start from the same stripped token so they cannot disagree.
df_train["city_eng"] = df_train["city_rus"].apply(lambda x: city_translations[x])
df_train["region"] = df_train["city_rus"].apply(lambda x: city_regions[x])

# Type token: second word of the shop name, without '(' ',' and '"'.
# A shop name made of a single word raises IndexError here.
df_train["type_rus"] = shop_names.apply(lambda x: x.split(" ")[1].strip().replace("(", "").replace(",", "").replace('"', ''))
df_train["type_eng"] = df_train["type_rus"].apply(lambda x: type_translations[x])
df_train["mega_type"] = df_train["type_rus"].apply(lambda x: mega_type[x])
df_train

### item_name (RUSSIA)

### Encoding

In [None]:
# Feature engineering -> No nan values.
# Every column is either encoded (and its fitted encoder stored in `encoders`)
# or left out of the final frame:
#   - 'date'            -> seconds since a fixed reference, then min-max scaled
#   - integer columns   -> LabelEncoder
#   - float columns     -> MinMaxScaler
#   - listed text ones  -> LabelEncoder
#   - anything else     -> dropped (raw Russian names, intermediate tokens)

# Text columns that are label-encoded.
LABEL_ENCODED_COLUMNS = [
    "item_category_name (RUSSIA)",
    "item_field",
    "item_category",
    "city_eng",
    "region",
    "type_eng",
    "mega_type",
]
# Fixed reference instant for the date conversion. `pd.datetime` no longer exists
# in current pandas, and a "now" reference made the fitted scaler depend on when
# the cell was run. The min-max scaled values do not depend on the reference.
DATE_REFERENCE = pd.Timestamp("1970-01-01")

encoders = {}
for index, c_name in enumerate(df_train.columns) :
    print("{}). {}".format(index, c_name))
    c_type = df_train[c_name].dtype
    print(f"Type : {c_type}")
    print("Nan count : {}".format(df_train[c_name].isnull().sum()))
    print("Unique values : {}".format(len(df_train[c_name].unique())))

    # Pick the values to encode and the encoder for this column.
    if c_name == 'date':
        values = (pd.to_datetime(df_train[c_name]) - DATE_REFERENCE).dt.total_seconds()
        enc = MinMaxScaler()
    # Dtype families are tested instead of the exact names "int64"/"float64":
    # recent pandas versions return int32 from `.dt.year` and friends, and an
    # exact match would silently drop those columns.
    elif pd.api.types.is_integer_dtype(c_type):
        values = df_train[c_name]
        enc = LabelEncoder()
    elif pd.api.types.is_float_dtype(c_type):
        values = df_train[c_name]
        enc = MinMaxScaler()
    elif c_name in LABEL_ENCODED_COLUMNS:
        values = df_train[c_name]
        enc = LabelEncoder()
    else:
        # Not a feature: the column is removed by the selection after the loop.
        continue

    print("Encoding {}".format(c_name))
    if isinstance(enc, MinMaxScaler):
        # MinMaxScaler works on 2-D input; flatten the result back to one column.
        df_train[c_name] = enc.fit_transform(values.to_numpy().reshape(-1, 1)).ravel()
    else:
        df_train[c_name] = enc.fit_transform(values)
    encoders[c_name] = enc

# Keep only the columns that were encoded.
df_train = df_train[list(encoders.keys())]
print(f"df_train shape: {df_train.shape}.")

In [None]:
# Preview the first rows of the encoded training frame.
df_train.head()

### Getting X, y and split

In [None]:
# Target: the 'item_cnt_day' column. Features: every other column.
y = df_train["item_cnt_day"]
X = df_train.drop(columns=["item_cnt_day"])
print(f"Shape X:{X.shape}, shape y:{y.shape}.")

# Hold out 30% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(f"Shape X_train:{X_train.shape}, shape y_train:{y_train.shape}.")
print(f"Shape X_test:{X_test.shape}, shape y_test:{y_test.shape}.")

In [None]:
# Preview the first rows of the training features.
X_train.head()

### model random forest

In [None]:
# Random forest regressor. `random_state` is fixed (same seed as the train/test
# split) so that the fitted forest, and therefore the scores below, are the same
# on every run.
model = RandomForestRegressor(
    n_estimators=150,
    max_depth=6,
    ccp_alpha=0.3,
    random_state=42,
    verbose=2,
)
model.fit(X_train, y_train)

In [None]:
# Predict the held-out rows once and reuse the predictions for every metric,
# instead of running the whole forest three times.
y_pred = model.predict(X_test)

MSE_score = mean_squared_error(y_test, y_pred)   # mean squared error
MAE_score = mean_absolute_error(y_test, y_pred)  # mean absolute error
ME_score = max_error(y_test, y_pred)             # largest absolute error
print(f"MSE: {MSE_score}.")
print(f"MAE: {MAE_score}.")
print(f"ME: {ME_score}.")

### Push 

In [None]:
def generate_submission_file_based_on_prediction(
    result,
    source_file_path="data/sample_submission.csv",
    id_column="ID",
    target_column="item_cnt_month",
    output_file_path="data/my_submission.csv",
):
    """Write a submission file holding the predictions found in `result`.

    The sample submission provides the list of expected ids; the predictions are
    attached to it with a left join on the index, so every expected id is kept.

    Parameters
    ----------
    result : pandas.DataFrame
        Predictions indexed by the submission id, with a `target_column` column.
    source_file_path : str
        Path of the sample submission .csv file.
    id_column : str
        Name of the id column of the sample submission.
    target_column : str
        Name of the predicted column, in both the sample file and `result`.
    output_file_path : str
        Path of the .csv file to write.

    Raises
    ------
    ValueError
        If the join changes the number of rows (e.g. duplicated ids in `result`).
    KeyError
        If `id_column` or `target_column` is missing from the inputs.
    """
    # The defaults are the columns this notebook already uses for this
    # competition ('ID', 'item_cnt_month'); the previous hard-coded names
    # ('PassengerId', 'Transported') are used nowhere else in it.
    df = pd.read_csv(
        filepath_or_buffer=source_file_path,
        sep=",",
    ).set_index(id_column)
    print(f"Entry: {df.shape}.")
    merge_df = pd.merge(
        left=df,
        right=result,
        how="left",
        left_index=True,
        right_index=True,
    )
    print(f"Output: {merge_df.shape}.")
    if df.shape[0] != merge_df.shape[0]:
        raise ValueError("Should be same size.")

    # Both inputs carry `target_column`, so the merge suffixes them with _x
    # (placeholder from the sample file) and _y (prediction). Keep the prediction.
    placeholder_column = f"{target_column}_x"
    predicted_column = f"{target_column}_y"
    merge_df = merge_df[[placeholder_column, predicted_column]]
    merge_df = (
        merge_df
        .rename_axis(id_column)  # make sure the id column keeps its name
        .reset_index()
        .rename(columns={predicted_column: target_column})
        .drop(columns=[placeholder_column])
    )
    merge_df.to_csv(output_file_path, sep=",", index=False)
    
def generate_random_submission(
    source_file_path="data/sample_submission.csv",
    output_file_path="data/my_submission.csv",
    seed=None,
):
    """Write a submission whose 'item_cnt_month' values are uniform in [0, 1].

    Parameters
    ----------
    source_file_path : str
        Path of the sample submission .csv file (provides the rows to fill).
    output_file_path : str
        Path of the .csv file to write.
    seed : int or None
        When given, the values are drawn from a private generator seeded with it,
        so the file is reproducible. When None, the global `random` state is used.
    """
    df = pd.read_csv(
        filepath_or_buffer=source_file_path,
        sep=",",
    )
    rng = random if seed is None else random.Random(seed)
    # One independent draw per row of the sample submission.
    df["item_cnt_month"] = [rng.uniform(0.0, 1.0) for _ in range(len(df))]
    df.to_csv(output_file_path, sep=",", index=False)
    # Report success only once the file has actually been written.
    print(f"File {output_file_path} successfully generated.\n")
    
def _load_kaggle_credentials(credential_file="kaggle.json"):
    """Export the username/key stored in `credential_file` as KAGGLE_* variables."""
    with open(credential_file) as credential:
        json_credential = json.loads(credential.read())
    os.environ["KAGGLE_USERNAME"] = json_credential["username"]
    os.environ["KAGGLE_KEY"] = json_credential["key"]


def submit_submission(
    submission_file="data/my_submission.csv",
    competition="competitive-data-science-predict-future-sales",
    credential_file="kaggle.json",
):
    """Upload `submission_file` through the `kaggle` command and print its output.

    Parameters
    ----------
    submission_file : str
        Path of the .csv file to submit.
    competition : str
        Competition identifier passed to the `kaggle` command.
    credential_file : str
        JSON file holding the "username" and "key" entries.

    Raises
    ------
    subprocess.CalledProcessError
        If the `kaggle` command exits with a non-zero status.
    """
    _load_kaggle_credentials(credential_file)
    result = subprocess.check_output(
        [
            "kaggle",
            "competitions",
            "submit",
            competition,
            "-f",
            submission_file,
            "-m",
            # Submission message: timestamp of the upload.
            f"{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: New submission",
        ]
    ).decode("utf-8")
    print(result)


def get_latest_score(
    team_id="10059555",
    competition="competitive-data-science-predict-future-sales",
    credential_file="kaggle.json",
):
    """Print the submission list of `competition` as reported by the `kaggle` command.

    Parameters
    ----------
    team_id : str or int
        Value exported as KAGGLE_TEAM_ID.
    competition : str
        Competition identifier passed to the `kaggle` command.
    credential_file : str
        JSON file holding the "username" and "key" entries.

    Raises
    ------
    subprocess.CalledProcessError
        If the `kaggle` command exits with a non-zero status.
    """
    _load_kaggle_credentials(credential_file)
    # Environment values must be strings; accept an integer id as well.
    os.environ["KAGGLE_TEAM_ID"] = str(team_id)
    result = subprocess.check_output(
        ["kaggle", "competitions", "submissions", competition]
    ).decode("utf-8")
    print(result)

# Write a random submission file, upload it with the kaggle command, then print
# the submission list.
# NOTE(review): the uploaded file holds random values, not the predictions of the
# model trained above — confirm this is intended. Running this cell performs a
# real upload.
generate_random_submission()
submit_submission()
get_latest_score()

In [None]:
# Print the submission list again.
get_latest_score()