In [1]:
import pandas as pd
import warnings
from path_utils import sales_fix_path, test_fix_path, save_final_to, shops_fix_path, final_data_path, \
    item_categories_fix_path, items_fix_path
from src.modeling.building_features import FeatureModeling
from src.modeling.validation_schema import ValidationSchema
from src.modeling.training_schema import TrainingModel
from src.modeling.utils import create_kaggle_data
from hyperopt import hp


# Silence every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and pandas warnings that may point at real
# problems; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

I see 1 GPU devices


In [ ]:
# Baseline column setup: the model sees only the three raw identifier columns,
# and all three are treated as categorical.
target = ['item_cnt_month']
in_features = ['date_block_num', 'shop_id', 'item_id']
cat_cols = list(in_features)  # separate list object with the same three names

In [None]:
# Read the sales history first, then the test set, from the paths exported by path_utils.
train, test = pd.read_csv(sales_fix_path), pd.read_csv(test_fix_path)

In [None]:
# Feature builder for the baseline experiment (run below with make_big=False).
fe_simple = FeatureModeling()

In [ ]:
# Register the four input files (sales, shops, items, item categories) with the builder.
fe_simple.set_data_paths(
    sales_path=sales_fix_path,
    shop_path=shops_fix_path,
    item_path=items_fix_path,
    item_categories_path=item_categories_fix_path,
)

In [None]:
# Build the modeling table from the raw frames; make_big=False leaves out the
# expansion to every date_block_num / shop_id / item_id combination.
fe_simple.create_final_data(train=train, test=test, make_big=False)

In [None]:
# Load the data set named "final_data" into the builder.
# NOTE(review): presumably the file produced by create_final_data above — confirm.
fe_simple.load_data(file_name="final_data")

In [None]:
# The validation schema is given final_data_path (imported from path_utils) rather than
# the builder's in-memory frame.
vs = ValidationSchema(data=final_data_path)

In [None]:
# NOTE(review): train_size=32 presumably counts date_block_num periods used for
# training — confirm against ValidationSchema.train_test_spliter.
validation_dict = vs.train_test_spliter(train_size=32)

In [ ]:
# Pair the builder's current frame with the split definition created above.
model = TrainingModel(data=fe_simple.get_data(), validation_dict=validation_dict)

In [None]:
# Train on the three identifier columns only and keep the returned predictions.
preds = model.train_model(
    in_features=in_features,
    target=target,
    cat_features=cat_cols,
)

In [ ]:
# Export the baseline predictions under the name "kaggle" in save_final_to
# (presumably as the Kaggle submission file — confirm in src.modeling.utils).
create_kaggle_data(predictions=preds, file_name="kaggle", save_path=save_final_to)



**Baseline: without any feature extraction and without the option to include all possible combinations of date_block_num, shop_id, and item_id (`make_big=False`), the public Kaggle score is 1.52 — surprisingly poor.**


<img src="../data/images/kaggle_score_1.png">


In [None]:
# Re-read both inputs so the second experiment starts from freshly loaded frames.
# NOTE(review): presumably needed because create_final_data may modify its inputs — confirm.
train = pd.read_csv(sales_fix_path)
test = pd.read_csv(test_fix_path)

In [None]:
# Separate builder for the experiment that runs create_final_data with make_big=True.
fe_with_comb = FeatureModeling()

In [ ]:
# Same four input files as the baseline builder.
fe_with_comb.set_data_paths(
    sales_path=sales_fix_path,
    shop_path=shops_fix_path,
    item_path=items_fix_path,
    item_categories_path=item_categories_fix_path,
)

In [None]:
# make_big=True is the only argument that differs from the baseline call: it turns on
# the option to include every date_block_num / shop_id / item_id combination.
fe_with_comb.create_final_data(train=train, test=test, make_big=True)

In [None]:
# Load the data set named 'final_data' into this builder.
fe_with_comb.load_data(file_name='final_data')

In [None]:
# Validation schema built from the final-data path, spelled with the explicit keyword.
vs = ValidationSchema(data=final_data_path)

In [None]:
# Same split setting (train_size=32) as the baseline run, so the two are comparable.
validation_dict = vs.train_test_spliter(train_size=32)

In [ ]:
# Model wrapper over the expanded frame held by fe_with_comb.
model = TrainingModel(
    data=fe_with_comb.get_data(),
    validation_dict=validation_dict,
)

In [None]:
# Same inputs as the baseline (three identifier columns); only the underlying data differs.
preds = model.train_model(in_features=in_features, target=target, cat_features=cat_cols)

In [None]:
# Export under a name distinct from the baseline run's "kaggle" so that this
# submission does not replace the earlier one in save_final_to.
create_kaggle_data(predictions=preds, file_name="kaggle_with_comb", save_path=save_final_to)


**Without feature extraction, but with the option to include all possible combinations of date_block_num, shop_id, and item_id (`make_big=True`), the public Kaggle score is 1.16.**


<img src="../data/images/kaggle_score_2.png">


In [None]:
# Fresh copies of the raw inputs for the stationarity experiment.
train = pd.read_csv(sales_fix_path)
test = pd.read_csv(test_fix_path)

In [None]:
# Builder for the stationarity experiment.
# NOTE(review): unlike the other experiments in this notebook, set_data_paths is not
# called on this instance before create_final_data — confirm that is intentional.
fe = FeatureModeling()

In [None]:
# Build the expanded (make_big=True) modeling table for this experiment.
fe.create_final_data(train=train, test=test, make_big=True)

In [None]:
# Keep a handle on the builder's frame; it is written to CSV and passed to TrainingModel below.
df = fe.get_data()

In [None]:
# WARNING: do not run this — the stationarity processing takes about 4 hours.
# Disabled step 1: total item_cnt_month for every (shop_id, item_id) pair.

# shop_item_df = df.groupby(["shop_id", "item_id"], as_index=False)['item_cnt_month'].sum()

In [None]:
# WARNING: do not run this — the stationarity processing takes about 4 hours.
# Disabled step 2: reduce the aggregate to a list of {'shop_id', 'item_id'} records.

# shop_item_df = shop_item_df[['shop_id', 'item_id']].to_dict(orient='records')

In [None]:
# WARNING: do not run this — the stationarity processing takes about 4 hours.
# Disabled step 3: for every pair with at least 20 rows, overwrite item_cnt_month with the
# result of fe.ts_nonstatinarity_processing. Each iteration filters the whole frame twice
# with boolean masks, so the cost scales with (number of pairs) x (rows in df).

# for it, row in enumerate(shop_item_df):
#     shop_item_ts: pd.DataFrame = df[(df["shop_id"] == row["shop_id"]) & (df["item_id"] == row["item_id"])]
#     if shop_item_ts.shape[0] >= 20:
#         proc_shop_item_ts: pd.DataFrame = fe.ts_nonstatinarity_processing(shop_item_ts, "item_cnt_month")
#         df.loc[
#             (df["shop_id"] == row["shop_id"]) & (df["item_id"] == row["item_id"]), "item_cnt_month"] = proc_shop_item_ts
#         print("Iter {}: TS for {} shop and {} item processed.".format(it, row['shop_id'], row['item_id']))

In [None]:
# Persist the frame as diff_data.csv under save_final_to, without the index column.
# With the processing loop above disabled, this is the frame exactly as get_data() returned it.
df.to_csv(f'{save_final_to}diff_data.csv', index=False)

In [None]:
# Build the validation schema from the diff_data CSV, composing the path from the same
# save_final_to prefix that df.to_csv uses so the read and write locations cannot drift apart.
vs = ValidationSchema(save_final_to + 'diff_data' + '.csv')

In [None]:
# NOTE(review): the first two experiments pass train_size=32 here; this call relies on the
# method's default — confirm the split is the same so the scores stay comparable.
validation_dict = vs.train_test_spliter()

In [ ]:
# The model gets the in-memory frame directly, not a reload of the CSV written above.
model = TrainingModel(data=df, validation_dict=validation_dict)

In [None]:
# Train with the column lists currently in scope and keep the predictions.
preds = model.train_model(
    in_features=in_features,
    target=target,
    cat_features=cat_cols,
)

In [None]:
# Export this experiment's predictions under its own name, "kaggle_diff".
create_kaggle_data(predictions=preds, file_name="kaggle_diff", save_path=save_final_to)

**With the time-series stationarity processing applied and the option to include all possible combinations of date_block_num, shop_id, and item_id (`make_big=True`), the public Kaggle score is 1.18.**


<img src="../data/images/kaggle_score_3.png">

In [None]:
# Fresh copies of the raw inputs for the feature-modeling experiment.
train = pd.read_csv(sales_fix_path)
test = pd.read_csv(test_fix_path)

In [None]:
# New builder for the feature-modeling experiment; rebinding `fe` drops the previous instance.
fe = FeatureModeling()

In [None]:
# Register the four input files (sales, shops, items, item categories) with the builder.
fe.set_data_paths(
    sales_path=sales_fix_path,
    shop_path=shops_fix_path,
    item_path=items_fix_path,
    item_categories_path=item_categories_fix_path,
)

In [None]:
# Build the expanded (make_big=True) table that the feature steps below extend.
fe.create_final_data(train=train, test=test, make_big=True)

In [None]:
# Ordered feature-building steps handed to fe.add_features in the next cell.
# Every entry is a zero-argument callable: bound methods are listed as-is, and steps
# that need arguments are wrapped in a lambda so the call is deferred.
# NOTE(review): the order presumably matters — e.g. the lag step on 'item_mean_price'
# appears to rely on add_mean_price having run first; confirm before reordering.
feature_functions = [
    fe.add_mean_price,
    fe.add_city_features,
    fe.add_item_features,
    fe.add_item_categories_features,
    # Mean features indexed by date_block_num + item_id, then additionally by city.
    lambda: fe.add_mean_features(idx_features=['date_block_num', 'item_id']),
    lambda: fe.add_mean_features(idx_features=['date_block_num', 'item_id', 'city']),
    # Three lags of the target (with clip=True) and of item_mean_price per shop/item index.
    lambda: fe.add_lag_features(idx_features=['date_block_num', 'shop_id', 'item_id'], lag_feature='item_cnt_month',
                                nlags=3, clip=True),
    lambda: fe.add_lag_features(idx_features=['date_block_num', 'shop_id', 'item_id'], lag_feature='item_mean_price',
                                nlags=3),
    # Lag the mean features; drop_mean_features=True is passed so only the lags are kept
    # (presumably dropping the un-lagged means — confirm in FeatureModeling).
    lambda: fe.add_lag_mean_features(idx_features=['date_block_num', 'shop_id', 'item_id'], drop_mean_features=True,
                                     nlags=3),
    # Same mean-then-lag pattern for the shop_id / item_category_id index.
    lambda: fe.add_mean_features(idx_features=['date_block_num', 'shop_id', 'item_category_id']),
    lambda: fe.add_lag_mean_features(idx_features=['date_block_num', 'shop_id', 'item_category_id'],
                                     drop_mean_features=True, nlags=3),
    fe.final_process
]

In [None]:
# Hand the step list to the builder (presumably executed in list order — confirm).
fe.add_features(feature_functions)

In [None]:
# NOTE(review): 'fina_data_simp' looks like a typo for 'final_data_simp', but a later cell
# reads a CSV with the same spelling; rename the file and both references together.
fe.load_data(file_name='fina_data_simp')

In [None]:
# Here the schema receives the builder's frame itself, whereas earlier experiments pass a file path.
vs = ValidationSchema(fe.get_data())

In [None]:
# Uses the method's default split arguments (no train_size given).
validation_dict = vs.train_test_spliter()

In [None]:
# Model wrapper over the feature-enriched frame held by the builder.
model = TrainingModel(data=fe.get_data(), validation_dict=validation_dict)

In [None]:
# List the columns of the frame this section's model was built from. In a top-to-bottom
# run the bare name `df` still refers to the previous experiment's frame, so the
# builder's current data is queried instead.
fe.get_data().columns

In [9]:
# Model inputs for the feature-engineered run: nine static columns followed by
# lags 1-3 of each lagged quantity, grouped per quantity.
static_features = ['date_block_num', 'shop_id', 'item_id', 'month',
                   'city', 'shop_category', 'item_category_id',
                   'duration_after_first_sale', 'category']
lagged_quantities = ['item_cnt_month', 'item_mean_price', 'item_id_mean_sales',
                     'item_id_city_mean_sales', 'shop_id_item_category_id_mean_sales']
in_features = static_features + [
    f'{quantity}_lag{lag}' for quantity in lagged_quantities for lag in (1, 2, 3)
]
target = ['item_cnt_month']
# Categorical columns: the same nine static columns, kept in their own explicit order.
cat_cols = [
    'date_block_num', 'duration_after_first_sale', 'shop_id', 'item_id', 'month',
    'city', 'shop_category', 'item_category_id', 'category',
]

In [None]:
# Train on the engineered feature set and keep the predictions for export.
preds = model.train_model(
    in_features=in_features,
    target=target,
    cat_features=cat_cols,
)

In [None]:
# Export this experiment's predictions under its own name, 'kaggle_with_features_1'.
create_kaggle_data(predictions=preds, file_name='kaggle_with_features_1', save_path=save_final_to)

**With feature modeling and the option to include all possible combinations of date_block_num, shop_id, and item_id (`make_big=True`), the public Kaggle score is 1.07.**


<img src="../data/images/kaggle_score_4.png">

In [2]:
# Load the engineered data set straight from CSV into `df` for the parameter search below.
# NOTE(review): the path is hard-coded relative to the notebook and keeps the
# 'fina_data_simp' spelling used by fe.load_data earlier; rename both together if at all.
df = pd.read_csv('../data/final/fina_data_simp.csv')

In [9]:
# Number of columns in the loaded frame (second entry of the shape tuple).
df.shape[1]

25

In [3]:
# Validation schema over the frame loaded from CSV in this section.
vs = ValidationSchema(df)

In [4]:
# Default split arguments.
# NOTE(review): the parameter search at the end of this notebook rejects this
# validation_dict (its recorded output asks for type "full" but got "cv").
validation_dict = vs.train_test_spliter()

In [5]:
# Model wrapper used for the hyper-parameter search below.
model = TrainingModel(data=df, validation_dict=validation_dict)

In [7]:
# Hyperopt search space for the model's training parameters.
# hp.randint(label, low, high) draws integers from [low, high), so the upper bounds are exclusive.
# The parameter names match CatBoost's (assuming TrainingModel wraps CatBoost — confirm), and
# CatBoost accepts tree depths only up to 16, so depth is drawn from 4..16 rather than 4..23
# to keep the search from proposing configurations the trainer rejects.
param_space = {'iterations': hp.randint('iterations', 100, 1000),
               'depth': hp.randint('depth', 4, 17),
               'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
               'l2_leaf_reg': hp.randint('l2_leaf_reg', 1, 10)
               }

In [10]:
# Run the hyper-parameter search over param_space with the engineered feature lists.
# NOTE(review): as this cell's recorded output shows, the call currently fails with
# "ValueError: Invalid validation type technique. Requires full but got cv" — the
# validation_dict built above is of type "cv" while parameter_search requires "full".
# TODO: build the split in the "full" mode (see ValidationSchema) before running this.
best_params = model.parameter_search(param_space=param_space, in_features=in_features, target=target,
                                     cat_features=cat_cols)

ValueError: Invalid validation type technique. Requires full but got cv