# IMPORTS

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import datetime
from fbprophet import Prophet
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from scipy.fft import dct, dst


# Initial Cleaning

In [None]:
# Load the raw train and test tables; train is read first, then test.
train_init, test_init = (
    pd.read_csv('../input/av-demand-prediction/train_0irEZ2H.csv'),
    pd.read_csv('../input/av-demand-prediction/test_nfaJ3J5.csv'),
)

In [None]:
# Display the freshly loaded training frame (last expression of the cell is rendered).
train_init

In [None]:
# Summarise missing values per column in a single pass. The counts are taken
# BEFORE the fill below so the summary reflects the raw data.
missing_per_column = train_init.isnull().sum()

# Count rows where total_price is already exactly 0, since 0 is the value used
# to fill missing prices in the next step.
zero_total_price_rows = int((train_init['total_price'] == 0).sum())
print("rows with total_price == 0 before fill:", zero_total_price_rows)

# Replace missing total_price values with 0.
train_init['total_price'] = train_init['total_price'].fillna(0)

# Last expression of the cell: render the per-column missing-value counts.
missing_per_column

In [None]:
# Display the training frame again, now that missing total_price values have been filled with 0.
train_init

# Feature Extraction

In [None]:
def create_date_features(df,colDt,inclTime=False):
    """Add calendar features derived from a date column to ``df``.

    The column ``colDt`` is parsed with ``pd.to_datetime`` and the columns
    ``Month``, ``Dayofweek``, ``Week`` and ``Quarter`` are written onto ``df``.
    ``Hour`` is added as well when ``inclTime`` is true. The frame is modified
    in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``colDt``; receives the new columns.
    colDt : str
        Name of the column holding the dates.
    inclTime : bool, default False
        Also emit the ``Hour`` feature.

    Returns
    -------
    tuple
        ``(df, cols_dat)`` where ``cols_dat`` lists the names of the columns
        that were added, in the order they were added.
    """
    # Parse the column once instead of once per feature.
    # NOTE(review): no explicit format / dayfirst is given, so ambiguous
    # day-month strings are resolved by pandas' default inference - confirm
    # that matches the data.
    dates = pd.to_datetime(df[colDt])

    # Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
    # isocalendar().week is its replacement. Fall back to the legacy accessor
    # only on pandas versions that predate isocalendar().
    if hasattr(dates.dt, 'isocalendar'):
        week = dates.dt.isocalendar().week
        # isocalendar() returns a nullable UInt32 column; convert to the plain
        # numeric dtype the legacy accessor produced (float when dates are missing).
        week = week.astype('float64') if week.isna().any() else week.astype('int64')
    else:
        week = dates.dt.week

    derived = {
        'Month': dates.dt.month,
        'Dayofweek': dates.dt.dayofweek,
        'Week': week,
        'Quarter': dates.dt.quarter,
    }
    if inclTime:
        derived['Hour'] = dates.dt.hour

    cols_dat = []
    for name, values in derived.items():
        df[name] = values
        cols_dat.append(name)

    return df,cols_dat

In [None]:
def create_aggregates_count(df,colagg,coltar,nsuffix):
    """Attach a per-group count of non-null target values to every row.

    Rows with a missing target are dropped, the rest are grouped by ``colagg``
    and counted, and the counts are left-merged back onto ``df``. The new
    column is named ``<target>_count_<nsuffix><number of grouping columns>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    colagg : list of str or str
        Grouping column(s).
    coltar : list of str or str
        Target column(s); rows missing any of them are dropped, and only the
        first one is counted.
    nsuffix : object
        Value stringified into the new column's name.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the count column added. Groups that have no non-null
        target get NaN.
    """
    # Accept a bare column name as well as a list. With a bare string the old
    # code used the string's character count in the column name and indexed
    # the target by its first character.
    group_keys = [colagg] if isinstance(colagg, str) else list(colagg)
    targets = [coltar] if isinstance(coltar, str) else list(coltar)
    target = targets[0]

    new_col = "{}_count_{}{}".format(target, nsuffix, len(group_keys))
    per_group = (
        df.dropna(subset=targets)
        .groupby(by=group_keys)[target]
        .count()
        .rename(new_col)
        .reset_index()
    )
    return df.merge(per_group, how='left', on=group_keys)

def create_aggregates(df,colagg,coltar,nsuffix):
    """Attach the per-group skewness of a target column to every row.

    Rows with a missing target are dropped, the rest are grouped by ``colagg``
    and the skew of the first target column is computed per group, then
    left-merged back onto ``df``. The new column is named
    ``<target>_skew_<nsuffix><number of grouping columns>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    colagg : list of str or str
        Grouping column(s).
    coltar : list of str or str
        Target column(s); rows missing any of them are dropped, and only the
        first one is aggregated.
    nsuffix : object
        Value stringified into the new column's name.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the skew column added.
    """
    # Accept a bare column name as well as a list. With a bare string the old
    # code used the string's character count in the column name and indexed
    # the target by its first character.
    group_keys = [colagg] if isinstance(colagg, str) else list(colagg)
    targets = [coltar] if isinstance(coltar, str) else list(coltar)
    target = targets[0]

    new_col = "{}_skew_{}{}".format(target, nsuffix, len(group_keys))
    per_group = (
        df.dropna(subset=targets)
        .groupby(by=group_keys)[target]
        .agg('skew')
        .rename(new_col)
        .reset_index()
    )
    return df.merge(per_group, how='left', on=group_keys)

In [None]:
def features(df):
    """Build the model feature frame from a raw table that has a ``week`` column.

    Adds the calendar features produced by ``create_date_features``, an
    integer encoding of ``week`` (``weeks``), and the ``cos`` / ``sin`` columns
    computed from that encoding with ``dct`` / ``dst``. The raw ``week`` column
    is dropped from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; it is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame with the added feature columns and without ``week``.
    """
    from sklearn.preprocessing import LabelEncoder

    # create_date_features writes its columns onto the frame it is given, so
    # work on a copy to keep the caller's raw table unchanged.
    df = df.copy()
    df, _ = create_date_features(df,'week')

    # NOTE(review): the encoder is fitted on whichever frame is passed in, so
    # separate calls (e.g. train vs. test) get independent encodings, and the
    # codes follow the sorted order of the raw 'week' values - confirm that
    # this is intended.
    df['weeks'] = LabelEncoder().fit_transform(df['week'])

    # NOTE(review): the transforms are taken over the whole 'weeks' column in
    # row order, not per row - confirm this is the intended feature.
    df['cos'] = dct(df['weeks'])
    df['sin'] = dst(df['weeks'])

    return df.drop(['week'], axis = 1)
    

In [None]:
# Run the feature pipeline on the raw training table, then display the result.
train_plus = features(train_init)
train_plus

In [None]:
# Partition the columns of train_plus into target, ignored identifier and model inputs.
cols_all=train_plus.columns.tolist()
cols_y=['units_sold']
cols_ignore=['record_ID']
# Keep the frame's own column order. Going through set() yields an order that
# can differ between interpreter runs (string hashing is randomised), which
# makes column-order-dependent steps such as feature sub-sampling irreproducible.
cols_x=[col for col in cols_all if col not in cols_y and col not in cols_ignore]

In [None]:
# Separate target and inputs, then hold out 40% of the rows for validation.
y = train_plus[cols_y]
X = train_plus[cols_x]
# Fix the shuffle seed so the same rows land in train/validation on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.4, random_state = 42)

In [None]:
import math

def rmsle(preds, train_data):
    """Root mean squared logarithmic error, in LightGBM's custom-metric form.

    Usable as the ``feval`` argument of ``lgb.train``.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    train_data : object with ``get_label()``
        Dataset the predictions belong to; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('rmsle', value, False)`` - metric name, metric value, and a flag
        saying that higher is NOT better. LightGBM requires this triple from
        a custom evaluation function.
    """
    labels = np.asarray(train_data.get_label(), dtype=float)
    preds = np.asarray(preds, dtype=float)
    assert len(preds) == len(labels)

    # A plain regression objective can emit negative predictions, for which
    # the log error is undefined; floor them at 0 instead of failing mid-training.
    preds = np.clip(preds, 0, None)

    # Square root of the mean squared log error (the name promises RMSLE, not MSLE).
    score = float(np.sqrt(np.mean((np.log1p(labels) - np.log1p(preds)) ** 2)))
    return 'rmsle', score, False

In [None]:
#def rmsle(preds, train_data):
    #labels = train_data.get_label()
    #preds = 1. / (1. + np.exp(-preds))
    #return 'error', np.mean(labels != (preds > 0.5)), False



In [None]:
# LightGBM hyper-parameters for a plain regression objective.
params = {
        "objective" : "regression",
        #"boosting_type":"dart",
        #"metric" : "rmse",
        # A custom metric cannot be named inside params (an unknown "feval" key
        # is ignored). To use one, pass a callable through lgb.train(feval=...)
        # that returns (name, value, is_higher_better).
        "num_leaves" : 64,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        # The parameter is spelled "bagging_freq"; under the previous key
        # "bagging_frequency" it was not recognised, so bagging never ran and
        # bagging_fraction had no effect.
        "bagging_freq" : 6,
        "bagging_seed" : 42,
        "verbose" : 1,
        "seed": 42,
        }
lgtrain = lgb.Dataset(X_train, label=y_train)
lgval = lgb.Dataset(X_val, label=y_val)
# Filled by LightGBM with the validation metric history.
evals_result = {}
# Up to 50000 rounds, stopping once the validation metric has not improved for 100 rounds.
model = lgb.train(params, lgtrain, 50000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=50, evals_result=evals_result)

In [None]:
def submission(X_test):
    """Predict ``units_sold`` for a raw test table and format it for submission.

    The table is passed through ``features``, scored with the trained
    ``model`` on the ``cols_x`` columns, and predictions at or below 1 are
    raised to 1.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Raw test table containing ``record_ID`` and the columns ``features`` needs.

    Returns
    -------
    pandas.DataFrame
        One ``units_sold`` column indexed by ``record_ID``.
    """
    X_test = features(X_test)
    y_test = model.predict(X_test[cols_x], num_iteration=model.best_iteration)
    # Floor predictions at 1.
    y_test[y_test<=1] = 1
    #y_test = np.abs(y_test)

    # Pair ids and predictions by position. Concatenating a fresh Series with
    # the id column aligns on index labels instead, which scrambles or blanks
    # rows whenever X_test does not carry a default 0..n-1 index.
    record_ids = pd.Index(X_test['record_ID'], name='record_ID')
    return pd.DataFrame({'units_sold': y_test}, index=record_ids)

In [None]:
# to_csv returns None when given a path, so keep the frame in `sub` and write it as a separate step.
sub = submission(test_init)
sub.to_csv('sub.csv')