In [39]:
#Check if container is passing through + general imports 
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [40]:
# Make Connection
from databricks_connect import connect_explicit
spark = connect_explicit()

#Joining data for demand forecasts
data = spark.sql("""
    SELECT 
        ss_sold_date_sk,
        ss_item_sk,
        ss_store_sk,
        ss_promo_sk,
        ss_quantity,
        d_date,
        d_day_name,
        d_holiday,
        d_following_holiday,
        d_weekend,
        i_item_id,
        s_store_id
        
    FROM samples.tpcds_sf1.store_sales AS ss
    INNER JOIN samples.tpcds_sf1.date_dim AS dd
    ON ss.ss_sold_date_sk = dd.d_date_sk
    INNER JOIN samples.tpcds_sf1.item AS i
    ON ss.ss_item_sk = i.i_item_sk
    INNER JOIN samples.tpcds_sf1.store AS s
    ON ss.ss_store_sk = s.s_store_sk
    """)

# Promotion table with actual dates
promo = spark.sql("""
    SELECT
        p.p_item_sk, 
        p.p_promo_sk,
        p.p_promo_id,
        dd_start.d_date as promo_start_date,
        dd_end.d_date as promo_end_date

    FROM samples.tpcds_sf1.promotion AS p
    LEFT JOIN samples.tpcds_sf1.date_dim AS dd_start
    ON p.p_start_date_sk = dd_start.d_date_sk
    LEFT JOIN samples.tpcds_sf1.date_dim AS dd_end
    ON p.p_end_date_sk = dd_end.d_date_sk
    """)

sales_promo = data.join(
    promo,
    (data.ss_item_sk == promo.p_item_sk) & 
    (data.d_date >= promo.promo_start_date) & 
    (data.d_date <= promo.promo_end_date),
    "left"
).select(
    data["*"],
    promo.p_promo_id,
    promo.promo_start_date,
    promo.promo_end_date
)

sales_promo.show()

+---------------+----------+-----------+-----------+-----------+----------+----------+---------+-------------------+---------+----------------+----------------+----------+----------------+--------------+
|ss_sold_date_sk|ss_item_sk|ss_store_sk|ss_promo_sk|ss_quantity|    d_date|d_day_name|d_holiday|d_following_holiday|d_weekend|       i_item_id|      s_store_id|p_promo_id|promo_start_date|promo_end_date|
+---------------+----------+-----------+-----------+-----------+----------+----------+---------+-------------------+---------+----------------+----------------+----------+----------------+--------------+
|        2451181|     14386|          1|        251|         77|1999-01-02|  Saturday|        N|                  Y|        Y|AAAAAAAACDIDAAAA|AAAAAAAABAAAAAAA|      NULL|            NULL|          NULL|
|        2451181|     11323|          1|          1|         84|1999-01-02|  Saturday|        N|                  Y|        Y|AAAAAAAALDMCAAAA|AAAAAAAABAAAAAAA|      NULL|            N

In [59]:
promo_df = sales_promo.toPandas()
promo_df.head()

Unnamed: 0,ss_sold_date_sk,ss_item_sk,ss_store_sk,ss_promo_sk,ss_quantity,d_date,d_day_name,d_holiday,d_following_holiday,d_weekend,i_item_id,s_store_id,p_promo_id,promo_start_date,promo_end_date
0,2451181,14386,1,251.0,77.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAACDIDAAAA,AAAAAAAABAAAAAAA,,,
1,2451181,11323,1,1.0,84.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAALDMCAAAA,AAAAAAAABAAAAAAA,,,
2,2451181,10141,1,44.0,96.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAANJHCAAAA,AAAAAAAABAAAAAAA,,,
3,2451181,8059,1,72.0,51.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAALHPBAAAA,AAAAAAAABAAAAAAA,,,
4,2451181,7508,1,36.0,96.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAAEFNBAAAA,AAAAAAAABAAAAAAA,,,


In [60]:
promo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2686004 entries, 0 to 2686003
Data columns (total 15 columns):
 #   Column               Dtype  
---  ------               -----  
 0   ss_sold_date_sk      int32  
 1   ss_item_sk           int32  
 2   ss_store_sk          int32  
 3   ss_promo_sk          float64
 4   ss_quantity          float64
 5   d_date               object 
 6   d_day_name           object 
 7   d_holiday            object 
 8   d_following_holiday  object 
 9   d_weekend            object 
 10  i_item_id            object 
 11  s_store_id           object 
 12  p_promo_id           object 
 13  promo_start_date     object 
 14  promo_end_date       object 
dtypes: float64(2), int32(3), object(10)
memory usage: 276.7+ MB


In [61]:
df = promo_df.iloc[:,4:]
df.head()

Unnamed: 0,ss_quantity,d_date,d_day_name,d_holiday,d_following_holiday,d_weekend,i_item_id,s_store_id,p_promo_id,promo_start_date,promo_end_date
0,77.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAACDIDAAAA,AAAAAAAABAAAAAAA,,,
1,84.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAALDMCAAAA,AAAAAAAABAAAAAAA,,,
2,96.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAANJHCAAAA,AAAAAAAABAAAAAAA,,,
3,51.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAALHPBAAAA,AAAAAAAABAAAAAAA,,,
4,96.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAAEFNBAAAA,AAAAAAAABAAAAAAA,,,


In [62]:
df = df.astype({'ss_quantity':float, 
                'd_day_name':str, 
                'd_holiday':str,
                'd_following_holiday':str,
                'd_weekend':str,
                'i_item_id':str,
                's_store_id':str,
                'p_promo_id':str,})

df["promo"] = df["p_promo_id"] != "None"
df = df.drop(["p_promo_id", "promo_start_date", "promo_end_date"], axis = 1)
df["d_date"] = pd.to_datetime(df["d_date"])
df.head()

Unnamed: 0,ss_quantity,d_date,d_day_name,d_holiday,d_following_holiday,d_weekend,i_item_id,s_store_id,promo
0,77.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAACDIDAAAA,AAAAAAAABAAAAAAA,False
1,84.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAALDMCAAAA,AAAAAAAABAAAAAAA,False
2,96.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAANJHCAAAA,AAAAAAAABAAAAAAA,False
3,51.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAALHPBAAAA,AAAAAAAABAAAAAAA,False
4,96.0,1999-01-02,Saturday,N,Y,Y,AAAAAAAAEFNBAAAA,AAAAAAAABAAAAAAA,False


In [63]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2653735 entries, 0 to 2686003
Data columns (total 9 columns):
 #   Column               Dtype         
---  ------               -----         
 0   ss_quantity          float64       
 1   d_date               datetime64[ns]
 2   d_day_name           object        
 3   d_holiday            object        
 4   d_following_holiday  object        
 5   d_weekend            object        
 6   i_item_id            object        
 7   s_store_id           object        
 8   promo                bool          
dtypes: bool(1), datetime64[ns](1), float64(1), object(6)
memory usage: 184.7+ MB


In [64]:
df.describe()

Unnamed: 0,ss_quantity,d_date
count,2653735.0,2653735
mean,50.50909,2000-08-20 18:28:13.620500608
min,1.0,1998-01-02 00:00:00
25%,25.0,1999-06-05 00:00:00
50%,51.0,2000-09-14 00:00:00
75%,76.0,2001-11-17 00:00:00
max,100.0,2003-01-02 00:00:00
std,28.88071,


In [65]:
df_grouped = df.groupby(["d_date","i_item_id"]).sum()
df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ss_quantity,d_day_name,d_holiday,d_following_holiday,d_weekend,s_store_id,promo
d_date,i_item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1998-01-02,AAAAAAAAAAACAAAA,50.0,Friday,N,Y,Y,AAAAAAAACAAAAAAA,0
1998-01-02,AAAAAAAAAAHBAAAA,90.0,Friday,N,Y,Y,AAAAAAAAHAAAAAAA,0
1998-01-02,AAAAAAAAAAHDAAAA,13.0,Friday,N,Y,Y,AAAAAAAAHAAAAAAA,0
1998-01-02,AAAAAAAAAALDAAAA,21.0,Friday,N,Y,Y,AAAAAAAAHAAAAAAA,0
1998-01-02,AAAAAAAAAAMCAAAA,15.0,Friday,N,Y,Y,AAAAAAAAIAAAAAAA,0


In [72]:
df_years = df.groupby(df['d_date'].dt.year).count()
df_years

Unnamed: 0_level_0,ss_quantity,d_date,d_day_name,d_holiday,d_following_holiday,d_weekend,i_item_id,s_store_id,promo
d_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1998,528590,528590,528590,528590,528590,528590,528590,528590,528590
1999,532133,532133,532133,532133,532133,532133,532133,532133,532133
2000,531822,531822,531822,531822,531822,531822,531822,531822,531822
2001,525842,525842,525842,525842,525842,525842,525842,525842,525842
2002,529552,529552,529552,529552,529552,529552,529552,529552,529552
2003,5796,5796,5796,5796,5796,5796,5796,5796,5796
