# Analyze data

# 1. Imports

## 1.1 Packages

In [1]:
import numpy as np
import pandas as pd

import plotly.express as px

## 1.2 Options

## 1.3 Dataset

In [15]:
# Load the raw CSV tables from the project's raw-data folder.
df_train = pd.read_csv("../data/01_raw/train.csv")
df_stores = pd.read_csv("../data/01_raw/stores.csv")
df_oil = pd.read_csv("../data/01_raw/oil.csv")
df_holidays = pd.read_csv("../data/01_raw/holidays_events.csv")

# Transactions are ordered by store, then by date, right after loading.
# No date parsing is requested here, so "date" is sorted as read from the file.
df_transactions = pd.read_csv("../data/01_raw/transactions.csv")
df_transactions = df_transactions.sort_values(by=["store_nbr", "date"])

# 2. Prepare data

In [3]:
# Convert the "date" column of the training and transaction frames to
# pandas datetimes so both sides carry the same dtype on that column.
for frame in (df_train, df_transactions):
    frame["date"] = pd.to_datetime(frame["date"])

In [4]:
# Show five randomly chosen training rows (unseeded, so the rows change on every run).
df_train.sample(n=5)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
2658576,2658576,2017-02-04,53,PRODUCE,2518.7969,4
1763009,1763009,2015-09-19,26,HOME APPLIANCES,0.0,0
2971150,2971150,2017-07-30,24,POULTRY,599.02,0
545621,545621,2013-11-03,18,SEAFOOD,5.687,0
115506,115506,2013-03-06,5,CELEBRATION,0.0,0


In [None]:
# Show five randomly chosen transaction rows (unseeded, so the rows change on every run).
df_transactions.sample(n=5)

Unnamed: 0,date,store_nbr,transactions
54439,2016-02-11,37,1542
24347,2014-06-10,12,1143
18033,2014-01-25,49,2778
19273,2014-02-21,15,1202
48388,2015-10-16,25,871


In [14]:
# Show five randomly chosen store rows (unseeded, so the rows change on every run).
df_stores.sample(n=5)

Unnamed: 0,store_nbr,city,state,type,cluster
48,49,Quito,Pichincha,A,11
50,51,Guayaquil,Guayas,A,17
53,54,El Carmen,Manabi,C,3
9,10,Quito,Pichincha,C,15
35,36,Libertad,Guayas,E,10


In [6]:
# Show five randomly chosen oil-price rows (unseeded, so the rows change on every run).
df_oil.sample(n=5)

Unnamed: 0,date,dcoilwtico
655,2015-07-07,52.33
429,2014-08-25,95.39
52,2013-03-14,93.03
1109,2017-04-03,50.25
873,2016-05-06,44.58


In [16]:
# Show five randomly chosen holiday/event rows (unseeded, so the rows change on every run).
df_holidays.sample(n=5)

Unnamed: 0,date,type,locale,locale_name,description,transferred
241,2016-05-06,Event,National,Ecuador,Terremoto Manabi+20,False
30,2012-12-08,Holiday,Local,Loja,Fundacion de Loja,False
73,2013-10-11,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
326,2017-08-15,Holiday,Local,Riobamba,Fundacion de Riobamba,False
314,2017-06-23,Holiday,Local,Guaranda,Cantonizacion de Guaranda,False


In [7]:
# Attach the transaction count to every training row by matching on
# (date, store_nbr). A left join keeps all training rows; rows with no
# matching transaction record get NaN in "transactions".
df_temp = pd.merge(
    df_train,
    df_transactions,
    how="left",
    on=["date", "store_nbr"],
)
df_temp

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,transactions
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,
1,1,2013-01-01,1,BABY CARE,0.000,0,
2,2,2013-01-01,1,BEAUTY,0.000,0,
3,3,2013-01-01,1,BEVERAGES,0.000,0,
4,4,2013-01-01,1,BOOKS,0.000,0,
...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,2155.0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,2155.0
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,2155.0
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,2155.0


In [8]:
# Parse the date strings so the series can be resampled on a DatetimeIndex.
df_oil["date"] = pd.to_datetime(df_oil.date)

# Resample to a continuous daily calendar.
# mean() leaves days without a quote as NaN. The previous sum() turned those
# days into 0 and needed a second pass mapping 0 back to NaN, which would also
# have discarded a genuine 0.0 quote and summed any duplicated dates.
df_oil = df_oil.set_index("date").dcoilwtico.resample("D").mean().reset_index()

# Fill the gaps between known quotes by interpolating (pandas default method).
# With interpolate()'s default arguments, NaNs before the first valid quote
# are not filled.
df_oil["dcoilwtico_interpolated"] = df_oil.dcoilwtico.interpolate()

# 3. Analyze data

## 3.1 Transactions

In [9]:
# px.line(df_transactions.sort_values(["store_nbr", "date"]), x='date', y='transactions', color='store_nbr',title = "Transactions" )

In [10]:
# Box plot of the transaction counts per year, split by month.
# assign() returns a new frame with the calendar columns added, so
# df_transactions itself is left unmodified and no temporary needs deleting.
fig = px.box(
    df_transactions.assign(
        year=df_transactions["date"].dt.year,
        month=df_transactions["date"].dt.month,
    ),
    x="year",
    y="transactions",
    color="month",
    title="Transactions",
)

In [11]:
# Scatter of sales against the store's transaction count for the same day,
# with an ordinary-least-squares trendline drawn in red.
# NOTE: plotly's "ols" trendline relies on the statsmodels package.
fig = px.scatter(
    df_temp,
    x="transactions",
    y="sales",
    trendline="ols",
    trendline_color_override="red",
)

In [12]:
# Reshape the oil frame to long form: one row per (date, series), where
# "Legend" names the series (raw or interpolated price) and "value" holds it.
# The id/value columns are listed explicitly; the earlier
# `list(df_oil.keys()[5:])` was always empty for this three-column frame.
oil_long = df_oil.melt(
    id_vars=["date"],
    value_vars=["dcoilwtico", "dcoilwtico_interpolated"],
    var_name="Legend",
)

# Sort by series name descending and date ascending before drawing, so each
# line is plotted in chronological order.
fig = px.line(
    oil_long.sort_values(["Legend", "date"], ascending=[False, True]),
    x="date",
    y="value",
    color="Legend",
    title="Daily Oil Price",
)

# Drop the temporary long-form frame.
del oil_long

In [13]:
# Total sales per store per calendar day: within each store the rows are
# resampled to daily bins on the date index and the "sales" column is summed.
daily_store_sales = (
    df_train
    .set_index("date")
    .groupby("store_nbr")
    .resample("D")
    .sales
    .sum()
    .reset_index()
)

# One line per store.
fig = px.line(
    daily_store_sales,
    x="date",
    y="sales",
    color="store_nbr",
    title="Daily total sales of the stores",
)

# Drop the temporary aggregate.
del daily_store_sales