# Analyze data

# 1. Imports

## 1.1 Packages

In [1]:
import numpy as np
import pandas as pd

import plotly.express as px

## 1.2 Options

## 1.3 Dataset

In [2]:
# Load the raw CSV tables; paths are relative to the notebook's working directory.
df_train = pd.read_csv("../data/01_raw/train.csv")
df_stores = pd.read_csv("../data/01_raw/stores.csv")

# Keep transactions ordered by store, then by date within each store.
# NOTE(review): "date" has not been parsed yet at this point, so the ordering
# is on the raw column values as read from the file.
df_transactions = pd.read_csv("../data/01_raw/transactions.csv")
df_transactions = df_transactions.sort_values(by=["store_nbr", "date"])

df_oil = pd.read_csv("../data/01_raw/oil.csv")

# 2. Prepare data

In [3]:
# Convert the "date" columns to datetime64 so that the .dt accessor and the
# date-keyed merge further down operate on real timestamps.
df_train["date"] = pd.to_datetime(df_train["date"])
df_transactions["date"] = pd.to_datetime(df_transactions["date"])

In [4]:
# Random 5-row preview of the training table. The seed is fixed so the rows
# shown are the same after every Restart Kernel -> Run All.
df_train.sample(5, random_state=0)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
2411643,2411643,2016-09-18,26,BEVERAGES,1109.0,38
2706547,2706547,2017-03-03,5,LADIESWEAR,4.0,0
318604,318604,2013-06-28,48,"LIQUOR,WINE,BEER",86.0,0
2845378,2845378,2017-05-20,45,LADIESWEAR,38.0,0
1807451,1807451,2015-10-14,23,DAIRY,434.0,4


In [5]:
# Random 5-row preview of the transactions table. The seed is fixed so the
# rows shown are the same after every Restart Kernel -> Run All.
df_transactions.sample(5, random_state=0)

Unnamed: 0,date,store_nbr,transactions
49198,2015-10-31,40,1374
54503,2016-02-12,48,2760
75066,2017-03-12,18,1361
11723,2013-09-11,37,1449
51385,2015-12-12,1,1463


In [6]:
# Attach the per-store daily transaction count to every training row.
# - how="left" keeps all training rows; (date, store_nbr) pairs without a
#   transactions record end up with NaN in "transactions".
# - validate="many_to_one" raises pandas.errors.MergeError if df_transactions
#   holds duplicate (date, store_nbr) keys, which would otherwise silently
#   duplicate training rows.
df_temp = df_train.merge(
    df_transactions,
    on=["date", "store_nbr"],
    how="left",
    validate="many_to_one",
)
df_temp

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,transactions
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,
1,1,2013-01-01,1,BABY CARE,0.000,0,
2,2,2013-01-01,1,BEAUTY,0.000,0,
3,3,2013-01-01,1,BEVERAGES,0.000,0,
4,4,2013-01-01,1,BOOKS,0.000,0,
...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,2155.0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,2155.0
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,2155.0
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,2155.0


In [7]:
# Parse dates so the price series can be resampled on a DatetimeIndex.
df_oil["date"] = pd.to_datetime(df_oil["date"])

# Resample to a continuous daily calendar.
# min_count=1 makes a day without any valid quote come out as NaN directly.
# The previous approach summed such days to 0 and then replaced every 0 with
# NaN, which would also have erased a genuine 0 value.
df_oil = (
    df_oil.set_index("date")["dcoilwtico"]
    .resample("D")
    .sum(min_count=1)
    .reset_index()
)

# Fill the gaps by linear interpolation. pandas only fills forward by
# default, so NaNs before the first valid quote remain NaN.
df_oil["dcoilwtico_interpolated"] = df_oil["dcoilwtico"].interpolate()

# 3. Analyze data

## 3.1 Transactions

In [8]:
# px.line(df_transactions.sort_values(["store_nbr", "date"]), x='date', y='transactions', color='store_nbr',title = "Transactions" )

In [9]:
# Work on a copy so the calendar helper columns are not added to
# df_transactions itself.
df_tp = df_transactions.copy()
df_tp["year"] = df_tp["date"].dt.year
df_tp["month"] = df_tp["date"].dt.month

# Distribution of transactions per year, with one box per month.
fig = px.box(df_tp, x="year", y="transactions", color="month", title="Transactions")
# A bare assignment produces no cell output; end the cell with the figure so
# the plot is actually rendered.
fig

In [10]:
# Sales against transactions for the merged frame, with an OLS trend line
# drawn in red.
# NOTE(review): the figure is only assigned, not displayed; df_temp has
# roughly 3M rows, so rendering it may be heavy - confirm whether that is
# intentional.
fig = px.scatter(
    df_temp,
    x="transactions",
    y="sales",
    trendline="ols",
    trendline_color_override="red",
)

In [11]:
# Reshape to long format: one row per (date, series), so the raw and the
# interpolated prices can be drawn as two coloured lines.
# The columns are named explicitly; the earlier `df_oil.keys()[5:]` slice was
# always empty for this three-column frame and only obscured the intent.
p = df_oil.melt(
    id_vars=["date"],
    value_vars=["dcoilwtico", "dcoilwtico_interpolated"],
    var_name="Legend",
)
# Order by series name descending, then chronologically within each series.
fig = px.line(
    p.sort_values(["Legend", "date"], ascending=[False, True]),
    x="date",
    y="value",
    color="Legend",
    title="Daily Oil Price",
)
# A bare assignment produces no cell output; end the cell with the figure so
# the plot is actually rendered.
fig