## ETL layer

### **Description:**

- Create an ETL layer based on DQC

In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt

import sys
sys.path.append('../')
import scripts.etl as etl # etl.py module
import scripts.dqc as dqc # for "check_negative_values" function

## 1. Change dtypes for **df_train** columns

### Load all necessary data into dataframes at first

In [3]:
train_df = pd.read_csv('../data/sales_train.csv')
test_df = pd.read_csv('../data/test.csv')
items_df = pd.read_csv('../data/items.csv')
categories_df = pd.read_csv('../data/item_categories.csv')
shops_df = pd.read_csv('../data/shops.csv')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


As it was mentioned earlier, all numerical data can be safely put into int/float-32 dtypes. Moreover, all **item_cnt_day** values are actually integers as well as **date** feature should be of 'datetime' type

In [5]:
train_df['date'] = pd.to_datetime(train_df['date'], format='%d.%m.%Y')

int_columns = ['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']
float_columns = ['item_price']

train_df = etl.transform_df_types(train_df, int_columns, float_columns)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   date            datetime64[ns]
 1   date_block_num  int32         
 2   shop_id         int32         
 3   item_id         int32         
 4   item_price      float32       
 5   item_cnt_day    int32         
dtypes: datetime64[ns](1), float32(1), int32(4)
memory usage: 78.4 MB


### Do the same for other dataframes' integer columns

In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   ID       214200 non-null  int64
 1   shop_id  214200 non-null  int64
 2   item_id  214200 non-null  int64
dtypes: int64(3)
memory usage: 4.9 MB


In [7]:
test_df = etl.transform_df_types(test_df, int_columns=test_df.columns.to_list())

In [8]:
items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


In [9]:
items_df = etl.transform_df_types(items_df, int_columns=['item_id', 'item_category_id'], object_columns=['item_name'])

In [10]:
categories_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   item_category_name  84 non-null     object
 1   item_category_id    84 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


In [11]:
categories_df = etl.transform_df_types(categories_df, int_columns=['item_category_id'], object_columns=['item_category_name'])

In [12]:
shops_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   shop_name  60 non-null     object
 1   shop_id    60 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ KB


In [13]:
shops_df = etl.transform_df_types(shops_df, int_columns=['shop_id'], object_columns=['shop_name'])

## 2. Delete rows with negative values

As we already now from the DQC Layer, 'item_price' feature has one negative value that should be deleted. 

In [14]:
dqc.check_negative_values(train_df, 'item_price')

3.406169731481422e-05 percent of values are negative


In [15]:
train_df = etl.del_negative(train_df, 'item_price')

### Check for negative values again

In [16]:
dqc.check_negative_values(train_df, 'item_price')

No negative values found


In [17]:
train_df.reset_index(drop=True, inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935848 entries, 0 to 2935847
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   date            datetime64[ns]
 1   date_block_num  int32         
 2   shop_id         int32         
 3   item_id         int32         
 4   item_price      float32       
 5   item_cnt_day    int32         
dtypes: datetime64[ns](1), float32(1), int32(4)
memory usage: 78.4 MB


## 3. Fix **shops_df** duplicate data 

In [18]:
shops_df = etl.change_shop_attributes(shops_df)

## 3. Create a copy of **df_train** to aggregate monthly sales

In [19]:
train_aggregated = train_df.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': 'sum', 'item_price': 'mean'}).reset_index()

train_aggregated.rename(columns={'item_cnt_day': 'item_cnt_month'}, inplace=True)
train_aggregated.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price
0,0,0,32,6,221.0
1,0,0,33,3,347.0
2,0,0,35,1,247.0
3,0,0,43,1,221.0
4,0,0,51,2,128.5


## 4. Add **year** and **month** columns to received datasets

In [20]:
train_df = etl.add_month_year_columns(train_df)
train_aggregated = etl.add_month_year_columns(train_aggregated)

train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,month,year
0,2013-01-02,0,59,22154,999.0,1,0,0
1,2013-01-03,0,25,2552,899.0,1,0,0
2,2013-01-05,0,25,2552,899.0,-1,0,0
3,2013-01-06,0,25,2554,1709.050049,1,0,0
4,2013-01-15,0,25,2555,1099.0,1,0,0


## 5. Merge **train_df** with **items_df, categories_df, shops_df**

In [21]:
merged_train_df = etl.merge_df(train_df, items_df, categories_df, shops_df)
merged_train_aggregated_df = etl.merge_df(train_aggregated, items_df, categories_df, shops_df)

merged_train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,month,year,item_name,item_category_id,item_category_name,shop_name
0,2013-01-02,0,59,22154,999.0,1,0,0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
1,2013-01-03,0,25,2552,899.0,1,0,0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
2,2013-01-05,0,25,2552,899.0,-1,0,0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
3,2013-01-06,0,25,2554,1709.050049,1,0,0,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
4,2013-01-15,0,25,2555,1099.0,1,0,0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства,"Москва ТРК ""Атриум"""


In [22]:
merged_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3125216 entries, 0 to 3125215
Data columns (total 12 columns):
 #   Column              Dtype         
---  ------              -----         
 0   date                datetime64[ns]
 1   date_block_num      int32         
 2   shop_id             int32         
 3   item_id             int32         
 4   item_price          float32       
 5   item_cnt_day        int32         
 6   month               int32         
 7   year                int32         
 8   item_name           category      
 9   item_category_id    int32         
 10  item_category_name  category      
 11  shop_name           category      
dtypes: category(3), datetime64[ns](1), float32(1), int32(7)
memory usage: 131.8 MB


In [23]:
merged_train_aggregated_df.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price,month,year,item_name,item_category_id,item_category_name,shop_name
0,0,0,32,6,221.0,0,0,1+1,40,Кино - DVD,
1,0,0,33,3,347.0,0,0,1+1 (BD),37,Кино - Blu-Ray,
2,0,0,35,1,247.0,0,0,10 ЛЕТ СПУСТЯ,40,Кино - DVD,
3,0,0,43,1,221.0,0,0,100 МИЛЛИОНОВ ЕВРО,40,Кино - DVD,
4,0,0,51,2,128.5,0,0,100 лучших произведений классики (mp3-CD) (Dig...,57,Музыка - MP3,


In [24]:
merged_train_aggregated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1707421 entries, 0 to 1707420
Data columns (total 11 columns):
 #   Column              Dtype   
---  ------              -----   
 0   date_block_num      int32   
 1   shop_id             int32   
 2   item_id             int32   
 3   item_cnt_month      int32   
 4   item_price          float32 
 5   month               int32   
 6   year                int32   
 7   item_name           category
 8   item_category_id    int32   
 9   item_category_name  category
 10  shop_name           category
dtypes: category(3), float32(1), int32(7)
memory usage: 59.3 MB


## 6. Do the same for **test_df**

In [25]:
merged_test_df = etl.merge_df(test_df, items_df, categories_df, shops_df)
merged_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224400 entries, 0 to 224399
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   ID                  224400 non-null  int32   
 1   shop_id             224400 non-null  int32   
 2   item_id             224400 non-null  int32   
 3   item_name           224400 non-null  category
 4   item_category_id    224400 non-null  int32   
 5   item_category_name  224400 non-null  category
 6   shop_name           219300 non-null  category
dtypes: category(3), int32(4)
memory usage: 5.0 MB


## 7. Export dataframes to .csv files

In [26]:
merged_test_df.to_csv('../data/merged_test.csv', index=False)
merged_train_df.to_csv('../data/merged_train.csv', index=False)
merged_train_aggregated_df.to_csv('../data/merged_train_aggregated.csv', index=False)