# Order Items Dataset
This dataset includes data about the items purchased within each order.

## Initial Column Description


|**Column Title**|**order_id -> str** |**order_item_id -> int** |**product_id -> str** |**seller_id -> str**| **shipping_limit_date -> timestamp**|**price -> float** |**freight_value -> float**|
|--|--|--|--|--|--|--|--|
|Description |Unique identifier of the order |Sequential number identifying each item within the same order |Unique identifier of the product |Unique identifier of the seller |Seller's deadline for handing the order over to the logistics partner |Item price |Item freight value (if an order has more than one item, the freight value is split between the items) |
|Example |00010242fe8c5a6d1ba2dd792cb16214 |1 |4244733e06e7ecb4970a6e2683c13e61 |48436dade18ac8b2bce089ec2a041202 |2017-09-19 09:45:35 |58.90 |13.29 |

### Errors found
+ For this table, the raw data didn't contain null or empty values.
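+ City names contain variations and special characters (note: the order-items table profiled in this notebook has no city column, so this item appears to be carried over from another dataset's notebook — TODO confirm), like: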
    + "santana do livramento" / "sant ana do livramento"
    + "varre-sai", "xique-xique"
    + "jaragua do sul" / "jaragua d sul" / "jaragua da sul"

## Required Libraries

In [9]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Preprocessing


No additional preprocessing information is documented in this notebook.



### Data Correction

In [10]:
# Folder that holds the raw CSV files; the path is relative, so it is resolved
# against the directory the notebook kernel is running in.
dataset_path = "../../data/raw/" 

In [11]:
# List every entry found in the raw-data folder and print it as a bullet list
# under a one-line header that states how many entries there are.
files = os.listdir(dataset_path)
print('\n'.join([f'The dataset contains {len(files)} files:'] + [f'    * {file}' for file in files]))

The dataset contains 9 files:
    * olist_customers_dataset.csv
    * olist_geolocation_dataset.csv
    * olist_orders_dataset.csv
    * olist_order_items_dataset.csv
    * olist_order_payments_dataset.csv
    * olist_order_reviews_dataset.csv
    * olist_products_dataset.csv
    * olist_sellers_dataset.csv
    * product_category_name_translation.csv


In [12]:
def tables_info(df):
    """Build a one-row-per-column profile of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to profile.

    Returns
    -------
    pandas.DataFrame
        Report with a fresh positional index and the columns ``column_name``,
        ``dtype``, ``rows_count``, ``non_null_rows_count``, ``null_row_count``
        and ``unique_value_count`` (in that order).
    """
    # The report starts empty and is indexed by the source column labels so
    # that every metric can be written with a label-based .loc assignment.
    report = pd.DataFrame(data=[], index=df.columns)
    report.loc[:, 'column_name'] = df.columns

    for name in df.columns:
        series = df[name]
        metrics = {
            'dtype': series.dtypes,
            'rows_count': series.shape[0],
            'non_null_rows_count': series.notnull().sum(),
            'null_row_count': series.isna().sum(),
            # unique() keeps a missing value as one distinct entry.
            'unique_value_count': series.unique().shape[0],
        }
        # Dict order is insertion order, which fixes the report's column order.
        for metric, value in metrics.items():
            report.loc[name, metric] = value

    # The labels already live in 'column_name'; hand back a 0..n-1 index.
    return report.reset_index(drop=True)

In [13]:
def data_subtype(df, column):
    """Infer a SQL-like subtype for ``df[column]`` and pick a random sample value.

    Only non-null values are inspected. Columns of ``object`` dtype are first
    tried as date/time; if parsing fails they are classified by string length:
    ``CHAR(n)`` when every value has the same length ``n``, ``TEXT()`` when the
    longest value exceeds 255 characters, and ``VARCHAR(255)`` otherwise. Any
    other column keeps its pandas dtype as the subtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains ``column``.
    column : hashable
        Label of the column to analyse.

    Returns
    -------
    list
        ``[subtype, example]`` where ``subtype`` is a dtype or one of the
        strings described above and ``example`` is a randomly chosen non-null
        value, or ``None`` when the column has no non-null values.
    """
    # Null entries carry no type information, so they are removed up front.
    values = df.loc[df[column].notnull(), column].reset_index(drop=True)

    if values.dtype == 'object':
        try:
            # Date/time candidate: a successful parse means the column holds
            # timestamps. `infer_datetime_format` is intentionally not passed:
            # it is deprecated since pandas 2.0, where consistent-format
            # inference is already the default behaviour.
            subtype = pd.to_datetime(values).dtype
        except Exception:
            # Not parseable as date/time -> treat the values as strings.
            # `Exception` (rather than a bare except) keeps KeyboardInterrupt
            # and SystemExit from being swallowed.
            lengths = values.str.len()
            distinct_lengths = lengths.unique()
            if distinct_lengths.shape[0] == 1:
                # Every value has the same length -> fixed-width type.
                subtype = f"CHAR({distinct_lengths[0]})"
            elif lengths.max() > 255:
                subtype = 'TEXT()'
            else:
                subtype = "VARCHAR(255)"
    else:
        # Numeric (and any other non-object) columns keep their pandas dtype.
        subtype = values.dtype

    # random.randrange excludes the upper bound; random.randint(0, n) could
    # return n, which is one past the last valid position.
    n_values = len(values)
    example = values.iloc[random.randrange(n_values)] if n_values else None
    return [subtype, example]

In [14]:
def tables_info2(df):
    """Build a per-column schema report with inferred subtype and key flags.

    For every column the report records its pandas dtype, the subtype and
    sample value returned by ``data_subtype``, the number of distinct values,
    and a set of boolean flags:

    * ``is_not_null``    - the column contains no null value;
    * ``is_unique``      - the column contains no repeated value;
    * ``is_id``          - the column name contains ``'_id'``;
    * ``is_primary_key`` - ``is_id`` and ``is_not_null`` and ``is_unique``;
    * ``is_foreign_key`` - ``is_id`` but not ``is_primary_key``.

    The key flags are name-based heuristics, not constraints checked against
    other tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to profile; column labels are expected to be strings.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with a fresh positional index and the
        columns ``column_name``, ``dtype``, ``subtype``, ``example``,
        ``unique_value_count``, ``is_not_null``, ``is_unique``,
        ``is_primary_key``, ``is_foreign_key`` and ``is_id``.
    """
    info_df = pd.DataFrame(data=[], index=df.columns)
    info_df.loc[:, 'column_name'] = df.columns

    for column in df.columns:
        series = df[column]
        subtype, example = data_subtype(df, column)

        # Flags are computed once as plain Python bools and combined with
        # `not`/`and`. Reading them back from the frame and applying `~`/`&`
        # is only correct while the stored values are numpy bools: `~True` on
        # a Python bool is -2 (and deprecated since Python 3.12).
        is_id = '_id' in column
        is_not_null = not series.isnull().any()
        is_unique = series.is_unique
        is_primary_key = is_id and is_not_null and is_unique
        is_foreign_key = is_id and not is_primary_key

        # Assignment order defines the column order of the report.
        info_df.loc[column, 'dtype'] = series.dtypes
        info_df.loc[column, 'subtype'] = subtype
        info_df.loc[column, 'example'] = example
        info_df.loc[column, 'unique_value_count'] = series.unique().shape[0]
        info_df.loc[column, 'is_not_null'] = is_not_null
        info_df.loc[column, 'is_unique'] = is_unique
        info_df.loc[column, 'is_primary_key'] = is_primary_key
        info_df.loc[column, 'is_foreign_key'] = is_foreign_key
        info_df.loc[column, 'is_id'] = is_id

    info_df.reset_index(drop=True, inplace=True)
    return info_df

In [15]:
# Read the raw order-items CSV from the raw-data folder into `df`.
# `df` is the unfiltered input that the cleaning step below starts from.
csv_file_name = 'olist_order_items_dataset.csv'
csv_file_path = os.path.join(dataset_path, csv_file_name)
df = pd.read_csv(csv_file_path)

### Getting cleaned dataset

In [16]:
# Read the interim unique-ids file and keep only its 'order_id' column as a
# plain Python list; these are the order ids the cleaned table may contain.
unique_ids = pd.read_csv('../../data/interim/orders_items_dataset_unique_ids.csv')['order_id'].to_list()

In [17]:
# Build the cleaned table in one chain:
#   1. keep only the "allowed" rows, i.e. those whose order_id is in unique_ids;
#   2. add the new PK, the composite key '<order_id>_<order_item_id>';
#   3. put that key first and fix the remaining column order.
df_clean = (
    df.loc[df['order_id'].isin(unique_ids)]
    .reset_index(drop=True)
    .assign(
        order_id_order_item_id=lambda kept: kept['order_id'] + '_' + kept['order_item_id'].astype(str)
    )
    .loc[:, ['order_id_order_item_id', 'order_id', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', 'freight_value']]
)
# Persist the cleaned table to the interim data folder.
df_clean.to_csv('../../data/interim/order_items_dataset_clean.csv', sep=',', index=False, encoding='utf-8-sig')

In [18]:
# Per-column profile of the cleaned table: dtype, row count, null / non-null
# counts and number of distinct values.
info_df = tables_info(df_clean)
info_df

Unnamed: 0,column_name,dtype,rows_count,non_null_rows_count,null_row_count,unique_value_count
0,order_id_order_item_id,object,112647.0,112647.0,0.0,112647.0
1,order_id,object,112647.0,112647.0,0.0,98665.0
2,order_item_id,int64,112647.0,112647.0,0.0,21.0
3,product_id,object,112647.0,112647.0,0.0,32951.0
4,seller_id,object,112647.0,112647.0,0.0,3095.0
5,shipping_limit_date,object,112647.0,112647.0,0.0,93317.0
6,price,float64,112647.0,112647.0,0.0,5968.0
7,freight_value,float64,112647.0,112647.0,0.0,6999.0


In [19]:
# Extended profile of the cleaned table: inferred subtype, a sample value and
# the name-based key flags.
# NOTE: this rebinds `info_df`, replacing the tables_info() result shown earlier.
info_df = tables_info2(df_clean)
info_df

Unnamed: 0,column_name,dtype,subtype,example,unique_value_count,is_not_null,is_unique,is_primary_key,is_foreign_key,is_id
0,order_id_order_item_id,object,VARCHAR(255),d5cb76792b06c2a00edb03001d5f52ad_1,112647.0,True,True,True,False,True
1,order_id,object,CHAR(32),55a1a4bc56ccf7a49bc62bcd975d3162,98665.0,True,False,False,True,True
2,order_item_id,int64,int64,1,21.0,True,False,False,True,True
3,product_id,object,CHAR(32),7f457254a89d62960399e075711b3deb,32951.0,True,False,False,True,True
4,seller_id,object,CHAR(32),a17f621c590ea0fab3d5d883e1630ec6,3095.0,True,False,False,True,True
5,shipping_limit_date,object,datetime64[ns],2018-08-10 01:10:09,93317.0,True,False,False,False,False
6,price,float64,float64,130.0,5968.0,True,False,False,False,False
7,freight_value,float64,float64,9.34,6999.0,True,False,False,False,False
