# Data Visualization With Python - Part 1

## Steps used in this script

#### 1. Create customers Data Frame
#### 2. Cleaning customers Data Frame
#### 3. Create ords_prods_merged Data Frame
#### 4. Cleaning ords_prods_merged Data Frame
#### 5. Merging Data Frames to create instacart_data Data Frame
#### 6. Cleaning and optimizing data types in instacart_data Data Frame
#### 7. Saving instacart_data Data Frame to .pkl

&nbsp;
&nbsp;

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

#### 1. Create customers Data Frame

In [2]:
# Project root folder; every data path in this notebook is built relative to it
path = r"C:\Users\David\Desktop\CareerFoundry\Achievement 4\Instacart Basket Analysis"

# Reading the original customers file into a Data Frame
customers_path = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
customers = pd.read_csv(customers_path)

ParserError: Error tokenizing data. C error: out of memory

#### 2. Cleaning customers Data Frame

In [None]:
# Previewing the first rows of customers to verify which columns were loaded

customers.head()

In [None]:
# Renaming columns in customers Data Frame to consistent lower-case names
# NOTE(review): rename() silently ignores keys that are not present, so 'Surnam'
# must match the header exactly as spelled in customers.csv - confirm

column_renames = {
    'First Name': 'first_name',
    'Surnam': 'last_name',
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
    'n_dependants': 'dependants',
}
customers.rename(columns=column_renames, inplace=True)

In [None]:
# Verifying column name changes by previewing the first rows again

customers.head()

In [None]:
# Checking the data type pandas assigned to each column in customers Data Frame

customers.dtypes

In [None]:
# Checking for unusual values in customers Data Frame
# (summary statistics such as count, mean, min and max for the numeric columns)

customers.describe()

In [None]:
# Counting missing values per column in customers Data Frame

customers.isna().sum()

In [None]:
# Checking for mixed data types within each column of customers Data Frame.
# A column name is printed when its values do not all share one Python type.
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated
# in recent pandas releases; counting distinct types also avoids indexing the
# first row, so an empty Data Frame simply prints nothing.

for col in customers.columns:
    value_types = customers[col].map(type)
    if value_types.nunique() > 1:
        print(col)

In [None]:
# Removing the name columns from customers Data Frame: they are personally
# identifying information and are not needed for the analysis

customers.drop(columns=['first_name', 'last_name'], inplace=True)

In [None]:
# Verifying that the name columns have been removed

customers.head()

In [None]:
# Checking for duplicates in customers Data Frame
# (False = row is unique so far, True = row repeats an earlier row)

customers.duplicated().value_counts()

#### 3. Create ords_prods_merged Data Frame

In [None]:
# Loading the merged orders/products pickle from the 'Prepared Data' folder
# NOTE(review): unpickling can execute arbitrary code - only load files you created yourself
merged_pkl_path = os.path.join(path, '02 Data', 'Prepared Data', 'merged_prods_ords_flagged.pkl')
ords_prods_merged = pd.read_pickle(merged_pkl_path)


#### 4. Cleaning ords_prods_merged Data Frame

In [None]:
# Checking for duplicates in ords_prods_merged Data Frame
# (False = row is unique so far, True = row repeats an earlier row)

ords_prods_merged.duplicated().value_counts()

In [None]:
# Displaying the rows of ords_prods_merged that are flagged as duplicates

ords_prods_merged[ords_prods_merged.duplicated()]

In [None]:
# Spot-checking one duplicate: the order/product pair below was taken from the
# subset above, so more than one matching row confirms the duplication is real

ords_prods_merged.loc[
    (ords_prods_merged['order_id'] == 2564286)
    & (ords_prods_merged['product_id'] == 35306)
]

In [None]:
# Dropping duplicate rows in place (the first occurrence of each row is kept)

ords_prods_merged.drop_duplicates(inplace=True)

In [None]:
# Verifying that duplicates have been removed (expected result: no rows)

ords_prods_merged[ords_prods_merged.duplicated()]

In [None]:
# Verifying that duplicates have been removed (only a False count should remain)

ords_prods_merged.duplicated().value_counts()

#### 5. Merging Data Frames to create instacart_data Data Frame

In [None]:
# Merging customers Data Frame and ords_prods_merged Data Frame on 'user_id' column.
#
# how='inner' is the pandas default, spelled out here because it means rows
# without a match on either side are dropped and the '_merge' indicator can
# only contain 'both'.
# validate='many_to_one' makes pandas raise a MergeError if a user_id appears
# more than once in customers, instead of silently multiplying order rows.

instacart_data = ords_prods_merged.merge(
    customers,
    on='user_id',
    how='inner',
    validate='many_to_one',
    indicator=True,
)

In [None]:
# Checking merged Data Frame by previewing its first rows

instacart_data.head()

#### 6. Cleaning and optimizing data types in instacart_data Data Frame

In [None]:
# Verifying merged values by counting rows per '_merge' category
# NOTE(review): the merge that created this frame is an inner join (the pandas
# default), so only 'both' can appear here; unmatched rows were already dropped

instacart_data._merge.value_counts()

In [None]:
# Checking the shape (rows, columns) of the merged instacart_data Data Frame

instacart_data.shape

In [None]:
# Verifying data types of instacart_data Data Frame before optimizing them

instacart_data.dtypes

In [None]:
# Downcasting integer columns to the smallest signed integer type that can hold
# every value in the column (to save memory).
#
# Only true integer columns are touched. Float columns of any width, booleans,
# categoricals (such as '_merge') and object columns are left unchanged, so no
# values are truncated. pd.to_numeric(downcast='integer') takes both the minimum
# and the maximum of the column into account when picking int8/int16/int32/int64.

for col in instacart_data.columns:
    if pd.api.types.is_integer_dtype(instacart_data[col].dtype):
        instacart_data[col] = pd.to_numeric(instacart_data[col], downcast='integer')

In [None]:
# Verifying the data types of each column after running the for-loop

instacart_data.dtypes

In [None]:
# Verifying that no duplicate rows exist in merged Data Frame
# (only a False count is expected)

instacart_data.duplicated().value_counts()

In [None]:
# Dropping the '_merge' indicator column, which is no longer needed once the merge is verified

instacart_data.drop(columns='_merge', inplace=True)

In [None]:
# Verifying that the '_merge' column has been removed

instacart_data.head()

In [None]:
# Verifying new shape (rows, columns) of Data Frame after dropping the column

instacart_data.shape

#### 7. Saving instacart_data Data Frame to .pkl

In [None]:
# Writing the final instacart_data Data Frame to the 'Prepared Data' folder as a .pkl file

output_path = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_data1.pkl')
instacart_data.to_pickle(output_path)