## Exploratory Data Analysis of Shoprite Data

In [1]:
import numpy as np  # mathematical functions
import pandas as pd  # data analysis
import matplotlib as plt  # data visualization

%matplotlib inline

## Load our Data

In [3]:
df = pd.read_csv('shop_rite_messy_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,names,prices,sku,categories,tag
0,0,Chewy Caramel Alpenliebe 6.5G,₦34.99,Candy Milk Chewy Caramel Alpenliebe 6.5G,Category: Confectionery and Snacks,Tag: Candy
1,1,Milk Chewy Caramel Alpenliebe 6.5G,₦34.99,Milk Chewy Caramel Alpenliebe 6.5G,Category: Candy & bubble Gum,Tag: Alpenliebe
2,2,Seasoning Powder Jollof Maggi 8G,₦52.99,Seasoning Powder Jollof Maggi 8G,"Category: Condiments, Oils & Spices",Tag: Seasoning
3,3,Seasoning Powder Chicken Maggi 10G,₦52.99,Seasoning Powder Chicken Maggi 10G,"Category: Condiments, Oils & Spices",Tag: Seasoning
4,4,Chocolate Slab Milk Dune 5G,₦54.99,Chocolate Slab Milk Dune 5G,Category: Chocolate,Tag: Chocolate


## understand the Data

In [6]:
# drop the unnamed : o column
df.drop(columns ='Unnamed: 0',inplace = True)

In [7]:
#after dropping the columns
df.columns

Index(['names', 'prices', 'sku', 'categories', 'tag'], dtype='object')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   names       100 non-null    object
 1   prices      100 non-null    object
 2   sku         100 non-null    object
 3   categories  100 non-null    object
 4   tag         100 non-null    object
dtypes: object(5)
memory usage: 4.0+ KB


##  Clean the price column

Remove the currency symbol

In [10]:
df['prices'] = df['prices'].str.replace('₦','').astype(float)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   names       100 non-null    object 
 1   prices      100 non-null    float64
 2   sku         100 non-null    object 
 3   categories  100 non-null    object 
 4   tag         100 non-null    object 
dtypes: float64(1), object(4)
memory usage: 4.0+ KB


### Clean the category Column

In [12]:
df['categories'] = df['categories'].str.replace('category: ', '')

### Clean the tag column

In [13]:
df['tag'] = df['tag'].str.replace('tag: ', '')

In [15]:
df.head()

Unnamed: 0,names,prices,sku,categories,tag
0,Chewy Caramel Alpenliebe 6.5G,34.99,Candy Milk Chewy Caramel Alpenliebe 6.5G,Category: Confectionery and Snacks,Tag: Candy
1,Milk Chewy Caramel Alpenliebe 6.5G,34.99,Milk Chewy Caramel Alpenliebe 6.5G,Category: Candy & bubble Gum,Tag: Alpenliebe
2,Seasoning Powder Jollof Maggi 8G,52.99,Seasoning Powder Jollof Maggi 8G,"Category: Condiments, Oils & Spices",Tag: Seasoning
3,Seasoning Powder Chicken Maggi 10G,52.99,Seasoning Powder Chicken Maggi 10G,"Category: Condiments, Oils & Spices",Tag: Seasoning
4,Chocolate Slab Milk Dune 5G,54.99,Chocolate Slab Milk Dune 5G,Category: Chocolate,Tag: Chocolate


## Find fields with Null

In [None]:
def replace-null(col):
    if (df[col] == 'Null'