# Data Exploration

In [1]:
import pandas as pd
import numpy as np
import json
import re
import html
import import_ipynb
import pickle

## Amazon Data

### Reading Amazon Data

In [2]:
AMAZON_DATA_FILE_PATH = "meta_Grocery_and_Gourmet_Food.json"
data = []
with open(AMAZON_DATA_FILE_PATH) as f:
    for line in f:
        data.append(json.loads(line.strip()))
amazon_data = pd.DataFrame.from_dict(data)
print(f"Number of columns: {amazon_data.shape[1]} | Number of records= {amazon_data.shape[0]}")
print(f"Columns in amazon dataset: {list(amazon_data.columns)}")

Number of columns: 19 | Number of records= 287051
Columns in amazon dataset: ['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details']


### Data Overview

In [3]:
amazon_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287051 entries, 0 to 287050
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   category         287051 non-null  object
 1   tech1            287051 non-null  object
 2   description      287051 non-null  object
 3   fit              287051 non-null  object
 4   title            287051 non-null  object
 5   also_buy         287051 non-null  object
 6   tech2            287051 non-null  object
 7   brand            287051 non-null  object
 8   feature          287051 non-null  object
 9   rank             287051 non-null  object
 10  also_view        287051 non-null  object
 11  main_cat         287051 non-null  object
 12  similar_item     287051 non-null  object
 13  date             287051 non-null  object
 14  price            287051 non-null  object
 15  asin             287051 non-null  object
 16  imageURL         287051 non-null  object
 17  imageURLHi

In [4]:
amazon_data.head(2)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Grocery & Gourmet Food, Dairy, Cheese & Eggs,...",,"[BEEMSTER GOUDA CHEESE AGED 18/24 MONTHS, Stat...",,Beemster Gouda - Aged 18/24 Months - App. 1.5 Lbs,[],,Ariola Imports,[],"165,181 in Grocery & Gourmet Food (","[B0000D9MYM, B0000D9MYL, B00ADHIGBA, B00H9OX59...",Grocery,,,$41.91,681727810,[],[],
1,"[Grocery & Gourmet Food, Cooking & Baking, Sug...",,"[Shipped from UK, please allow 10 to 21 busine...",,Trim Healthy Mama Xylitol,"[B01898YHXK, B01BCM6LAC, B00Q4OL47O, B00Q4OL5Q...",,,[],"315,867 in Grocery & Gourmet Food (",[],Grocery,,,,853347867,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


In [5]:
amazon_data["main_cat"].value_counts().reset_index().head(5)

Unnamed: 0,index,main_cat
0,Grocery,261845
1,Amazon Home,8953
2,Health & Personal Care,8543
3,Toys & Games,1934
4,All Beauty,1604


In [6]:
# Investigating primitive type of data
print(f"{'Column Name':<20} | {'Pandas Data Type':<20} | {'Primitive Type':<20}")
print("-"*60)
amazon_data.attrs = {}
amazon_data.attrs["dtype"] = {}
for col in amazon_data.columns:
    primitive_types = set([type(i) for i in amazon_data[col]])
    print(f"{col:<20} | {str(amazon_data[col].dtype):<20} | {str([t.__name__ for t in primitive_types]):<20}")
    amazon_data.attrs["dtype"][col] = primitive_types

Column Name          | Pandas Data Type     | Primitive Type      
------------------------------------------------------------
category             | object               | ['list']            
tech1                | object               | ['str']             
description          | object               | ['list']            
fit                  | object               | ['str']             
title                | object               | ['str']             
also_buy             | object               | ['list']            
tech2                | object               | ['str']             
brand                | object               | ['str']             
feature              | object               | ['list']            
rank                 | object               | ['str', 'list']     
also_view            | object               | ['list']            
main_cat             | object               | ['str']             
similar_item         | object               | ['str']             
d

In [7]:
# Setting empty lists and string NaN
for col in amazon_data.columns:
    if len(amazon_data.attrs["dtype"][col])>1:
        continue
    if list in amazon_data.attrs["dtype"][col]:
        amazon_data[col] = amazon_data[col].apply(lambda x: np.nan if len(x)==0 else x)
    elif str in amazon_data.attrs["dtype"][col]:
        amazon_data[col] = amazon_data[col].apply(lambda x: np.nan if len(x.strip())==0 else x)

In [8]:
amazon_data.isna().sum()

category                0
tech1              286307
description         35384
fit                287047
title                   3
also_buy           203757
tech2              287051
brand               11419
feature            270671
rank                    0
also_view          166113
main_cat             1363
similar_item       286795
date               277389
price              153193
asin                    0
imageURL           136901
imageURLHighRes    136901
details                24
dtype: int64

In [9]:
amazon_data["brand"].value_counts().head(5)

Unknown                  1780
Black Tie Mercantile     1458
Trader Joe's             1234
McCormick                1041
The Nutty Fruit House     998
Name: brand, dtype: int64

In [10]:
amazon_data["category"].value_counts().head(5)

[Grocery & Gourmet Food, Beverages, Coffee, Tea & Cocoa, Tea, Tea Samplers]                                        7609
[Grocery & Gourmet Food, Candy & Chocolate, Candy & Chocolate Bars]                                                7512
[Grocery & Gourmet Food, Beverages, Coffee, Tea & Cocoa, Coffee, Single-Serve Capsules & Pods]                     6745
[Grocery & Gourmet Food, Beverages, Coffee, Tea & Cocoa, Coffee, Ground Coffee]                                    5326
[Grocery & Gourmet Food, Cooking & Baking, Frosting, Icing & Decorations, Cake & Cupcake Toppers, Cake Toppers]    4821
Name: category, dtype: int64

In [11]:
for col in ["title", "description"]:
    special_chars = amazon_data[col].apply(lambda x: set(re.findall(r'&\w+', str(x))) if x else set()).aggregate({'Whitespace Characters': lambda x: set.union(*x)})[0]
    print(f"Sample of HTML encoded characters in column: '{col}'")
    print(f"{'HTML Encoded Char':>20} | {'Decoded Char':>20}")
    print("-"*50)
    for ch in list(special_chars)[:5]:
        print(f"{ch.lower(): >20} | {html.unescape(ch.lower()): >20}")
    print(f"{'TOTAL': >20} | {len(special_chars): >20}")    

Sample of HTML encoded characters in column: 'title'
   HTML Encoded Char |         Decoded Char
--------------------------------------------------
              &ldquo |               &ldquo
              &aring |                    å
               &ordf |                    ª
             &oslash |                    ø
                &eth |                    ð
               TOTAL |                  103
Sample of HTML encoded characters in column: 'description'
   HTML Encoded Char |         Decoded Char
--------------------------------------------------
               &gold |                &gold
                &ike |                 &ike
               &mash |                &mash
         &strawberry |          &strawberry
                 &no |                  &no
               TOTAL |                  302


In [12]:
col="description"
special_chars = amazon_data[col].apply(lambda x: set(re.findall(r'<\w*>', str(x))) if x else set()).aggregate({'Whitespace Characters': lambda x: set.union(*x)})[0]
print(f"HTML tags in column: '{col}': {special_chars}")

HTML tags in column: 'description': {'<TR>', '<th>', '<br>', '<td>', '<span>', '<B>', '<colgroup>', '<EOL>', '<h4>', '<blockquote>', '<Br>', '<div>', '<P>', '<H2>', '<big>', '<h2>', '<LI>', '<TD>', '<xml>', '<Kafemio>', '<small>', '<Li>', '<sup>', '<b>', '<u>', '<BR>', '<lu>', '<tr>', '<em>', '<milk>', '<Strong>', '<style>', '<li>', '<ul>', '<STRONG>', '<EM>', '<title>', '<bR>', '<html>', '<DIV>', '<center>', '<UL>', '<U>', '<table>', '<I>', '<strong>', '<tbody>', '<h5>', '<sub>', '<h3>', '<head>', '<ol>', '<body>', '<h1>', '<>', '<i>', '<H4>', '<H5>'}


In [13]:
category_count = amazon_data["category"].apply(lambda categories: len(categories))
category_count.name = "category_count"
category_count.value_counts()

4     126693
3      79167
5      67752
2      10992
6       2395
8         19
9         15
7          8
10         7
11         2
13         1
Name: category_count, dtype: int64

## Ontology Data

### Reading Ontology Data

In [14]:
ONTOLOGY_DATA_PATH = "I:\MSc\Product Dataset\\"
product_tags = pd.read_csv(ONTOLOGY_DATA_PATH + "product_tags.csv")
preferences = pd.read_csv(ONTOLOGY_DATA_PATH + "preferences.csv")
field_test_preferences = pd.read_csv(ONTOLOGY_DATA_PATH + "field_test_preferences.csv")
preference_tags = pd.read_csv(ONTOLOGY_DATA_PATH + "preference_tags.csv")
preference_to_tags = pd.read_csv(ONTOLOGY_DATA_PATH + "preference_to_tags.csv")
associations = pd.read_csv(ONTOLOGY_DATA_PATH + "associations.csv")

In [15]:
summary = pd.DataFrame()
summary["Total No. of Records"] = [product_tags.shape[0], 
                    preferences.shape[0], 
                    field_test_preferences.shape[0], 
                    preference_tags.shape[0], 
                    preference_to_tags.shape[0],
                    associations.shape[0]]

In [16]:
# Filtering data present in the latest version (i.e, not removed from any version)
product_tags = product_tags[product_tags["version removed"]==-1]
preferences = preferences[preferences["version removed"]==-1]
preference_tags = preference_tags[preference_tags["version removed"]==-1]
preference_to_tags = preference_to_tags[preference_to_tags["version removed"]==-1]
associations = associations[associations["version removed"]==-1]

# Dropping 'version removed' column
product_tags.drop("version removed", axis=1, inplace=True)
preferences.drop("version removed", axis=1, inplace=True)
preference_tags.drop("version removed", axis=1, inplace=True)
preference_to_tags.drop("version removed", axis=1, inplace=True)
associations.drop("version removed", axis=1, inplace=True)

field_test_preferences = field_test_preferences.iloc[:,:-3] 

In [17]:

summary["No. of Records (Latest)"] = [product_tags.shape[0], 
                    preferences.shape[0], 
                    field_test_preferences.shape[0], 
                    preference_tags.shape[0], 
                    preference_to_tags.shape[0],
                    associations.shape[0]]
summary["No. of Columns"] = [product_tags.shape[1], 
                    preferences.shape[1], 
                    field_test_preferences.shape[1], 
                    preference_tags.shape[1], 
                    preference_to_tags.shape[1],
                    associations.shape[1]]
summary["Column Names"] = [product_tags.columns.values, 
                    preferences.columns.values, 
                    field_test_preferences.columns.values, 
                    preference_tags.columns.values, 
                    preference_to_tags.columns.values,
                    associations.columns.values]
summary.index = ["product_tags", 
                    "preferences", 
                    "field_test_preferences", 
                    "preference_tags", 
                    "preference_to_tags",
                    "associations"]

#### Summary

In [18]:
summary

Unnamed: 0,Total No. of Records,No. of Records (Latest),No. of Columns,Column Names
product_tags,290,196,2,"[product tag id, product tag]"
preferences,99,25,2,"[preference id, preference]"
field_test_preferences,25,25,5,"[preference category, preference id, preferenc..."
preference_tags,103,61,2,"[preference tag id, preference tag]"
preference_to_tags,217,148,2,"[preference id, preference tag id]"
associations,1203,795,3,"[preference tag id, product tag id, score]"
