In [1]:
import pandas as pd
import pandas_profiling
from pandas.io.json import json_normalize

## Products data set 

In [2]:
# Load the gzip-compressed Best Buy product catalogue into a DataFrame.
df_products = pd.read_json(path_or_buf="data/products.json.gz")

In [3]:
# Preview the first three raw product records.
df_products.head(n=3)

Unnamed: 0,category,description,image,manufacturer,model,name,price,shipping,sku,type,upc,url
0,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",Compatible with select electronic devices; AAA...,http://img.bbystatic.com/BestBuy_US/images/pro...,Duracell,MN2400B4Z,Duracell - AAA Batteries (4-Pack),5.49,5.49,43900,HardGood,41333424019,http://www.bestbuy.com/site/duracell-aaa-batte...
1,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",Long-lasting energy; DURALOCK Power Preserve t...,http://img.bbystatic.com/BestBuy_US/images/pro...,Duracell,MN1500B4Z,Duracell - AA 1.5V CopperTop Batteries (4-Pack),5.49,5.49,48530,HardGood,41333415017,http://www.bestbuy.com/site/duracell-aa-1-5v-c...
2,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",Compatible with select electronic devices; AA ...,http://img.bbystatic.com/BestBuy_US/images/pro...,Duracell,MN1500B8Z,Duracell - AA Batteries (8-Pack),7.49,5.49,127687,HardGood,41333825014,http://www.bestbuy.com/site/duracell-aa-batter...


#### Description of attributes
* image: url to product image
* upc: Universal Product Code (UPC)
* sku: Best Buy unique 7-digit product identifier

##### Links/references: 
* https://bestbuyapis.github.io/api-documentation/#detail
* https://bestbuyapis.github.io/bby-query-builder/#/productSearch

#### Processing "category" column
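* Each **category** value is a list of JSON objects (one `{id, name}` entry per category level), so we expand it into one column per level (`Category_F1`, `Category_F2`, ...) holding that level's category id.
* Why are we doing this? A nested list cannot be used directly as a feature or target; if you would like to predict a product's category, you need the category ids as flat columns.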

In [4]:
# Expand the nested "category" column into one id column per category level
# (Category_F1, Category_F2, ...), then drop the columns not used in the analysis.
def _category_ids(categories):
    """Return the category ids of one product, in list order.

    Parameters
    ----------
    categories : list of dict, or anything else
        The raw "category" cell of one product; each dict is expected to carry
        an "id" key (and a "name" key, which is ignored).

    Returns
    -------
    list
        The "id" of every dict entry. A value that is not a list (e.g. a
        missing value) or an empty list yields [], so such products get NaN in
        every Category_F* column instead of raising a KeyError.
    """
    if not isinstance(categories, (list, tuple)):
        return []
    return [level.get("id") for level in categories if isinstance(level, dict)]


# One {level position: category id} record per product. Building the frame once
# from plain records replaces the per-row json_normalize/transpose/concat loop
# (one tiny DataFrame per product) and no longer depends on the deprecated
# pandas.io.json.json_normalize import.
category_records = [
    dict(enumerate(_category_ids(categories)))
    for categories in df_products["category"]
]
# Reuse the products index so the join below aligns row by row without
# assuming the index is a default 0..n-1 range.
df_category = pd.DataFrame(category_records, index=df_products.index)

# Order the level columns numerically and name them Category_F1..Category_F<n>
# for however many levels the data contains (the original hard-coded 7).
df_category = df_category.reindex(columns=sorted(df_category.columns))
df_category.columns = [
    "Category_F{}".format(position + 1) for position in df_category.columns
]

# Attach the level columns and drop the columns that don't provide substantial
# information for the analysis of the data.
# NOTE(review): rename(index=str) is kept from the original cell; it turns the
# integer row labels into strings. Remove it if no later cell relies on string
# labels - it looks like a leftover from the pandas docs example.
df_products = (
    df_products
    .join(df_category)
    .drop(["sku", "upc", "url", "image", "category"], axis=1)
    .rename(index=str)
)

# Delete temp objects..
del category_records, df_category

In [5]:
# Preview the first three products after the category ids were expanded.
df_products.head(n=3)

Unnamed: 0,description,manufacturer,model,name,price,shipping,type,Category_F1,Category_F2,Category_F3,Category_F4,Category_F5,Category_F6,Category_F7
0,Compatible with select electronic devices; AAA...,Duracell,MN2400B4Z,Duracell - AAA Batteries (4-Pack),5.49,5.49,HardGood,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
1,Long-lasting energy; DURALOCK Power Preserve t...,Duracell,MN1500B4Z,Duracell - AA 1.5V CopperTop Batteries (4-Pack),5.49,5.49,HardGood,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
2,Compatible with select electronic devices; AA ...,Duracell,MN1500B8Z,Duracell - AA Batteries (8-Pack),7.49,5.49,HardGood,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,


## Analysis of data
* The data set is explored with pandas-profiling
    * Link: https://github.com/pandas-profiling/pandas-profiling
* Some features of pandas-profiling:
    * Generates a neat html report of a dataframe
    * User can interactively explore attributes

In [6]:
# Build the pandas-profiling report for the processed products frame; as the
# cell's last expression it is rendered inline as the cell output.
pandas_profiling.ProfileReport(df_products)

0,1
Number of variables,15
Number of observations,51646
Total Missing (%),23.8%
Total size in memory,5.9 MiB
Average record size in memory,120.0 B

0,1
Numeric,1
Categorical,13
Boolean,0
Date,0
Text (Unique),1
Rejected,0
Unsupported,0

0,1
Distinct count,62
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
abcat0900000,9317
abcat0800000,6905
abcat0500000,6088
Other values (59),29336

Value,Count,Frequency (%),Unnamed: 3
abcat0900000,9317,18.0%,
abcat0800000,6905,13.4%,
abcat0500000,6088,11.8%,
abcat0700000,5837,11.3%,
pcmcat312300050015,4968,9.6%,
abcat0400000,3645,7.1%,
abcat0200000,3603,7.0%,
abcat0207000,3013,5.8%,
abcat0300000,2673,5.2%,
abcat0100000,2215,4.3%,

0,1
Distinct count,173
Unique (%),0.3%
Missing (%),1.5%
Missing (n),755

0,1
abcat0811002,6176
abcat0912000,3708
pcmcat232900050017,3556
Other values (169),37451

Value,Count,Frequency (%),Unnamed: 3
abcat0811002,6176,12.0%,
abcat0912000,3708,7.2%,
pcmcat232900050017,3556,6.9%,
abcat0515000,2414,4.7%,
abcat0410000,2204,4.3%,
pcmcat165900050023,1491,2.9%,
abcat0904000,1345,2.6%,
pcmcat241600050001,1334,2.6%,
pcmcat309900050001,1119,2.2%,
abcat0204000,1115,2.2%,

0,1
Distinct count,628
Unique (%),1.2%
Missing (%),12.9%
Missing (n),6643

0,1
pcmcat191200050015,2079
abcat0811006,1847
pcmcat331600050007,1351
Other values (624),39726
(Missing),6643

Value,Count,Frequency (%),Unnamed: 3
pcmcat191200050015,2079,4.0%,
abcat0811006,1847,3.6%,
pcmcat331600050007,1351,2.6%,
abcat0807000,1091,2.1%,
abcat0515025,845,1.6%,
pcmcat242000050002,821,1.6%,
abcat0811004,801,1.6%,
abcat0205000,788,1.5%,
pcmcat748301695371,724,1.4%,
pcmcat179100050006,715,1.4%,

0,1
Distinct count,778
Unique (%),1.5%
Missing (%),49.4%
Missing (n),25488

0,1
pcmcat214700050000,1892
pcmcat165900050031,1092
abcat0807001,497
Other values (774),22677
(Missing),25488

Value,Count,Frequency (%),Unnamed: 3
pcmcat214700050000,1892,3.7%,
pcmcat165900050031,1092,2.1%,
abcat0807001,497,1.0%,
pcmcat226900050013,486,0.9%,
abcat0515028,432,0.8%,
pcmcat332100050000,409,0.8%,
pcmcat179200050008,358,0.7%,
pcmcat251300050004,346,0.7%,
pcmcat326000050010,344,0.7%,
pcmcat247400050000,343,0.7%,

0,1
Distinct count,203
Unique (%),0.4%
Missing (%),89.1%
Missing (n),46000

0,1
pcmcat165900050033,739
pcmcat350800050010,233
pcmcat748300580023,214
Other values (199),4460
(Missing),46000

Value,Count,Frequency (%),Unnamed: 3
pcmcat165900050033,739,1.4%,
pcmcat350800050010,233,0.5%,
pcmcat748300580023,214,0.4%,
pcmcat165900050034,171,0.3%,
pcmcat328900050008,163,0.3%,
pcmcat258900050007,135,0.3%,
pcmcat748300579994,126,0.2%,
pcmcat748300322875,120,0.2%,
pcmcat194000050023,115,0.2%,
pcmcat258900050003,114,0.2%,

0,1
Distinct count,24
Unique (%),0.0%
Missing (%),99.3%
Missing (n),51300

0,1
abcat0511004,51
pcmcat748300323194,44
pcmcat748300323342,42
Other values (20),209
(Missing),51300

Value,Count,Frequency (%),Unnamed: 3
abcat0511004,51,0.1%,
pcmcat748300323194,44,0.1%,
pcmcat748300323342,42,0.1%,
pcmcat223500050006,36,0.1%,
pcmcat748300322984,33,0.1%,
abcat0511005,31,0.1%,
pcmcat748300323090,31,0.1%,
pcmcat223500050007,15,0.0%,
pcmcat165900050028,9,0.0%,
pcmcat165900050027,9,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),100.0%
Missing (n),51645

0,1
pcmcat223500050009,1
(Missing),51645

Value,Count,Frequency (%),Unnamed: 3
pcmcat223500050009,1,0.0%,
(Missing),51645,100.0%,

0,1
Distinct count,38537
Unique (%),74.6%
Missing (%),0.0%
Missing (n),0

0,1
Perfect gift card? Piece of cake. All Best Buy gift cards are shipped free and are good toward future purchases online and in U.S. or Puerto Rico Best Buy stores. Best Buy gift cards do not have an expiration date.,370
Drive evil crazy to save Skylands,37
"Android 5.0 Lollipop operating system4G LTE speedWi-Fi5.1"" Super AMOLED touch screenBluetooth enabled",36
Other values (38534),51203

Value,Count,Frequency (%),Unnamed: 3
Perfect gift card? Piece of cake. All Best Buy gift cards are shipped free and are good toward future purchases online and in U.S. or Puerto Rico Best Buy stores. Best Buy gift cards do not have an expiration date.,370,0.7%,
Drive evil crazy to save Skylands,37,0.1%,
"Android 5.0 Lollipop operating system4G LTE speedWi-Fi5.1"" Super AMOLED touch screenBluetooth enabled",36,0.1%,
"5.5-inch Retina HD displayA9 chip with 64-bit desktop-class architectureNew 12MP iSight camera with Live Photos3D TouchiOS 9 and iCloud7.3mm thin, seamless design",30,0.1%,
"4.7-inch Retina HD displayA9 chip with 64-bit desktop-class architectureNew 12MP iSight camera with Live Photos3D TouchiOS and iCloud7.1mm thin, seamless design",30,0.1%,
LA CUISINE Cast-Iron Round Covered Casserole: Wide-angle wedge-profile handles; tiered lid design; preseasoned; cast-iron construction; easy to clean,27,0.1%,
Dial control; 3 temperature settings; dishwasher-safe stoneware and lid; lid-mounted locking system; recipes included; team colors,26,0.1%,
Expand your LEGO Dimensions experience,25,0.0%,
"Only at Best Buy Compatible with Apple iPhone SE, 5s and 5; polycarbonate and TPU materials; raised bezel; snap-on installation; play-through design",25,0.0%,
"Compatible with iPhone 5, 5c, 5s, 6 and 6 Plus with iOS 8.2 or later; Wi-Fi; Bluetooth 4.0; Digital Crown; Taptic Engine; Force Touch; Retina display",25,0.0%,

First 3 values
449
15620
15160

Last 3 values
16760
17953
3861

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
10,1,0.0%,
100,1,0.0%,
1000,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
9995,1,0.0%,
9996,1,0.0%,
9997,1,0.0%,
9998,1,0.0%,
9999,1,0.0%,

0,1
Distinct count,2364
Unique (%),4.6%
Missing (%),0.1%
Missing (n),62

0,1
Metra,1064
Incipio,972
Samsung,910
Other values (2360),48638

Value,Count,Frequency (%),Unnamed: 3
Metra,1064,2.1%,
Incipio,972,1.9%,
Samsung,910,1.8%,
GE,794,1.5%,
Insignia™,752,1.5%,
Apple,733,1.4%,
Hal Leonard,727,1.4%,
HP,723,1.4%,
Sony,681,1.3%,
Activision,573,1.1%,

0,1
Distinct count,47184
Unique (%),91.4%
Missing (%),5.5%
Missing (n),2863

0,1
PREOWNED,517
Pre-Owned G,117
PRE OWNED,76
Other values (47180),48073
(Missing),2863

Value,Count,Frequency (%),Unnamed: 3
PREOWNED,517,1.0%,
Pre-Owned G,117,0.2%,
PRE OWNED,76,0.1%,
12345,66,0.1%,
BND200,35,0.1%,
TBD,30,0.1%,
PRE-OWNED,28,0.1%,
BND300,24,0.0%,
E3,18,0.0%,
PREOWN,13,0.0%,

0,1
Distinct count,48558
Unique (%),94.0%
Missing (%),0.0%
Missing (n),1

0,1
Lenmar - Lithium-Ion Battery - Black,31
DigiPower - Lithium-Ion Battery - Black,20
Ultralast - Lithium-Ion Battery for Select LG Cell Phones - White/Black,19
Other values (48554),51575

Value,Count,Frequency (%),Unnamed: 3
Lenmar - Lithium-Ion Battery - Black,31,0.1%,
DigiPower - Lithium-Ion Battery - Black,20,0.0%,
Ultralast - Lithium-Ion Battery for Select LG Cell Phones - White/Black,19,0.0%,
Ultralast - Lithium-Ion Battery for Select Samsung Cell Phones - Multi,19,0.0%,
DigiPower - Rechargeable Lithium-Ion Battery - Black,17,0.0%,
Ultralast - Lithium-Ion Battery for Select Samsung Cell Phones - Black,16,0.0%,
Bower - External Flash - Black,16,0.0%,
Lenmar - Rechargeable Lithium-Ion Battery - Black,15,0.0%,
Energizer - High-Capacity Rechargeable Lithium-Ion Battery - Black,14,0.0%,
DENAQ - Lithium-Ion Battery - Black,14,0.0%,

0,1
Distinct count,1513
Unique (%),2.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,251.36
Minimum,0.01
Maximum,28000
Zeros (%),0.0%

0,1
Minimum,0.01
5-th percentile,9.99
Q1,24.99
Median,59.99
Q3,199.99
95-th percentile,1200.0
Maximum,28000.0
Range,28000.0
Interquartile range,175.0

0,1
Standard deviation,576.95
Coef of variation,2.2954
Kurtosis,197.82
Mean,251.36
MAD,294.36
Skewness,8.8622
Sum,12982000
Variance,332880
Memory size,403.6 KiB

Value,Count,Frequency (%),Unnamed: 3
19.99,2882,5.6%,
9.99,2776,5.4%,
29.99,2576,5.0%,
49.99,2308,4.5%,
39.99,2278,4.4%,
99.99,1661,3.2%,
14.99,1539,3.0%,
24.99,1526,3.0%,
59.99,1490,2.9%,
34.99,1317,2.6%,

Value,Count,Frequency (%),Unnamed: 3
0.01,9,0.0%,
0.99,16,0.0%,
1.0,3,0.0%,
1.49,63,0.1%,
1.99,16,0.0%,

Value,Count,Frequency (%),Unnamed: 3
12999.99,1,0.0%,
14999.98,1,0.0%,
16299.99,1,0.0%,
17999.99,1,0.0%,
27999.98,1,0.0%,

0,1
Distinct count,201
Unique (%),0.4%
Missing (%),0.0%
Missing (n),0

0,1
0,32870
5.49,7284
3.99,4923
Other values (198),6569

Value,Count,Frequency (%),Unnamed: 3
0,32870,63.6%,
5.49,7284,14.1%,
3.99,4923,9.5%,
,3055,5.9%,
5.99,1167,2.3%,
6.49,625,1.2%,
6.99,380,0.7%,
7.49,237,0.5%,
109.99,128,0.2%,
7.99,114,0.2%,

0,1
Distinct count,6
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
HardGood,46103
Game,5037
Software,444
Other values (3),62

Value,Count,Frequency (%),Unnamed: 3
HardGood,46103,89.3%,
Game,5037,9.8%,
Software,444,0.9%,
BlackTie,49,0.1%,
Movie,11,0.0%,
Music,2,0.0%,

Unnamed: 0,description,manufacturer,model,name,price,shipping,type,Category_F1,Category_F2,Category_F3,Category_F4,Category_F5,Category_F6,Category_F7
0,Compatible with select electronic devices; AAA...,Duracell,MN2400B4Z,Duracell - AAA Batteries (4-Pack),5.49,5.49,HardGood,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
1,Long-lasting energy; DURALOCK Power Preserve t...,Duracell,MN1500B4Z,Duracell - AA 1.5V CopperTop Batteries (4-Pack),5.49,5.49,HardGood,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
2,Compatible with select electronic devices; AA ...,Duracell,MN1500B8Z,Duracell - AA Batteries (8-Pack),7.49,5.49,HardGood,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
3,4-pack AA alkaline batteries; battery tester i...,Energizer,E91BP-4,Energizer - MAX Batteries AA (4-Pack),4.99,5.49,HardGood,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
4,Compatible with select electronic devices; C s...,Duracell,MN1400R4Z,Duracell - C Batteries (4-Pack),8.99,5.49,HardGood,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
