In [3]:
import matplotlib.pyplot as plt

# Source-credit footnote. The anchor point is the lower-left corner of the
# axes (axes fraction (0, 0)); the text is then offset 80 points to the left
# and 20 points down from that anchor and top-aligned.
plt.annotate(
    '[ProPublica https://www.propublica.org/datastore/dataset/amazon-pricing-algorithm-data-set]',
    xy=(0, 0),
    xytext=(-80, -20),
    xycoords='axes fraction',
    textcoords='offset points',
    fontsize=6,
    va='top',
)

Text(-80,-20,'[ProPublica https://www.propublica.org/datastore/dataset/amazon-pricing-algorithm-data-set]')

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# Location of the ranking-analysis CSV. Despite the variable name this is a
# path relative to the notebook's working directory, not a remote URL.
url = './data/09-19-Amazon-Ranking-Analysis.csv'

In [6]:
# Load the CSV with ScrapeDate as the row index; parse_dates=True asks pandas
# to parse that index as dates, giving the frame a DatetimeIndex.
amazon = pd.read_csv(
    url,
    index_col='ScrapeDate',
    parse_dates=True,
)

In [7]:
# Preview the first rows of the frame (head() returns 5 rows by default).
amazon.head()

Unnamed: 0_level_0,Index,ProductName,ScrapedIndex,ScrapedIndexTrueRank,CorrectedIndexTrueRank,ScrapedIndexPrice,CorrectedPrice,ScrapedIndexVendor,ScrapedIndexVendorType,BBScrapedPrice,BBVendorType,BBVendor,BBCorrectedPrice
ScrapeDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-06-14,0,#1 BEST Probiotic Supplement - 60 Day Supply w...,2,2,1,20.99,20.99,Supplement Warriors,Other,-1.0,FBA,Sold by Sharp Nutrition and Fulfilled by Amazon.,25.02
2016-06-14,1,#1 BEST Probiotic Supplement - 60 Day Supply w...,1,1,2,19.99,25.02,Sharp Nutrition,FBA,-1.0,FBA,Sold by Sharp Nutrition and Fulfilled by Amazon.,25.02
2016-06-27,2,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",1,1,1,3.96,3.96,speedy tests,Other,4.9,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97
2016-06-27,3,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",2,1,1,3.96,3.96,Drug Abuse Control,Other,4.9,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97
2016-06-27,4,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",3,1,1,3.96,3.96,Lowest Priced Tests,Other,4.9,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97


Number of rows and columns

In [8]:
# (number of rows, number of columns); the index is not counted as a column.
amazon.shape

(6973, 13)

Types of data in columns

In [9]:
# Per-column dtype; text columns (product and vendor names/types) show up as
# `object`, ranks as int64 and prices as float64.
amazon.dtypes

Index                       int64
ProductName                object
ScrapedIndex                int64
ScrapedIndexTrueRank        int64
CorrectedIndexTrueRank      int64
ScrapedIndexPrice         float64
CorrectedPrice            float64
ScrapedIndexVendor         object
ScrapedIndexVendorType     object
BBScrapedPrice            float64
BBVendorType               object
BBVendor                   object
BBCorrectedPrice          float64
dtype: object

Column labels

In [10]:
# Column labels of the frame (ScrapeDate is the index, so it is not listed).
amazon.columns

Index(['Index', 'ProductName', 'ScrapedIndex', 'ScrapedIndexTrueRank',
       'CorrectedIndexTrueRank', 'ScrapedIndexPrice', 'CorrectedPrice',
       'ScrapedIndexVendor', 'ScrapedIndexVendorType', 'BBScrapedPrice',
       'BBVendorType', 'BBVendor', 'BBCorrectedPrice'],
      dtype='object')

DataFrame contents & memory use

In [11]:
# Prints index range, per-column non-null counts and dtypes, and memory use.
# A non-null count below the total row count flags a column with missing data.
amazon.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6973 entries, 2016-06-14 to 2016-08-30
Data columns (total 13 columns):
Index                     6973 non-null int64
ProductName               6973 non-null object
ScrapedIndex              6973 non-null int64
ScrapedIndexTrueRank      6973 non-null int64
CorrectedIndexTrueRank    6973 non-null int64
ScrapedIndexPrice         6973 non-null float64
CorrectedPrice            6973 non-null float64
ScrapedIndexVendor        6973 non-null object
ScrapedIndexVendorType    6973 non-null object
BBScrapedPrice            6973 non-null float64
BBVendorType              6973 non-null object
BBVendor                  6809 non-null object
BBCorrectedPrice          6973 non-null float64
dtypes: float64(4), int64(4), object(5)
memory usage: 762.7+ KB


In [12]:
# Summary statistics for the numeric columns only.
# NOTE(review): BBScrapedPrice has a minimum of -1.0, which looks like a
# placeholder for "no price scraped" rather than a real price -- confirm
# before using this column's mean/std.
amazon.describe()

Unnamed: 0,Index,ScrapedIndex,ScrapedIndexTrueRank,CorrectedIndexTrueRank,ScrapedIndexPrice,CorrectedPrice,BBScrapedPrice,BBCorrectedPrice
count,6973.0,6973.0,6973.0,6973.0,6973.0,6973.0,6973.0,6973.0
mean,3486.0,25.041876,22.730962,21.5593,54.904896,55.567631,36.295817,45.827961
std,2013.076046,23.618871,20.520442,19.154709,74.293356,74.476092,61.838063,63.889925
min,0.0,1.0,1.0,1.0,2.38,2.38,-1.0,0.0
25%,1743.0,8.0,7.0,7.0,19.53,20.01,8.99,15.79
50%,3486.0,17.0,16.0,16.0,31.9,32.08,14.94,23.34
75%,5229.0,35.0,33.0,31.0,55.89,56.1,36.99,47.92
max,6972.0,162.0,113.0,107.0,931.8,931.8,698.0,706.11


Checking for missing data

In [13]:
# Count missing values column by column (axis=0 sums down each column).
amazon.isnull().sum(axis=0)

Index                       0
ProductName                 0
ScrapedIndex                0
ScrapedIndexTrueRank        0
CorrectedIndexTrueRank      0
ScrapedIndexPrice           0
CorrectedPrice              0
ScrapedIndexVendor          0
ScrapedIndexVendorType      0
BBScrapedPrice              0
BBVendorType                0
BBVendor                  164
BBCorrectedPrice            0
dtype: int64

The data is relatively clean: the only column with missing values is BBVendor (164 of 6,973 rows).

Checking FBA (Fulfilled by Amazon) rows, i.e. rows where BBVendorType is 'FBA'

In [14]:
# Boolean mask, aligned to the frame's rows: True where BBVendorType is 'FBA'.
fulfillment_by_amazon = amazon['BBVendorType'].eq('FBA')
# Keep only the rows selected by the mask, with every column.
amazon[fulfillment_by_amazon]

Unnamed: 0_level_0,Index,ProductName,ScrapedIndex,ScrapedIndexTrueRank,CorrectedIndexTrueRank,ScrapedIndexPrice,CorrectedPrice,ScrapedIndexVendor,ScrapedIndexVendorType,BBScrapedPrice,BBVendorType,BBVendor,BBCorrectedPrice
ScrapeDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-06-14,0,#1 BEST Probiotic Supplement - 60 Day Supply w...,2,2,1,20.99,20.99,Supplement Warriors,Other,-1.00,FBA,Sold by Sharp Nutrition and Fulfilled by Amazon.,25.02
2016-06-14,1,#1 BEST Probiotic Supplement - 60 Day Supply w...,1,1,2,19.99,25.02,Sharp Nutrition,FBA,-1.00,FBA,Sold by Sharp Nutrition and Fulfilled by Amazon.,25.02
2016-06-27,2,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",1,1,1,3.96,3.96,speedy tests,Other,4.90,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97
2016-06-27,3,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",2,1,1,3.96,3.96,Drug Abuse Control,Other,4.90,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97
2016-06-27,4,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",3,1,1,3.96,3.96,Lowest Priced Tests,Other,4.90,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97
2016-06-27,5,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",11,9,1,3.96,3.96,speedy tests,Other,4.90,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97
2016-06-27,6,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",5,3,2,6.50,6.50,"Uritox, LLC",Other,4.90,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97
2016-06-27,7,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",6,4,3,7.99,7.99,mynewwhip,Other,4.90,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97
2016-06-27,8,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",7,5,4,8.49,8.49,ChopShopDeals,Other,4.90,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97
2016-06-27,9,"10 Panel Dip Drug Testing Kit, Test for 10 Dif...",8,6,5,8.99,8.99,American Screening Corporation,Other,4.90,FBA,Sold by Michael Minyety and Fulfilled by Amazon.,9.97


In [15]:
# Numeric summary statistics restricted to the rows flagged by the FBA mask.
amazon[fulfillment_by_amazon].describe()

Unnamed: 0,Index,ScrapedIndex,ScrapedIndexTrueRank,CorrectedIndexTrueRank,ScrapedIndexPrice,CorrectedPrice,BBScrapedPrice,BBCorrectedPrice
count,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0
mean,4313.157447,10.62766,8.76383,8.685106,53.358319,54.893447,36.571553,46.602383
std,2129.520953,8.096413,6.581745,6.479654,105.926661,106.344166,100.835335,100.908712
min,0.0,1.0,1.0,1.0,2.97,2.97,-1.0,0.0
25%,2145.25,4.0,3.0,3.0,13.99,15.2475,4.41,11.28
50%,5154.5,9.0,7.0,7.0,22.95,24.84,10.76,17.05
75%,6071.75,15.0,12.0,12.0,41.87,43.405,20.69,36.43
max,6958.0,39.0,30.0,30.0,931.8,931.8,698.0,706.11


In [16]:
# Number of rows per product name, most frequent first.
amazon['ProductName'].value_counts()

Fujifilm INSTAX Mini Instant Film (Rainbow)                                                                                                                                                               154
Suncast GHW1732 Resin Wicker Trash Hideaway                                                                                                                                                               115
Fiskars 45mm Contour Rotary Cutter (195210-1001)                                                                                                                                                          106
K&N 99-5000 Aerosol Recharger Filter Care Service Kit                                                                                                                                                      95
Honeywell HT-908 Turbo Force Room Air Circulator Fan                                                                                                                            

In [17]:
# Row counts per product name, sorted by frequency; keep the first ten entries.
amazon['ProductName'].value_counts().iloc[:10]

Fujifilm INSTAX Mini Instant Film (Rainbow)                                154
Suncast GHW1732 Resin Wicker Trash Hideaway                                115
Fiskars 45mm Contour Rotary Cutter (195210-1001)                           106
K&N 99-5000 Aerosol Recharger Filter Care Service Kit                       95
Honeywell HT-908 Turbo Force Room Air Circulator Fan                        90
Sassy Developmental Bumpy Ball                                              88
ASUS (RT-AC68U) Wireless-AC1900 Dual-Band Gigabit Router                    87
Bushnell H2O Waterproof Compact Roof Prism Binocular, Black, 10 x 25-mm     87
K&N KN-204 Motorcycle/Powersports High Performance Oil Filter               87
Land Lakes Mini Moos Creamer, Half and Half Cups, 192 Count                 86
Name: ProductName, dtype: int64

In [19]:
# Ten most frequently listed products.
# value_counts() orders product names by how many rows they appear in
# (descending), so head(10) keeps the ten largest counts. Calling head(10)
# directly on the raw column would only return the first ten rows of the
# file, which are not the most common products.
# Result: a Series indexed by product name with the row count as its value;
# use `top_ten.index` when only the names are needed.
top_ten = amazon.loc[:, 'ProductName'].value_counts().head(10)

In [21]:
# Re-list the column labels as a reference for choosing the next column to
# inspect (same result as the earlier amazon.columns cell).
amazon.columns

Index(['Index', 'ProductName', 'ScrapedIndex', 'ScrapedIndexTrueRank',
       'CorrectedIndexTrueRank', 'ScrapedIndexPrice', 'CorrectedPrice',
       'ScrapedIndexVendor', 'ScrapedIndexVendorType', 'BBScrapedPrice',
       'BBVendorType', 'BBVendor', 'BBCorrectedPrice'],
      dtype='object')

In [22]:
# Number of rows per scraped vendor, most frequent first. This prints one
# line per distinct vendor; append .head(n) to limit the output.
amazon['ScrapedIndexVendor'].value_counts()

Amazon.com                                                           192
UnbeatableSale, Inc                                                   47
Gatzies                                                               41
DDN INC                                                               37
MINDeyes                                                              35
Toys US                                                               35
DropAir                                                               34
V1-Store                                                              33
ShopChanute                                                           32
Mild Shop                                                             31
MeagaDeal                                                             30
XoXoGroupLLC                                                          29
TheNewMall                                                            28
Happy Titan                                        

In [26]:
plot_frame = amazon[['ProductName', 'ScrapedIndexVendor']]