In [10]:
# --- Standard library ---
import logging
import os
import sys
import warnings

# --- Third-party ---
import pandas as pd

# --- Project-local modules ---
# The local "src" folder must be on sys.path before the project packages
# below can be imported.
sys.path.append(os.path.abspath("src"))
import data_ravers_utils.file_handler
import data_ravers_utils.eda_utils as eda

# --- Display / warning settings ---
pd.set_option('display.max_columns', None)  # never truncate columns of wide frames
warnings.filterwarnings('ignore')  # suppress every warning for the whole session

# --- Logging ---
# Lower the root logger threshold to DEBUG so all log records are emitted.
logging.getLogger().setLevel(logging.DEBUG)

The Bandcamp sales dataset contains 1,000,000 items from Bandcamp's sales feed between 9/9/2020 and 10/2/2020.

It is a slice of the whole dataset used in The Chaos Bazaar. It contains the following columns:

- _id: unique identifier combining the sale's URL and UTC timestamp.
- url: the path to the item on Bandcamp. Use this column to join this dataset to the dataset of Bandcamp items.
- artist_name: Name of the artist.
- album_title: Title of the album, if applicable.
- art_url: path to the item's art image.
- item_type: denotes the type of object. a for digital albums, p for physical items, and t for digital tracks.
- slug_type: also denotes the type of object. a for all albums, p for merch, and t for tracks.
- utc_date: the UTC timestamp of the sale datetime.
- country_code: country code of the buyer.
- country: full country name of the buyer.
- item_price: price of the item in the seller's currency.
- currency: the seller's currency.
- amount_paid: amount paid in the seller's currency.
- amount_paid_fmt: amount paid in the seller's currency, with the currency symbol.
- amount_paid_usd: amount paid converted to US Dollars.
- amount_over_fmt: amount voluntarily paid over the item price in the seller's currency.

In [11]:
# Base name of the pickled dataset; used as the single source for the load below.
df_filename = 'bandcamp-sales'
# Only the module `data_ravers_utils.file_handler` is imported, so the bare name
# `read_df_pickle` is not in scope on a fresh kernel; call it through the module.
# NOTE(review): assumes read_df_pickle is defined in data_ravers_utils.file_handler
# (the only file-handling module this notebook imports) — confirm.
data = data_ravers_utils.file_handler.read_df_pickle(df_filename)
# Work on a copy so the loaded frame `data` stays untouched.
df = data.copy()
# Preview the first rows.
df.head(5)

Unnamed: 0,_id,art_url,item_type,utc_date,country_code,track_album_slug_text,country,slug_type,amount_paid_fmt,item_price,item_description,art_id,url,amount_paid,releases,artist_name,currency,album_title,amount_paid_usd,package_image_id,amount_over_fmt,item_slug,addl_count
0,1599688803.5175&//girlbanddublin.bandcamp.com/...,https://f4.bcbits.com/img/a0206405257_7.jpg,a,1599689000.0,gb,,United Kingdom,a,$9.99,9.99,Live at Vicar Street,206405300.0,//girlbanddublin.bandcamp.com/album/live-at-vi...,9.99,,Girl Band,USD,,9.99,,,,
1,1599688805.27838&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a2984241552_7.jpg,a,1599689000.0,fi,,Finland,a,£1,1.0,Neurogen,2984242000.0,//maharettarecords.bandcamp.com/album/neurogen,1.0,,Jirah,GBP,,1.3,,,,
2,1599688805.90646&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a3320494770_7.jpg,a,1599689000.0,fi,,Finland,a,£3,3.0,The Last Snare Bender,3320495000.0,//maharettarecords.bandcamp.com/album/the-last...,3.0,,D-Ther,GBP,,3.9,,,,
3,1599688806.94234&//alicesitski.bandcamp.com/al...,https://f4.bcbits.com/img/0020476345_37.jpg,p,1599689000.0,gb,,United Kingdom,a,€10.50,10.5,Limited Edition Compact Disc,,//alicesitski.bandcamp.com/album/white-noise-tv,10.5,,WHITE NOISE TV,EUR,WHITE NOISE TV,12.39,20476345.0,,,
4,1599688809.07942&//linguaignota.bandcamp.com/t...,https://f4.bcbits.com/img/a3428873396_7.jpg,t,1599689000.0,us,,United States,t,$1,1.0,O Ruthless Great Divine Director,3428873000.0,//linguaignota.bandcamp.com/track/o-ruthless-g...,1.0,,LINGUA IGNOTA,USD,,1.0,,,,


#### Dropping columns that are clearly not relevant in the context of this project
- artworks
- keep only 'amount_paid_usd' for the purchase value, as it is unified
- currency is dropped because there is a country column and all payments are unified to USD

In [12]:
# Columns judged irrelevant for this project (see the notes above this cell).
columns_to_drop = [
    'track_album_slug_text',
    'package_image_id',
    'art_url',
    'amount_paid_fmt',
    'amount_paid',
    'addl_count',
    'item_price',
    'currency',
]
# Build `df` from the untouched `data` frame rather than mutating `df` in place:
# the cell is then idempotent, and re-running it no longer raises KeyError on
# columns that were already dropped. Re-run the cells below after re-running
# this one, since `df` is rebuilt from the raw data.
df = data.drop(columns=columns_to_drop)

In [13]:
# Run the project's automatic cleanup on df. Per the log output below, it
# reports the row count before and after, plus empty-space and duplicate checks.
# NOTE(review): the return value is discarded — presumably df is modified in
# place; confirm against eda.auto_cleanup.
eda.auto_cleanup(df)

INFO:root:Number of rows before cleanup: 1000000
INFO:root:Dataset has no empty spaces.
INFO:root:Dataset has no duplicates.
INFO:root:Number of rows after cleanup: 1000000


In [14]:
# Per-column null summary of df. The printed table shows is_na / not_na counts
# and the NA percentage (numeric and formatted) for each column.
# NOTE(review): rows appear ordered by null count, descending — confirm in eda.count_nulls.
df_nulls = eda.count_nulls(df)
print(df_nulls)

                   is_na   not_na  na_percent na_percent_pretty
releases          988416    11584     98.8416            98.84%
item_slug         978258    21742     97.8258            97.83%
amount_over_fmt   880867   119133     88.0867            88.09%
album_title       643539   356461     64.3539            64.35%
art_id            235480   764520     23.5480            23.55%
slug_type          11584   988416      1.1584             1.16%
item_description      23   999977      0.0023             0.00%
artist_name           10   999990      0.0010             0.00%
_id                    0  1000000      0.0000             0.00%
item_type              0  1000000      0.0000             0.00%
utc_date               0  1000000      0.0000             0.00%
country_code           0  1000000      0.0000             0.00%
country                0  1000000      0.0000             0.00%
url                    0  1000000      0.0000             0.00%
amount_paid_usd        0  1000000      0