In [43]:
# --- Standard library ---
import logging
import os
import sys
import warnings

# --- Third-party ---
import pandas as pd

# --- Project modules ---
# The local packages live in ./src, so that folder has to be on sys.path
# before they can be imported.
sys.path.append(os.path.abspath("src"))
import data_ravers_utils.file_handler as fl
import data_ravers_utils.eda_utils as eda

# --- Display and warning settings ---
pd.set_option('display.max_columns', None)  # never truncate the column list of wide frames
warnings.filterwarnings('ignore')  # silence every warning for the rest of the session

# --- Logging ---
# Put the root logger at DEBUG level.
logging.getLogger().setLevel(logging.DEBUG)

The Bandcamp sales dataset contains 1,000,000 items from Bandcamp's sales feed between 9/9/2020 and 10/2/2020 (M/D/YYYY, i.e. 9 September to 2 October 2020).

- _id: unique identifier combining the sale's URL and UTC timestamp.
- url: the path to the item on Bandcamp. Use this column to join this dataset to the dataset of Bandcamp items.
- artist_name: Name of the artist.
- album_title: Title of the album, if applicable.
- art_url: path to the item's art image.
- item_type: denotes the type of object. a for digital albums, p for physical items, and t for digital tracks.
- slug_type: also denotes the type of object. a for all albums, p for merch, and t for tracks.
- utc_date: the UTC timestamp of the sale datetime.
- country_code: country code of the buyer.
- country: full country name of the buyer.
- item_price: price of the item in the seller's currency.
- currency: the seller's currency.
- amount_paid: amount paid in the seller's currency.
- amount_paid_fmt: amount paid in the seller's currency, with the currency symbol.
- amount_paid_usd: amount paid converted to US Dollars.
- amount_over_fmt: amount voluntarily paid over the item price in the seller's currency.

In [44]:
# Base name of the pickled dataset; the same name is reused when the frame is saved later on.
df_filename = 'bandcamp-sales'
# Load through the shared variable instead of repeating the string literal,
# so the name used for reading and the name used for saving cannot drift apart.
data = fl.read_df_pickle(df_filename)
# Work on a copy so that `data` keeps the frame exactly as it was loaded.
df = data.copy()
# Preview the first rows as the cell output.
df.head(5)

Unnamed: 0,_id,art_url,item_type,utc_date,country_code,track_album_slug_text,country,slug_type,amount_paid_fmt,item_price,item_description,art_id,url,amount_paid,releases,artist_name,currency,album_title,amount_paid_usd,package_image_id,amount_over_fmt,item_slug,addl_count
0,1599688803.5175&//girlbanddublin.bandcamp.com/...,https://f4.bcbits.com/img/a0206405257_7.jpg,a,1599689000.0,gb,,United Kingdom,a,$9.99,9.99,Live at Vicar Street,206405300.0,//girlbanddublin.bandcamp.com/album/live-at-vi...,9.99,,Girl Band,USD,,9.99,,,,
1,1599688805.27838&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a2984241552_7.jpg,a,1599689000.0,fi,,Finland,a,£1,1.0,Neurogen,2984242000.0,//maharettarecords.bandcamp.com/album/neurogen,1.0,,Jirah,GBP,,1.3,,,,
2,1599688805.90646&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a3320494770_7.jpg,a,1599689000.0,fi,,Finland,a,£3,3.0,The Last Snare Bender,3320495000.0,//maharettarecords.bandcamp.com/album/the-last...,3.0,,D-Ther,GBP,,3.9,,,,
3,1599688806.94234&//alicesitski.bandcamp.com/al...,https://f4.bcbits.com/img/0020476345_37.jpg,p,1599689000.0,gb,,United Kingdom,a,€10.50,10.5,Limited Edition Compact Disc,,//alicesitski.bandcamp.com/album/white-noise-tv,10.5,,WHITE NOISE TV,EUR,WHITE NOISE TV,12.39,20476345.0,,,
4,1599688809.07942&//linguaignota.bandcamp.com/t...,https://f4.bcbits.com/img/a3428873396_7.jpg,t,1599689000.0,us,,United States,t,$1,1.0,O Ruthless Great Divine Director,3428873000.0,//linguaignota.bandcamp.com/track/o-ruthless-g...,1.0,,LINGUA IGNOTA,USD,,1.0,,,,


In [45]:
# Run the project's automatic cleanup helper on the working copy.
# NOTE(review): judging by its log output it checks for empty strings and duplicate rows;
# confirm whether it modifies `df` in place or returns a new frame.
eda.auto_cleanup(df)

INFO:root:Number of rows before cleanup: 1000000
INFO:root:Dataset has no empty spaces.
INFO:root:Dataset has no duplicates.
INFO:root:Number of rows after cleanup: 1000000


In [46]:
# Per-column null counts and percentages for the working frame.
df_nulls = eda.count_nulls(df)
# Leave the result as the cell's last expression so Jupyter renders it with its
# rich representation instead of the plain-text form produced by print().
df_nulls

                        is_na   not_na  na_percent na_percent_pretty
track_album_slug_text  997763     2237     99.7763            99.78%
releases               988416    11584     98.8416            98.84%
item_slug              978258    21742     97.8258            97.83%
addl_count             978258    21742     97.8258            97.83%
amount_over_fmt        880867   119133     88.0867            88.09%
package_image_id       767292   232708     76.7292            76.73%
album_title            643539   356461     64.3539            64.35%
art_id                 235480   764520     23.5480            23.55%
slug_type               11584   988416      1.1584             1.16%
item_description           23   999977      0.0023             0.00%
artist_name                10   999990      0.0010             0.00%
_id                         0  1000000      0.0000             0.00%
art_url                     0  1000000      0.0000             0.00%
item_type                   0  100

## Preserving target data

- `amount_paid_usd` column contains the total cost of purchase in unified format - converted from original currency to USD for compatibility. 
  - This is the target variable for understanding sales trends.
  - This data is derived from columns:
    - `amount_paid_fmt`, `amount_paid`, `item_price`, `currency`, `amount_over_fmt`
    - Data relation formula in pseudocode:
    - ```amount_paid = item_price + amount_over_fmt; amount_paid_usd = amount_paid converted from currency to USD```
    - This assumption must be verified before any of these columns are dropped.

- `amount_over_fmt` column contains the amount voluntarily paid over the item price, in the seller's currency. It is important for the project objective of exploring fan generosity.
  - This data is stored in the seller's currency and needs to be unified by converting it into US dollars. The `currency` column is required for this conversion.
  - 88.09% of the column is missing data. These are cases where buyers did not pay extra on top of `item_price`, so the missing values must be filled with 0.


## Analysing data types and unique values to reduce data junk

In [47]:
from IPython.utils.capture import capture_output

# Build the capture context first, then run the EDA report inside it.
# Everything the report emits is buffered in `output` instead of being shown here.
report_capture = capture_output()
with report_capture as output:
    eda.print_eda_report(df)

In [48]:
# Echo the captured EDA report in the notebook.
print(output.stdout)

report_path = f'{fl.PROJECT_ROOT}/docs/auto_eda_raw_report.md'

# Make sure the target folder exists so that a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Write with an explicit encoding: the report can contain non-ASCII text taken from
# the dataset (currency symbols, artist names), and the platform default encoding
# is not guaranteed to be able to represent it.
with open(report_path, "w", encoding="utf-8") as f:
    f.write(output.stdout)

print(f"Output has been saved to file:\n{report_path}")

Dataset has shape (1000000, 23)

Dataset has numerical data in columns: ['utc_date', 'item_price', 'art_id', 'amount_paid', 'releases', 'amount_paid_usd', 'package_image_id', 'addl_count']
- Column "utc_date" has 999990 unique values.
- Column "art_id" has 271430 unique values.
- Column "package_image_id" has 63940 unique values.
- Column "amount_paid_usd" has 5866 unique values.
- Column "amount_paid" has 3315 unique values.
- Column "item_price" has 2878 unique values.
- Column "releases" has 220 unique values.
- Column "addl_count" has 17 unique values.
  -- Unique values are:
 [nan  1.  2.  3.  9.  8.  5.  4.  7. 12.  6. 19. 11. 14. 10. 22. 13. 15.]

Dataset has categorical data in columns: ['_id', 'art_url', 'item_type', 'country_code', 'track_album_slug_text', 'country', 'slug_type', 'amount_paid_fmt', 'item_description', 'url', 'artist_name', 'currency', 'album_title', 'amount_over_fmt', 'item_slug']
- Column "_id" has 1000000 unique values.
- Column "art_url" has 335212 unique 

#### Identifying columns that are clearly not relevant in the context of this project


In [49]:
# Collects the names of columns judged irrelevant; they are removed from `df` in a single drop further down.
columns_to_drop = []

Open question: what should be done with `track_album_slug_text` and `item_slug`?

In [50]:
# First 5 entries of the unique-value list for 'item_slug' (NaN is listed as one of the values)
eda.unique_values_list(df, 'item_slug')[:5]

array([nan, '/album/', '/album/prabhupadas-dissp-2015-3-vishakha-brhmch',
       '/album/friends-from-childhood', '/album/will-he-save-us'],
      dtype=object)

In [51]:
# Last 5 entries of the unique-value list for 'item_slug', shown in reverse order.
# NOTE(review): the first entries displayed for this column are not in alphabetical order,
# so the list is presumably in order of first appearance rather than sorted; confirm.
eda.unique_values_list(df, 'item_slug')[-5:][::-1]

array(['/album/failure-to-return',
       '/album/fuck-you-diggy-1-d-x-nem-x-lil-woofy-woof-x-dj-killa-c-x-g-lo-key-x-mr-sisco-prod-nem',
       '/album/individuum-ii', '/album/hand-covering-sun-extended-mix',
       '/album/airbuccaneers-2'], dtype=object)

In [52]:
# First 5 entries of the unique-value list for 'track_album_slug_text' (NaN is listed as one of the values)
eda.unique_values_list(df, 'track_album_slug_text')[:5]

array([nan, 'pachuco', 'dale-roberts',
       'prabhupadas-dissp-2015-3-vishakha-brhmch', '--348'], dtype=object)

In [53]:
# Last 5 entries of the unique-value list for 'track_album_slug_text', shown in reverse order.
# NOTE(review): the first entries displayed for this column are not in alphabetical order,
# so the list is presumably in order of first appearance rather than sorted; confirm.
eda.unique_values_list(df, 'track_album_slug_text')[-5:][::-1]

array(['fetus-tacos-2', '04-allmiladies-3', 'party-plannin-intro',
       'aquawave', 'failure-to-return'], dtype=object)

In [54]:
# Both slug columns are almost entirely null (97.83% and 99.78% in the null report above),
# so they are marked for removal. Re-running this cell appends the names a second time.
columns_to_drop += ['item_slug', 'track_album_slug_text']

The `addl_count` column has no description, and 97.83% of its values are null. Therefore, this column should be dropped.

In [55]:
# Look at the distinct values of the undocumented 'addl_count' column before deciding what to do with it.
eda.unique_values_list(df, 'addl_count')

array([nan,  1.,  2.,  3.,  9.,  8.,  5.,  4.,  7., 12.,  6., 19., 11.,
       14., 10., 22., 13., 15.])

In [56]:
# Undocumented and 97.83% null (see the null report above): mark for removal.
columns_to_drop.append('addl_count')

Columns related to artworks should be dropped for the scope of this project as retrieving and interpreting the relevant information is costly.

In [57]:
# Artwork-related columns are out of scope for this project, so mark them for removal.
# NOTE(review): `art_id` also looks artwork-related but is not listed here;
# confirm whether it is kept on purpose.
columns_to_drop += ['package_image_id', 'art_url']

### Dropping columns that are not useful

In [58]:
# Fail loudly on typos: every collected name must exist in the frame as it was loaded (`data`).
unknown_columns = [col for col in columns_to_drop if col not in data.columns]
if unknown_columns:
    raise KeyError(f"Columns not found in the dataset: {unknown_columns}")

# Rebind instead of using `inplace=True`, and ignore names that are already gone,
# so that re-running this cell is a no-op instead of raising a KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

## Treating target related columns

## Treating absence of data

## Backup dataframe

In [59]:
# Persist the reduced frame as a pickle under the shared dataset name.
# NOTE(review): `df_filename` is the same name the raw data was read from at the top of the
# notebook. If both helpers use the same folder, this overwrites the raw input, and a
# Restart & Run All would then start from the already-reduced frame; confirm, and
# consider saving under a separate name.
fl.save_df_pickle(df, df_filename)

INFO:root:Backup file is created: /Users/bubblegum_doubledrops/Library/Mobile Documents/com~apple~CloudDocs/0prio - Important heavy backups/IronHack/big_projects/midproject-bandcamp-insights/data/bandcamp-sales.pkl
