In [39]:
# Basic libraries
import pandas as pd
import numpy as np
import re

# Project libraries
# set path to local modules and submodules
import sys, os
sys.path.append(os.path.abspath("src")) # add src folder to path
# import local modules and submodules
import data_ravers_utils.file_handler as fl
import data_ravers_utils.eda_utils as eda

# Notebook-wide configuration
import logging
import warnings

# Render every column when a DataFrame is displayed
pd.options.display.max_columns = None

# Silence all Python warnings for the rest of the session
warnings.filterwarnings("ignore")

# Let the root logger pass everything from DEBUG level upwards
logging.root.setLevel(logging.DEBUG)

Bandcamp sales dataset contains 1,000,000 items from Bandcamp's sales feed between 9/9/2020 and 10/2/2020.

- _id: unique identifier combining the sale's URL and UTC timestamp.
- url: the path to the item on Bandcamp. Use this column to join this dataset to the dataset of Bandcamp items.
- artist_name: Name of the artist.
- album_title: Title of the album, if applicable.
- art_url: path to the item's art image.
- item_type: denotes the type of object. a for digital albums, p for physical items, and t for digital tracks.
- slug_type: also denotes the type of object. a for all albums, p for merch, and t for tracks.
- utc_date: the UTC timestamp of the sale datetime.
- country_code: country code of the buyer.
- country: full country name of the buyer.
- item_price: price of the item in the seller's currency.
- currency: the seller's currency.
- amount_paid: amount paid in the seller's currency.
- amount_paid_fmt: amount paid in the seller's currency, with the currency symbol.
- amount_paid_usd: amount paid converted to US Dollars.
- amount_over_fmt: amount voluntarily paid over the item price in the seller's currency.

In [40]:
# Load the pickled raw sales snapshot and work on a copy,
# so the frame returned by the loader (`data`) stays untouched.
df_filename = "bandcamp-sales-v0-raw"
data = fl.read_df_pickle(df_filename)
df = data.copy()
df.head()

Unnamed: 0,_id,art_url,item_type,utc_date,country_code,track_album_slug_text,country,slug_type,amount_paid_fmt,item_price,item_description,art_id,url,amount_paid,releases,artist_name,currency,album_title,amount_paid_usd,package_image_id,amount_over_fmt,item_slug,addl_count
0,1599688803.5175&//girlbanddublin.bandcamp.com/...,https://f4.bcbits.com/img/a0206405257_7.jpg,a,1599689000.0,gb,,United Kingdom,a,$9.99,9.99,Live at Vicar Street,206405300.0,//girlbanddublin.bandcamp.com/album/live-at-vi...,9.99,,Girl Band,USD,,9.99,,,,
1,1599688805.27838&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a2984241552_7.jpg,a,1599689000.0,fi,,Finland,a,£1,1.0,Neurogen,2984242000.0,//maharettarecords.bandcamp.com/album/neurogen,1.0,,Jirah,GBP,,1.3,,,,
2,1599688805.90646&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a3320494770_7.jpg,a,1599689000.0,fi,,Finland,a,£3,3.0,The Last Snare Bender,3320495000.0,//maharettarecords.bandcamp.com/album/the-last...,3.0,,D-Ther,GBP,,3.9,,,,
3,1599688806.94234&//alicesitski.bandcamp.com/al...,https://f4.bcbits.com/img/0020476345_37.jpg,p,1599689000.0,gb,,United Kingdom,a,€10.50,10.5,Limited Edition Compact Disc,,//alicesitski.bandcamp.com/album/white-noise-tv,10.5,,WHITE NOISE TV,EUR,WHITE NOISE TV,12.39,20476345.0,,,
4,1599688809.07942&//linguaignota.bandcamp.com/t...,https://f4.bcbits.com/img/a3428873396_7.jpg,t,1599689000.0,us,,United States,t,$1,1.0,O Ruthless Great Divine Director,3428873000.0,//linguaignota.bandcamp.com/track/o-ruthless-g...,1.0,,LINGUA IGNOTA,USD,,1.0,,,,


In [41]:
# Run the project's automatic cleanup helper on the working frame.
# NOTE(review): the return value is discarded — confirm that auto_cleanup modifies `df` in place.
eda.auto_cleanup(df)

INFO:root:Number of rows before cleanup: 1000000
INFO:root:Dataset has no empty spaces.
INFO:root:Dataset has no duplicates.
INFO:root:Number of rows after cleanup: 1000000


In [42]:
# Per-column missing-value summary of the raw frame (counts and percentages, see output below)
df_nulls = eda.count_nulls(df)
print(df_nulls)

                        is_na   not_na  na_percent na_percent_pretty
track_album_slug_text  997763     2237     99.7763            99.78%
releases               988416    11584     98.8416            98.84%
item_slug              978258    21742     97.8258            97.83%
addl_count             978258    21742     97.8258            97.83%
amount_over_fmt        880867   119133     88.0867            88.09%
package_image_id       767292   232708     76.7292            76.73%
album_title            643539   356461     64.3539            64.35%
art_id                 235480   764520     23.5480            23.55%
slug_type               11584   988416      1.1584             1.16%
item_description           23   999977      0.0023             0.00%
artist_name                10   999990      0.0010             0.00%
_id                         0  1000000      0.0000             0.00%
art_url                     0  1000000      0.0000             0.00%
item_type                   0  100

## Preserving target data

- `amount_paid_usd` column contains the total cost of purchase in unified format - converted from original currency to USD for compatibility.
- `amount_over_fmt` column contains information about voluntarily paid over the item price in the seller's currency. It is important for the project objective of exploring fan generosity.


## Analysing data types and unique values to reduce data junk

In [43]:
from IPython.utils.capture import capture_output

# Run the EDA report inside a capture context: whatever it prints is collected in
# `output` instead of being displayed, so the next cell can print it and save it to a file.
with capture_output() as output:
    eda.print_eda_report(df)

In [44]:
# Show the captured EDA report, then persist it as a markdown file.
print(output.stdout)

report_path = f'{fl.PROJECT_ROOT}/docs/auto_eda_raw_report.md'

# Create the target folder if it is missing (e.g. on a fresh checkout),
# instead of letting open() fail with FileNotFoundError.
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Write with an explicit encoding: the report echoes raw column values, which may
# contain non-ASCII text that the platform default encoding cannot represent.
with open(report_path, "w", encoding="utf-8") as f:
    f.write(output.stdout)

print(f"Output has been saved to file:\n{report_path}")

Dataset has shape (1000000, 23)

Dataset has numerical data in columns: ['utc_date', 'item_price', 'art_id', 'amount_paid', 'releases', 'amount_paid_usd', 'package_image_id', 'addl_count']
- Column "utc_date" has 999990 unique values.
- Column "art_id" has 271430 unique values.
- Column "package_image_id" has 63940 unique values.
- Column "amount_paid_usd" has 5866 unique values.
- Column "amount_paid" has 3315 unique values.
- Column "item_price" has 2878 unique values.
- Column "releases" has 220 unique values.
- Column "addl_count" has 17 unique values.
  -- Unique values are:
 [nan  1.  2.  3.  9.  8.  5.  4.  7. 12.  6. 19. 11. 14. 10. 22. 13. 15.]

Dataset has categorical data in columns: ['_id', 'art_url', 'item_type', 'country_code', 'track_album_slug_text', 'country', 'slug_type', 'amount_paid_fmt', 'item_description', 'url', 'artist_name', 'currency', 'album_title', 'amount_over_fmt', 'item_slug']
- Column "_id" has 1000000 unique values.
- Column "art_url" has 335212 unique 

## Identifying columns that are clearly not important in the context of this project


### What to do with `_id` and `utc_date`?
- `_id`: unique identifier combining the sale's URL and UTC timestamp.
  - Column "_id" has 1000000 unique values.
- `utc_date`: the UTC timestamp of the sale datetime.
  - Column "utc_date" has 999990 unique values.
  - There are no null values

Plan:
- Fill null values in `utc_date` with parsed timestamps from `_id`.
- Drop `_id` column.
- Split `utc_date` into columns that can be used for Inferential Statistics and ML.
- Drop `utc_date` column.


In [45]:
# Number of missing timestamps in `utc_date`
df["utc_date"].isna().sum()

np.int64(0)

In [46]:
# Remove the `_id` column from the working frame
df = df.drop(columns="_id")

In [47]:
# Interpret the numeric `utc_date` values as epoch seconds and turn them into datetimes
df["utc_date"] = pd.to_datetime(
    df["utc_date"],
    unit="s",
)

In [48]:
# Derive calendar features from the sale timestamp instead of using the raw timestamp itself.
# NOTE(review): `dayofweek` and `weekday` carry the same values (Monday=0 ... Sunday=6);
# one of them is redundant — confirm before modelling.
df = df.assign(
    hour=df["utc_date"].dt.hour,
    dayofweek=df["utc_date"].dt.dayofweek,
    month=df["utc_date"].dt.month,  # January=1, December=12
    year=df["utc_date"].dt.year,
    weekday=df["utc_date"].dt.weekday,  # Monday=0, Sunday=6
    weekend=df["utc_date"].dt.weekday >= 5,  # True for weekday values 5 and 6
)


Cyclical encoding helps machine learning models understand time patterns that wrap around — like:
- hour: 23 → 0 is not a jump of 23 hours — they’re adjacent
- weekday: Monday and Sunday are next to each other in the weekly cycle
- month: December → January

In [49]:
# Encode each periodic feature as a sine/cosine pair, so that the end of a cycle
# lies next to its beginning (e.g. hour 23 next to hour 0).
cycle_lengths = {
    "hour": 24,     # hours per day
    "weekday": 7,   # days per week
    "month": 12,    # months per year
}

for feature, period in cycle_lengths.items():
    df[f"{feature}_sin"] = np.sin(2 * np.pi * df[feature] / period)
    df[f"{feature}_cos"] = np.cos(2 * np.pi * df[feature] / period)

# Preview the six encoded columns
columns_to_show = [
    f"{feature}_{suffix}"
    for feature in cycle_lengths
    for suffix in ("sin", "cos")
]
df[columns_to_show].head()

Unnamed: 0,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos
0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16
1,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16
2,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16
3,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16
4,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16


In [50]:
# Possible later step (not applied yet): drop the plain hour/weekday/month columns
# df.drop(columns=["hour", "weekday", "month"], inplace=True)

# The timestamp column itself is no longer needed once the features are extracted
df = df.drop(columns="utc_date")


In [51]:
# Names of all time-derived columns, kept for later use
columns_datetime = [
    # plain calendar features
    "hour", "dayofweek", "month", "year", "weekday", "weekend",
    # cyclical sine/cosine encodings
    "hour_sin", "hour_cos", "weekday_sin", "weekday_cos", "month_sin", "month_cos",
]

### What to do with `track_album_slug_text`, `item_slug`?!

In [52]:
# First 5 entries of the unique-value list for 'item_slug'
eda.unique_values_list(df, 'item_slug')[:5]

array([nan, '/album/', '/album/prabhupadas-dissp-2015-3-vishakha-brhmch',
       '/album/friends-from-childhood', '/album/will-he-save-us'],
      dtype=object)

In [53]:
# Last 5 entries of the unique-value list for 'item_slug', final entry first.
# NOTE(review): the displayed values are not in alphabetical order; presumably the helper
# returns them in order of first appearance — confirm in eda.unique_values_list.
eda.unique_values_list(df, 'item_slug')[:-6:-1]

array(['/album/failure-to-return',
       '/album/fuck-you-diggy-1-d-x-nem-x-lil-woofy-woof-x-dj-killa-c-x-g-lo-key-x-mr-sisco-prod-nem',
       '/album/individuum-ii', '/album/hand-covering-sun-extended-mix',
       '/album/airbuccaneers-2'], dtype=object)

In [54]:
# First 5 entries of the unique-value list for 'track_album_slug_text'
eda.unique_values_list(df, 'track_album_slug_text')[:5]

array([nan, 'pachuco', 'dale-roberts',
       'prabhupadas-dissp-2015-3-vishakha-brhmch', '--348'], dtype=object)

In [55]:
# Last 5 entries of the unique-value list for 'track_album_slug_text', final entry first.
# NOTE(review): the displayed values are not in alphabetical order; presumably the helper
# returns them in order of first appearance — confirm in eda.unique_values_list.
eda.unique_values_list(df, 'track_album_slug_text')[:-6:-1]

array(['fetus-tacos-2', '04-allmiladies-3', 'party-plannin-intro',
       'aquawave', 'failure-to-return'], dtype=object)

In [56]:
# Start the list of columns to remove with the two slug columns
columns_to_drop = ['item_slug', 'track_album_slug_text']

There is no description of the `addl_count` column, and 97.83% of its values are null. Therefore this column should be dropped.

In [57]:
# Inspect the distinct values of 'addl_count' before deciding to drop it
eda.unique_values_list(df, 'addl_count')

array([nan,  1.,  2.,  3.,  9.,  8.,  5.,  4.,  7., 12.,  6., 19., 11.,
       14., 10., 22., 13., 15.])

In [58]:
# Queue 'addl_count' for removal
columns_to_drop += ['addl_count']

Columns related to artworks should be dropped for the scope of this project as retrieving and interpreting the relevant information is costly.

In [None]:
# Queue the image-related columns (and `url`) for removal.
# NOTE(review): `url` is described above as the join key to the Bandcamp items dataset —
# confirm it is really meant to be dropped. `art_id` also looks artwork-related but is
# not listed here — confirm whether it should be.
columns_to_drop.extend(['package_image_id', 'art_url', 'url'])

### Dropping columns that are not useful

In [60]:
# Remove all queued columns from the working frame
df = df.drop(columns=columns_to_drop)

## Treating target related columns

This could be done earlier, but by doing transformations on the reduced data set productivity will be higher.

- `amount_paid_usd` column contains the total cost of purchase in unified format - converted from original currency to USD for compatibility. 
  - This is the target variable for understanding sales trends.
  - This data is derived from columns:
    - `amount_paid_fmt`, `amount_paid`, `item_price`, `currency`, `amount_over_fmt`
    - Data relation formula in pseudocode:
    - ```amount_paid = item_price + amount_over_fmt;  amount_paid_usd = amount_paid converted from currency to USD```
    - This assumption must be verified before any of the columns will be dropped.

- `amount_over_fmt` column contains information about voluntarily paid over the item price in the seller's currency. It is important for the project objective of exploring fan generosity.
  - this is stored as categorical data, not numeric
  - this data is stored in seller's currency and needs to be unified by converting into dollars. For this `currency` column is necessary. 
  - 88.09% of the column is missing data. These are cases where buyers did not pay extra on top of `item_price`. They must be filled with 0.


#### Convert `amount_over_fmt` from categorical to numerical

In [61]:
# Count how many entries of each Python type the column holds
df["amount_over_fmt"].map(type).value_counts()

amount_over_fmt
<class 'float'>    880867
<class 'str'>      119133
Name: count, dtype: int64

In [62]:
# Show a few of the entries that are stored as text
is_text = df["amount_over_fmt"].map(lambda value: isinstance(value, str))
df.loc[is_text, "amount_over_fmt"].head()

13      €2
18      $6
27      $5
28     £10
37    $197
Name: amount_over_fmt, dtype: object

In [63]:
# Remove currency symbols from string entries and convert to float
def clean_amount_over(x):
    """Convert a formatted amount such as "€2" or "$1,250.50" to a float.

    Parameters
    ----------
    x : str or float
        A formatted amount string, or a value that is already numeric / NaN.

    Returns
    -------
    float
        The parsed amount for strings; NaN when the string contains no parseable
        number; non-string input is returned unchanged.
    """
    if not isinstance(x, str):
        return x  # already float or NaN

    # Keep digits and the decimal point only. Commas are removed as well: the data
    # uses "." as the decimal mark (e.g. "€10.50"), so a comma can only be a
    # thousands separator, and leaving it in makes float() fail ("1,000" -> NaN).
    cleaned = re.sub(r"[^\d.]", "", x)
    try:
        return float(cleaned)
    except ValueError:
        # Nothing numeric left (e.g. empty string) or malformed number
        return np.nan

# Parse every entry to a number, then turn any remaining empty strings into NaN
df["amount_over_fmt"] = (
    df["amount_over_fmt"]
    .apply(clean_amount_over)
    .replace("", np.nan)
)

In [64]:
# A missing value means no extra payment was made, so it becomes 0
df["amount_over_fmt"] = df["amount_over_fmt"].fillna(value=0.0)

In [65]:
# Verify that the column is clean now.
# Single quotes are used inside the f-strings: reusing the enclosing double quote
# is only valid from Python 3.12 on and is a SyntaxError on older interpreters.
print(f"All values sould be float now: {df['amount_over_fmt'].dtype}\n")
print(f"No null values are expected: {df['amount_over_fmt'].isnull().sum()}\n")
print(f"Treated as numeric and sow automatical stats\n: {df['amount_over_fmt'].describe()}\n")
# The stats are not meaningful yet because the values are in different currencies

All values sould be float now: float64

No null values are expected: 0

Treated as numeric and sow automatical stats
: count    1000000.000000
mean           1.415943
std           17.658453
min            0.000000
25%            0.000000
50%            0.000000
75%            0.000000
max          999.200000
Name: amount_over_fmt, dtype: float64



#### Convert currencies
- For perfect precision, the currency conversion rate should be chosen according to the timestamp of each sale.
- Precision is less important here than unifying the data for machine learning.
- As we already have `amount_paid_usd` and `currency` for reference, we can deduce the conversion rate from them.

In [66]:
# Sanity check: for sales made in USD, amount_paid_usd should equal amount_paid
usd_rows = df["currency"] == "USD"
usd_sales = df.loc[usd_rows, ["amount_paid_usd", "amount_paid"]]
usd_diff = usd_sales["amount_paid_usd"].sub(usd_sales["amount_paid"]).abs()

print("USD mismatches:", (usd_diff > 0.01).sum(), "/", usd_rows.sum())

USD mismatches: 0 / 455569


In [67]:
# Derive a per-sale conversion rate to USD from the two amount columns
df["conversion_rate"] = df["amount_paid_usd"] / df["amount_paid"]

# A zero amount_paid yields +/-inf (or NaN for 0/0). isnull() does not report inf,
# so convert it to NaN to keep the later missing-value checks trustworthy.
df["conversion_rate"] = df["conversion_rate"].replace([np.inf, -np.inf], np.nan)

# USD sales need no conversion: force the rate to exactly 1.0 (this also repairs
# USD rows whose rate is NaN); rates of other currencies are left as computed.
df.loc[df["currency"] == "USD", "conversion_rate"] = 1.0

df[["currency", "amount_paid", "amount_paid_usd", "conversion_rate"]].sample(5)

Unnamed: 0,currency,amount_paid,amount_paid_usd,conversion_rate
144293,GBP,1.5,1.92,1.28
98646,USD,10.0,10.0,1.0
208918,USD,7.0,7.0,1.0
381612,USD,1.0,1.0,1.0
461747,EUR,15.0,17.76,1.184


In [68]:
# Express the voluntary extra payment in USD using the per-sale conversion rate
df["amount_over_usd"] = df["amount_over_fmt"].mul(df["conversion_rate"])

df["amount_over_usd"].describe()

count    1000000.000000
mean           0.874630
std            6.984831
min            0.000000
25%            0.000000
50%            0.000000
75%            0.000000
max         1276.953795
Name: amount_over_usd, dtype: float64

In [69]:
# Express the listed item price in USD using the per-sale conversion rate
df["item_price_usd"] = df["item_price"].mul(df["conversion_rate"])

df["item_price_usd"].describe()

count    1000000.000000
mean           7.951427
std           10.511279
min            0.000000
25%            1.288000
50%            5.070000
75%           10.000000
max         1000.000000
Name: item_price_usd, dtype: float64

In [70]:
# Double-check that none of the money-related columns contain missing values
columns_money = [
    "amount_paid", "amount_paid_usd",
    "item_price", "item_price_usd",
    "amount_over_fmt", "amount_over_usd",
    "currency", "conversion_rate",
]
df[columns_money].isna().sum()

amount_paid        0
amount_paid_usd    0
item_price         0
item_price_usd     0
amount_over_fmt    0
amount_over_usd    0
currency           0
conversion_rate    0
dtype: int64

In [71]:
# Check the identity amount_paid_usd = item_price_usd + amount_over_usd
df["paid_usd_diff"] = (
    df["amount_paid_usd"] - (df["item_price_usd"] + df["amount_over_usd"])
).abs()

# Rows count as matching when the absolute gap stays below a small rounding allowance
tolerance = 0.05
matching_rows_usd = df["paid_usd_diff"].lt(tolerance)

print(f"Matching rows: {matching_rows_usd.sum()} / {len(df)}")


Matching rows: 861860 / 1000000


In [72]:
# # drop unnecessary columns
# columns_money_drop = ["amount_over_fmt", "amount_paid", "item_price", "currency", "conversion_rate"]
# df.drop(columns=columns_money_drop, inplace=True)
# columns_money = [col for col in columns_money if col not in columns_money_drop]
# print(f"Columns left: {columns_money}")

#### Verify that amount_paid = item_price + amount_over_fmt

In [73]:
# Rebuild the paid amount from the USD columns (item_price_usd + amount_over_usd)
# and measure how far it is from amount_paid_usd.
# NOTE(review): this repeats the USD check on the USD columns; it does not compare
# the seller-currency columns amount_paid / item_price / amount_over_fmt.
df["recomputed_paid"] = df["item_price_usd"].add(df["amount_over_usd"])
df["paid_diff"] = df["amount_paid_usd"].sub(df["recomputed_paid"]).abs()

# Rows whose gap is below 0.05 are treated as matching (rounding allowance)
matching_rows = df["paid_diff"].lt(0.05)
print("Matching rows:", matching_rows.sum(), "/", len(df))

Matching rows: 861860 / 1000000


In [90]:
# Track the two verification columns together with the other money columns
columns_money.extend(["recomputed_paid", "paid_diff"])

#### Study mismatch cases where amount_paid differs from item_price + amount_over_fmt

In [80]:
# Keep an independent copy of the rows whose gap reaches the 0.05 threshold
mismatch_df = df.loc[df["paid_diff"] >= 0.05].copy()
print("Mismatch rows:", len(mismatch_df))

Mismatch rows: 138140


In [75]:
# Statistical summary of the gap size (paid_diff) among the mismatching rows
mismatch_df["paid_diff"].describe()

count    138140.000000
mean          1.316660
std           5.066392
min           0.050000
25%           0.730000
50%           1.000000
75%           1.273568
max        1000.000000
Name: paid_diff, dtype: float64

In [79]:
# The 10 currencies with the most mismatching rows
mismatch_df["currency"].value_counts().head(10)

currency
USD    75449
EUR    30411
GBP    19503
CAD     3708
AUD     3402
JPY     2532
SEK      729
CHF      517
NZD      387
DKK      331
Name: count, dtype: int64

In [81]:
# Inspect mismatching rows where less than 1 USD was paid
small_payments = mismatch_df["amount_paid_usd"] < 1.0
mismatch_df.loc[
    small_payments,
    ["amount_paid", "item_price_usd", "amount_over_usd", "amount_paid_usd"],
].head()

Unnamed: 0,amount_paid,item_price_usd,amount_over_usd,amount_paid_usd
84,0.75,0.0,0.0,0.97
104,1.0,0.0,0.0,0.73
138,0.5,0.0,0.0,0.65
145,0.5,0.0,0.0,0.59
147,0.5,0.0,0.0,0.59


In [82]:
# Look at a few random mismatching rows with all amount columns side by side
comparison_columns = [
    "currency", "item_price", "amount_over_fmt", "amount_paid", "amount_paid_usd",
    "item_price_usd", "amount_over_usd", "recomputed_paid", "paid_diff",
]
mismatch_df[comparison_columns].sample(5)

Unnamed: 0,currency,item_price,amount_over_fmt,amount_paid,amount_paid_usd,item_price_usd,amount_over_usd,recomputed_paid,paid_diff
556053,EUR,7.77,0.0,8.0,9.34,9.071475,0.0,9.071475,0.268525
753123,EUR,18.0,0.0,19.0,22.1,20.936842,0.0,20.936842,1.163158
30474,GBP,0.0,0.0,1.0,1.29,0.0,0.0,0.0,1.29
337449,GBP,7.0,0.0,8.0,10.36,9.065,0.0,9.065,1.295
728171,EUR,0.0,0.0,0.5,0.58,0.0,0.0,0.0,0.58


In [89]:
# Hypothesis: mismatching rows carry no extra payment (amount_over_usd == 0)
over_is_zero = mismatch_df["amount_over_usd"] == 0

zero_count = over_is_zero.sum()
nonzero_count = (~over_is_zero).sum()

print(f"Zero values for amount_over_usd: {zero_count}")
print(f"Non-zero values for amount_over_usd: {nonzero_count}")


Zero values for amount_over_usd: 138056
Non-zero values for amount_over_usd: 84


In [91]:
# Inspect the mismatching rows that do carry an extra payment, largest gap first
nonzero_mismatches = mismatch_df.loc[mismatch_df["amount_over_usd"] != 0]

(
    nonzero_mismatches
    .sort_values(by="paid_diff", ascending=False)
    .loc[:, columns_money]
    .head(10)
)


Unnamed: 0,amount_paid,amount_paid_usd,item_price,item_price_usd,amount_over_fmt,amount_over_usd,currency,conversion_rate,recomputed_paid,paid_diff
894602,50.0,50.0,0.0,0.0,25.0,25.0,USD,1.0,25.0,25.0
526214,25.0,25.0,5.0,5.0,4.0,4.0,USD,1.0,9.0,16.0
850451,20.0,20.0,0.0,0.0,5.0,5.0,USD,1.0,5.0,15.0
850450,20.0,20.0,0.0,0.0,5.0,5.0,USD,1.0,5.0,15.0
605644,25.0,25.0,6.45,6.45,3.71,3.71,USD,1.0,10.16,14.84
851832,12.5,14.66,0.0,0.0,1.5,1.7592,EUR,1.1728,1.7592,12.9008
658799,20.0,20.0,4.0,4.0,4.0,4.0,USD,1.0,8.0,12.0
658794,20.0,20.0,4.0,4.0,4.0,4.0,USD,1.0,8.0,12.0
255522,25.0,25.0,0.0,0.0,15.0,15.0,USD,1.0,15.0,10.0
443350,20.0,20.0,0.0,0.0,10.0,10.0,USD,1.0,10.0,10.0


In [94]:
# Hypothesis: the mismatches come from items listed with a zero price
free_items = mismatch_df.loc[mismatch_df["item_price"] == 0]
print("Rows with item_price == 0:", len(free_items))
free_items.loc[:, columns_money].head()

Rows with item_price == 0: 73757


Unnamed: 0,amount_paid,amount_paid_usd,item_price,item_price_usd,amount_over_fmt,amount_over_usd,currency,conversion_rate,recomputed_paid,paid_diff
6,1.0,1.0,0.0,0.0,0.0,0.0,USD,1.0,0.0,1.0
49,1.0,1.0,0.0,0.0,0.0,0.0,USD,1.0,0.0,1.0
76,1.0,1.0,0.0,0.0,0.0,0.0,USD,1.0,0.0,1.0
84,0.75,0.97,0.0,0.0,0.0,0.0,GBP,1.293333,0.0,0.97
104,1.0,0.73,0.0,0.0,0.0,0.0,AUD,0.73,0.0,0.73


#### Decompose mismatch cases where amount_paid differs from item_price + amount_over_fmt
- Case 1: [amount_over_usd == 0]
  - This means that customer did not add extra donation when buying an item
- Case 2: [item_price_usd == 0] & [amount_over_usd == 0]
  - The item was for free and customer did not pay extra
- Case 3: [item_price_usd == 0] & [amount_over_usd == 0]
  - .
- Case 4: [item_price_usd == 0] & [amount_over_usd == 0]
  - .

## Treating absence of data

In [76]:
# Per-column missing-value summary of the frame after the transformations above
df_nulls = eda.count_nulls(df)
print(df_nulls)

                   is_na   not_na  na_percent na_percent_pretty
releases          988416    11584     98.8416            98.84%
album_title       643539   356461     64.3539            64.35%
art_id            235480   764520     23.5480            23.55%
slug_type          11584   988416      1.1584             1.16%
item_description      23   999977      0.0023             0.00%
artist_name           10   999990      0.0010             0.00%
item_type              0  1000000      0.0000             0.00%
country_code           0  1000000      0.0000             0.00%
country                0  1000000      0.0000             0.00%
amount_paid_fmt        0  1000000      0.0000             0.00%
item_price             0  1000000      0.0000             0.00%
url                    0  1000000      0.0000             0.00%
amount_paid            0  1000000      0.0000             0.00%
currency               0  1000000      0.0000             0.00%
amount_paid_usd        0  1000000      0

In [77]:
# TODO


## Backup dataframe

In [78]:
# Persist the processed frame under a new name via the project's pickle helper
# (the log line below reports the resulting file path).
df_filename = 'bandcamp-sales-v1-cleaned'
fl.save_df_pickle(df, df_filename)

INFO:root:Backup file is created: /Users/bubblegum_doubledrops/Library/Mobile Documents/com~apple~CloudDocs/0prio - Important heavy backups/IronHack/big_projects/midproject-bandcamp-insights/data/bandcamp-sales-v1-cleaned.pkl
