In [3]:
import pandas as pd
from src import datasets

In [4]:
# Setup: create dataframes from perdownloaded csv files, excluded from git repo
# all these files can be downloaded from the web pages linked below

datasets_files = [
'1000000-bandcamp-sales.csv', 
'discogs_20250101_artists.csv', 
'release_data.csv']

project_pack = datasets.DatasetPack()

ds_bandcamp_sales = datasets.Dataset('df_bandcamp_sales', datasets_files[0])
project_pack.add_dataset(ds_bandcamp_sales)
df_bandcamp_sales = ds_bandcamp_sales.dataframe

ds_discogs_artists = datasets.Dataset('df_discogs_artists', datasets_files[1])
project_pack.add_dataset(ds_discogs_artists)
df_discogs_artists = ds_discogs_artists.dataframe

ds_discogs_releases = datasets.Dataset('df_discogs_releases', datasets_files[2])
project_pack.add_dataset(ds_discogs_releases)
df_discogs_releases = ds_discogs_releases.dataframe

print(f"The following data frames are created with corresponding labels in data set pack:\n {project_pack.get_labels()}")

The following data frames are created with corresponding labels in data set pack:
 ['df_bandcamp_sales', 'df_discogs_artists', 'df_discogs_releases']


# Datasets raw source overview

## Discogs Datasets (January 2025)

Source: https://www.kaggle.com/datasets/ofurkancoban/discogs-datasets-january-2025

This file contains detailed information about artists from the Discogs Data Dump as of January 1, 2025. 

It provides structured data about individual artists, including their names, unique IDs, profiles, and related information, all in CSV format.

Columns:
- Artist ID: Unique identifier for each artist.
- Artist Name: The name of the artist.
- Profile: A short description or biography of the artist.
- Real Name: The real name of the artist (if available).
- Aliases: Alternative names or pseudonyms used by the artist.
- Groups: Bands or groups the artist is associated with.
- URLs: Links to external websites or profiles related to the artist.
- Data Quality: Indicates the quality or completeness of the data entry.

Use Case:
This file is ideal for applications like music cataloging, trend analysis, artist mapping, or building recommendation systems based on artist relationships and metadata.

In [21]:
df_discogs_artists.sample(30)

Unnamed: 0.1,Unnamed: 0,aliases_name,aliases_name_id,artist_data_quality,artist_id,artist_name,artist_realname,groups_name,groups_name_id,members_name,members_name_id,namevariations_name
4753567,4753567,,,Needs Vote,6264626,Gérald Daragnes,,Tample,4587682.0,,,Gérald Daragnès
8659321,8659321,,,Needs Major Changes,13626688,Die Watussis,,,,,,
5771649,5771649,,,Needs Vote,7279387,Buku Broux,,,,,,
110709,110709,,,Needs Vote,1373711,Dorota Ślęzak,,Piwnica Pod Baranami,3127081.0,,,
4992934,4992934,,,Needs Major Changes,6509048,Henry Posada (2),,,,,,
5533633,5533633,,,Needs Major Changes,7051041,D.D.B. (3),,,,,,
621312,621312,,,Needs Major Changes,1924144,Cassie Brzoza Designs,,,,,,
3725672,3725672,,,Needs Vote,5156574,M. Willingham (2),,Phaze (4),1198096.0,,,
6500530,6500530,,,Needs Major Changes,7999375,T.NAVA,,,,,,
4675356,4675356,,,Needs Vote,6188703,Bryan Brady,,Locked Up (2),8167408.0,,,Bryan


In [4]:
df_discogs_artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9194907 entries, 0 to 9194906
Data columns (total 14 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Unnamed: 0           int64  
 1   aliases_name         object 
 2   aliases_name_id      float64
 3   artist_data_quality  object 
 4   artist_id            int64  
 5   artist_name          object 
 6   artist_profile       object 
 7   artist_realname      object 
 8   groups_name          object 
 9   groups_name_id       float64
 10  members_name         object 
 11  members_name_id      float64
 12  namevariations_name  object 
 13  urls_url             object 
dtypes: float64(3), int64(2), object(9)
memory usage: 982.1+ MB


#### Dropping columns that are not important for sure in the context of this project

In [5]:
columns_to_drop = ['artist_profile', 'urls_url']
df_discogs_artists.drop(columns=columns_to_drop, inplace=True)

#### Backing up the dataframe for further use

In [7]:
project_pack.dictionary['df_discogs_artists'].backup()

## Discogs Database All Release Data
This dataset encompasses over 13 million recordings, providing a broad view of music distribution and physical sales, particularly vinyl records.

https://www.kaggle.com/datasets/sohrabdaemi/discogs-database-all-release-data

In [24]:
df_discogs_releases.sample(20)

Unnamed: 0,release_id,country,year,genre,format
16893939,15298365,Spain,2020.0,Stage & Screen,CD
16621910,15050527,Spain,2014.0,Rock,Blu-ray
14043833,12738261,Germany,,Pop,CD
16563617,14997486,UK,2019.0,Electronic,File
8985756,8002855,Germany,2014.0,Rock,Box Set
15731071,14250460,Italy,2009.0,Electronic,File
4301111,3772374,UK,2009.0,Electronic,File
3393673,3035502,Japan,2001.0,Electronic,CD
3589073,3193827,Europe,2007.0,Rock,CD
728079,758962,UK,,Electronic,Vinyl


In [25]:
df_discogs_releases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17372035 entries, 0 to 17372034
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   release_id  int64  
 1   country     object 
 2   year        float64
 3   genre       object 
 4   format      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 662.7+ MB


#### Backing up the dataframe for further use

In [8]:
project_pack.dictionary['df_discogs_releases'].backup()

## Bandcamp

Source: https://www.kaggle.com/datasets/mathurinache/1000000-bandcamp-sales

This dataset contains 1,000,000 items from Bandcamp's sales feed between 9/9/2020 and 10/2/2020, and is a slice of the whole dataset used in The Chaos Bazaar. It contains the following columns:

- _id: unique identifier combining the sale's URL and UTC timestamp.
- url: the path to the item on Bandcamp. Use this column to join this dataset to the dataset of Bandcamp items.
- artist_name: Name of the artist.
- album_title: Title of the album, if applicable.
- art_url: path to the item's art image.
- item_type: denotes the type of object. a for digital albums, p for physical items, and t for digital tracks.
- slug_type: also denotes the type of object. a for all albums, p for merch, and t for tracks.
- utc_date: the UTC timestamp of the sale datetime.
- country_code: country code of the buyer.
- country: full country code name of the buyer.
- item_price: price of the item in the seller's currency.
- currency: the seller's currency.
- amount_paid: amount paid in the seller's currency.
- amount_paid_fmt: amount paid in the seller's currency, with the currency symbol.
- amount_paid_usd: amount paid converted to US Dollars.
- amount_over_fmt: amount voluntarily paid over the item price in the seller's currency.

Sample code, charts and reports:
https://www.kaggle.com/code/mathurinache/bandcamp-dataset-starter


In [29]:
df_bandcamp_sales.sample(20)

Unnamed: 0,_id,item_type,utc_date,country_code,country,slug_type,item_description,art_id,url,releases,artist_name,album_title,amount_paid_usd,amount_over_fmt,item_slug
216520,1600165208.98146&//rc88.bandcamp.com/album/rc8...,a,1600165000.0,fr,France,a,RC88's Fighting is Magic Tracks,3093735000.0,//rc88.bandcamp.com/album/rc88s-fighting-is-ma...,,Stuart Ferguson,,2.0,,
101496,1599884294.88734&//astrophysicsbrazil.bandcamp...,t,1599884000.0,au,Australia,t,Sweden (Bonus Track),766346200.0,//astrophysicsbrazil.bandcamp.com/track/sweden...,,Astrophysics,Bittersweet Reality,1.0,,
666928,1601092341.56153&//banjoguyollie.bandcamp.com/...,a,1601092000.0,se,Sweden,a,Amiga Classics,1097989000.0,//banjoguyollie.bandcamp.com/album/amiga-classics,,Banjo Guy Ollie,,9.0,,
309318,1600366514.45336&//lamiavox.bandcamp.com/album...,p,1600367000.0,us,United States,a,Limited Edition Double Vinyl,,//lamiavox.bandcamp.com/album/sigillum-diaboli...,,Lamia Vox,"Sigillum Diaboli, re-issue",28.34,,
831011,1601478907.37418&https://listen.20buckspin.com...,a,1601479000.0,gb,United Kingdom,a,Stygian,4077586000.0,https://listen.20buckspin.com/album/stygian,,Atramentus,,7.77,,
752400,1601294906.70841&//fhlostonparadigm.bandcamp.c...,a,1601295000.0,au,Australia,a,Right Where We Are,3535556000.0,//fhlostonparadigm.bandcamp.com/album/right-wh...,,Fhloston Paradigm,,8.0,,
946197,1601633809.60372&//saikelabel.bandcamp.com/mer...,p,1601634000.0,fr,France,p,SAIKE+01 T-shirt VIPER DIVA,,//saikelabel.bandcamp.com/merch/saike-01-t-shi...,,SAIKE,,29.29,,
691294,1601139995.29481&//clppng.bandcamp.com/album/v...,p,1601140000.0,us,United States,a,Loser Edition 2xLP,,//clppng.bandcamp.com/album/visions-of-bodies-...,,clipping.,Visions of Bodies Being Burned,28.0,,
367695,1600461090.92535&//eprom.bandcamp.com/track/fl...,t,1600461000.0,ca,Canada,t,FleshNet,3257936000.0,//eprom.bandcamp.com/track/fleshnet,,EPROM,,1.9,,
196233,1600109537.53417&//biggiantcircles.bandcamp.co...,a,1600110000.0,us,United States,a,Max Effect,1588384000.0,//biggiantcircles.bandcamp.com/album/max-effect,,Big Giant Circles,,1.0,,


In [11]:
df_bandcamp_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   _id                    1000000 non-null  object 
 1   art_url                1000000 non-null  object 
 2   item_type              1000000 non-null  object 
 3   utc_date               1000000 non-null  float64
 4   country_code           1000000 non-null  object 
 5   track_album_slug_text  2237 non-null     object 
 6   country                1000000 non-null  object 
 7   slug_type              988416 non-null   object 
 8   amount_paid_fmt        1000000 non-null  object 
 9   item_price             1000000 non-null  float64
 10  item_description       999977 non-null   object 
 11  art_id                 764520 non-null   float64
 12  url                    1000000 non-null  object 
 13  amount_paid            1000000 non-null  float64
 14  releases           

#### Dropping columns that are not important for sure in the context of this project
- artworks
- keep only 'amount_paid_usd' for the purchase value, as it is unified
- currency is dropped because because there is a column country and all payments are unified to usd

In [10]:
columns_to_drop = ['track_album_slug_text', 'package_image_id', 'art_url', 'amount_paid_fmt', 'amount_paid', 'addl_count', 'item_price', 'currency']
df_bandcamp_sales.drop(columns=columns_to_drop, inplace=True)

#### Backing up the dataframe for further use

In [11]:
project_pack.dictionary['df_bandcamp_sales'].backup()

### Backup the dataset dictionary for maintenance purposes

In [12]:
project_pack.backup_pack()