In [1]:
import pandas as pd
from src import datasets

In [2]:
# Setup: build the dataframes from predownloaded CSV files (excluded from the git repo).
# Every file can be downloaded from the web pages linked in the sections below.

datasets_files = [
    '1000000-bandcamp-sales.csv',
    'discogs_20250101_artists.csv',
    'release_data.csv',
]
# Give each path a name so the Dataset constructors below do not rely on list positions.
bandcamp_sales_csv, discogs_artists_csv, discogs_releases_csv = datasets_files

# Every dataset created below is added to this pack.
project_pack = datasets.DatasetPack()

# Bandcamp sales feed
ds_bandcamp_sales = datasets.Dataset('df_bandcamp_sales', bandcamp_sales_csv)
project_pack.add_dataset(ds_bandcamp_sales)
df_bandcamp_sales = ds_bandcamp_sales.dataframe

# Discogs artists dump
ds_discogs_artists = datasets.Dataset('df_discogs_artists', discogs_artists_csv)
project_pack.add_dataset(ds_discogs_artists)
df_discogs_artists = ds_discogs_artists.dataframe

# Discogs release data
ds_discogs_releases = datasets.Dataset('df_discogs_releases', discogs_releases_csv)
project_pack.add_dataset(ds_discogs_releases)
df_discogs_releases = ds_discogs_releases.dataframe

# Datasets raw source overview

## Discogs Datasets (January 2025)

Source: https://www.kaggle.com/datasets/ofurkancoban/discogs-datasets-january-2025

This file contains detailed information about artists from the Discogs Data Dump as of January 1, 2025. It provides structured data about individual artists, including their names, unique IDs, profiles, and related information, all in CSV format.

Columns:
- Artist ID: Unique identifier for each artist.
- Artist Name: The name of the artist.
- Profile: A short description or biography of the artist.
- Real Name: The real name of the artist (if available).
- Aliases: Alternative names or pseudonyms used by the artist.
- Groups: Bands or groups the artist is associated with.
- URLs: Links to external websites or profiles related to the artist.
- Data Quality: Indicates the quality or completeness of the data entry.

Use Case:
This file is ideal for applications like music cataloging, trend analysis, artist mapping, or building recommendation systems based on artist relationships and metadata.

In [3]:
# Preview the first five rows of the raw artists dataframe.
df_discogs_artists.head(n=5)

Unnamed: 0.1,Unnamed: 0,aliases_name,aliases_name_id,artist_data_quality,artist_id,artist_name,artist_profile,artist_realname,groups_name,groups_name_id,members_name,members_name_id,namevariations_name,urls_url
0,0,The Pinguin Man,439150.0,Needs Vote,1,The Persuader,"Electronic artist working out of Stockholm, ac...",Jesper Dahlbäck,,,,,The Presuader,https://www.last.fm/music/Jesper+Dahlb%C3%A4ck
1,1,Alexi Delano & Cari Lekebusch,1779857.0,Correct,2,Mr. James Barth & A.D.,,,,,Cari Lekebusch,27.0,Mr. James Barth & A. D.,
2,2,J. Dawg,870371.0,Correct,3,Josh Wink,"Electronic music DJ, label owner, producer, an...",Joshua Winkelman,The Force (23),1642275.0,,,佐瑟溫克,http://www.youtube.com/user/JoshWinkVEVO
3,3,Cryptik,3558838.0,Needs Vote,4,Johannes Heil,"Electronic music producer, musician and live p...",Johannes Heil,Question Authority (2),239403.0,,,Johannes Hell,
4,4,HLX,221112.0,Needs Vote,5,Heiko Laux,German DJ and producer based in Berlin. He is ...,Heiko Laux,4K (3),3426169.0,,,Laux,http://www.myspace.com/heikolaux


In [4]:
# Column dtypes and memory footprint of the raw artists dataframe.
df_discogs_artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9194907 entries, 0 to 9194906
Data columns (total 14 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Unnamed: 0           int64  
 1   aliases_name         object 
 2   aliases_name_id      float64
 3   artist_data_quality  object 
 4   artist_id            int64  
 5   artist_name          object 
 6   artist_profile       object 
 7   artist_realname      object 
 8   groups_name          object 
 9   groups_name_id       float64
 10  members_name         object 
 11  members_name_id      float64
 12  namevariations_name  object 
 13  urls_url             object 
dtypes: float64(3), int64(2), object(9)
memory usage: 982.1+ MB


#### Dropping columns that are clearly not needed for this project

In [5]:
# Free-text profile and external URL columns are dropped from the artists dataframe.
# NOTE(review): 'Unnamed: 0' (listed by .info() above) looks like a leftover CSV index
# column - consider dropping it here as well; confirm nothing downstream relies on it.
columns_to_drop = ['artist_profile', 'urls_url']
# The drop stays in place, presumably so the Dataset that owns this frame sees the change
# before backup() - confirm against src.datasets.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises KeyError
# for columns that were already removed by the first run.
df_discogs_artists.drop(columns=columns_to_drop, inplace=True, errors='ignore')

#### Backing up the dataframe for further use

In [6]:
# Look up the artists dataset in the pack by name, then back it up for later use.
discogs_artists_entry = project_pack.dictionary['df_discogs_artists']
discogs_artists_entry.backup()

## Discogs Database All Release Data
This dataset encompasses over 13 million recordings, providing a broad view of music distribution and physical sales, particularly vinyl records.

Source: https://www.kaggle.com/datasets/sohrabdaemi/discogs-database-all-release-data

In [7]:
# Preview the first five rows of the raw releases dataframe.
df_discogs_releases.head(n=5)

Unnamed: 0,release_id,country,year,genre,format
0,1,Sweden,1999.0,Electronic,Vinyl
1,2,Sweden,1998.0,Electronic,Vinyl
2,3,US,1999.0,Electronic,CD
3,4,US,1999.0,Electronic,CD
4,5,Germany,1995.0,Electronic,CD


In [8]:
# Column dtypes and memory footprint of the raw releases dataframe.
df_discogs_releases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17372035 entries, 0 to 17372034
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   release_id  int64  
 1   country     object 
 2   year        float64
 3   genre       object 
 4   format      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 662.7+ MB


#### Backing up the dataframe for further use

In [9]:
# Look up the releases dataset in the pack by name, then back it up for later use.
discogs_releases_entry = project_pack.dictionary['df_discogs_releases']
discogs_releases_entry.backup()

## Bandcamp

Source: https://www.kaggle.com/datasets/mathurinache/1000000-bandcamp-sales

This dataset contains 1,000,000 items from Bandcamp's sales feed between 9/9/2020 and 10/2/2020, and is a slice of the whole dataset used in The Chaos Bazaar. It contains the following columns:

- _id: unique identifier combining the sale's URL and UTC timestamp.
- url: the path to the item on Bandcamp. Use this column to join this dataset to the dataset of Bandcamp items.
- artist_name: Name of the artist.
- album_title: Title of the album, if applicable.
- art_url: path to the item's art image.
- item_type: denotes the type of object. a for digital albums, p for physical items, and t for digital tracks.
- slug_type: also denotes the type of object. a for all albums, p for merch, and t for tracks.
- utc_date: the UTC timestamp of the sale datetime.
- country_code: country code of the buyer.
- country: full country name of the buyer.
- item_price: price of the item in the seller's currency.
- currency: the seller's currency.
- amount_paid: amount paid in the seller's currency.
- amount_paid_fmt: amount paid in the seller's currency, with the currency symbol.
- amount_paid_usd: amount paid converted to US Dollars.
- amount_over_fmt: amount voluntarily paid over the item price in the seller's currency.

Sample code, charts and reports:
https://www.kaggle.com/code/mathurinache/bandcamp-dataset-starter


In [10]:
# Preview the first five rows of the raw Bandcamp sales dataframe.
df_bandcamp_sales.head(n=5)

Unnamed: 0,_id,art_url,item_type,utc_date,country_code,track_album_slug_text,country,slug_type,amount_paid_fmt,item_price,...,amount_paid,releases,artist_name,currency,album_title,amount_paid_usd,package_image_id,amount_over_fmt,item_slug,addl_count
0,1599688803.5175&//girlbanddublin.bandcamp.com/...,https://f4.bcbits.com/img/a0206405257_7.jpg,a,1599689000.0,gb,,United Kingdom,a,$9.99,9.99,...,9.99,,Girl Band,USD,,9.99,,,,
1,1599688805.27838&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a2984241552_7.jpg,a,1599689000.0,fi,,Finland,a,£1,1.0,...,1.0,,Jirah,GBP,,1.3,,,,
2,1599688805.90646&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a3320494770_7.jpg,a,1599689000.0,fi,,Finland,a,£3,3.0,...,3.0,,D-Ther,GBP,,3.9,,,,
3,1599688806.94234&//alicesitski.bandcamp.com/al...,https://f4.bcbits.com/img/0020476345_37.jpg,p,1599689000.0,gb,,United Kingdom,a,€10.50,10.5,...,10.5,,WHITE NOISE TV,EUR,WHITE NOISE TV,12.39,20476345.0,,,
4,1599688809.07942&//linguaignota.bandcamp.com/t...,https://f4.bcbits.com/img/a3428873396_7.jpg,t,1599689000.0,us,,United States,t,$1,1.0,...,1.0,,LINGUA IGNOTA,USD,,1.0,,,,


In [11]:
# Column dtypes, non-null counts and memory footprint of the raw Bandcamp sales dataframe.
df_bandcamp_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   _id                    1000000 non-null  object 
 1   art_url                1000000 non-null  object 
 2   item_type              1000000 non-null  object 
 3   utc_date               1000000 non-null  float64
 4   country_code           1000000 non-null  object 
 5   track_album_slug_text  2237 non-null     object 
 6   country                1000000 non-null  object 
 7   slug_type              988416 non-null   object 
 8   amount_paid_fmt        1000000 non-null  object 
 9   item_price             1000000 non-null  float64
 10  item_description       999977 non-null   object 
 11  art_id                 764520 non-null   float64
 12  url                    1000000 non-null  object 
 13  amount_paid            1000000 non-null  float64
 14  releases           

#### Dropping columns that are clearly not needed for this project
- artwork-related columns
- keep only `amount_paid_usd` as the purchase value, since it is expressed in a single currency (USD) for every sale

In [12]:
# Columns removed from the Bandcamp sales dataframe: artwork/image fields, the
# amount_paid variants other than amount_paid_usd, plus track_album_slug_text and addl_count.
# NOTE(review): 'art_id' (listed by .info() above) also looks artwork-related but is not
# dropped here - confirm it is kept on purpose.
columns_to_drop = [
    'track_album_slug_text',
    'package_image_id',
    'art_url',
    'amount_paid_fmt',
    'amount_paid',
    'addl_count',
]
# The drop stays in place, presumably so the Dataset that owns this frame sees the change
# before backup() - confirm against src.datasets.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises KeyError
# for columns that were already removed by the first run.
df_bandcamp_sales.drop(columns=columns_to_drop, inplace=True, errors='ignore')

#### Backing up the dataframe for further use

In [13]:
# Look up the Bandcamp sales dataset in the pack by name, then back it up for later use.
bandcamp_sales_entry = project_pack.dictionary['df_bandcamp_sales']
bandcamp_sales_entry.backup()

### Back up the dataset dictionary for maintenance purposes

In [14]:
# Back up the pack itself (the dataset dictionary, per the heading above).
# NOTE(review): what exactly gets written is defined in src.datasets - confirm there.
project_pack.backup_pack()