In [1]:
import pickle
import pandas as pd

# Create mp3 list for genre classes

### Imports

### Load data

In [2]:
# Read the trimmed track listing into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='track_data_trimmed.csv')

In [3]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 664466 entries, 0 to 664465
Data columns (total 9 columns):
id                 664466 non-null int64
release_artist     664466 non-null object
release_cat_num    663439 non-null object
release_date       664466 non-null object
release_genre      664466 non-null object
release_label      664466 non-null object
release_title      664465 non-null object
track_name         664466 non-null object
track_url          664466 non-null object
dtypes: int64(1), object(8)
memory usage: 45.6+ MB


### Genre counts & map to parent genres

In [4]:
# Number of tracks per raw release genre, most common genre first.
track_genres = (
    df.groupby('release_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
track_genres

Unnamed: 0,release_genre,id
0,Minimal/Tech House,127504
1,Funky/Club House,72220
2,Techno,66415
3,Deep House,65465
4,Progressive House,57294
5,Uplifting Trance,31910
6,Electro House,30999
7,Drum And Bass,29175
8,Dirty Dubstep/Trap/Grime,16636
9,Breakbeat,15450


Lots of genres, and lots with too few songs.

I'm going to map some of these smaller genres to a bigger parent genre and then remove ones that I'm not going to use.

In [5]:
# Lookup table from a raw release genre to the parent genre it is folded into.
# 'Not Needed' is a placeholder label for genres that are filtered out later.
# Defined once here instead of inside the function, so the dictionary is not
# rebuilt for every row when the function is applied to a whole column.
PARENT_GENRES = {
    'Minimal/Tech House': 'Minimal House',
    'Progressive House': 'Progressive House',
    'Funky/Club House': 'Funky House',
    'Deep House': 'Deep House',
    'Techno': 'Techno',
    'Uplifting Trance': 'Trance',
    'Electro House': 'Electro House',
    'Drum And Bass': 'Drum And Bass',
    'Dirty Dubstep/Trap/Grime': 'Dubstep/Grime',
    'Breakbeat': 'Breakbeat',
    'Disco/Nu-Disco': 'Disco',
    'Balearic/Downtempo': 'Downtempo',
    'Euro Dance/Pop Dance': 'Euro Dance',
    'Hip Hop/R&B': 'Hip Hop/R&B',
    'Hardstyle': 'Not Needed',
    'Psy/Goa Trance': 'Not Needed',
    'Dancehall/Ragga': 'Dancehall/Ragga',
    'Hard Trance': 'Hard Trance',
    'Indie': 'Rock/Indie',
    'UK Hardcore': 'Not Needed',
    'Hard House': 'Hard House',
    'Experimental/Electronic': 'Not Needed',
    'Pop Trance': 'Not Needed',
    'Bass': 'Not Needed',
    'Broken Beat/Nu Jazz': 'Not Needed',
    'Rock': 'Rock/Indie',
    'Gabba': 'Not Needed',
    'Pop': 'Pop',
    'UK Garage': 'UK Garage',
    'Electro': 'Not Needed',
    'Deep Dubstep': 'Dubstep/Grime',
    'Roots/Lovers Rock': 'Not Needed',
    'Hard Techno': 'Not Needed',
    'Ambient/Drone': 'Not Needed',
    'Funk': 'Not Needed',
    'Scouse House': 'Not Needed',
    'Dub': 'Not Needed',
    'Coldwave/Synth': 'Not Needed',
    'Jazz': 'Not Needed',
    'DJ Tools': 'Not Needed',
    'Industrial/Noise': 'Not Needed',
    'Footwork/Juke': 'Not Needed',
    'Classics/Ska': 'Not Needed',
    'International': 'Not Needed',
    'Soul': 'Not Needed',
    'Soundtracks': 'Not Needed',
    'Leftfield': 'Not Needed',
    '50s/60s': 'Not Needed',
    'Rock (All)': 'Rock/Indie',
    'Unclassified': 'Not Needed',
}


def parent_genre(s):
    """Return the parent genre that the raw release genre ``s`` maps to.

    Parameters
    ----------
    s : str
        A raw genre label, i.e. one of the keys of ``PARENT_GENRES``.

    Returns
    -------
    str
        The parent genre, or the placeholder ``'Not Needed'`` for genres
        that are mapped to it in ``PARENT_GENRES``.

    Raises
    ------
    KeyError
        If ``s`` has no entry in ``PARENT_GENRES``. The lookup is
        deliberately strict so that an unexpected genre in the data is
        noticed instead of being silently dropped or mislabelled.
    """
    try:
        return PARENT_GENRES[s]
    except KeyError:
        # Same exception type as a plain dict lookup, but the message names
        # the genre that is missing from the table.
        raise KeyError('No parent genre defined for release genre {!r}'.format(s))

In [6]:
# Translate every raw release genre into its parent genre, row by row.
df['parent_genre'] = df['release_genre'].map(parent_genre)

In [7]:
# Show the first five rows, which now include the parent_genre column.
df.head(n=5)

Unnamed: 0,id,release_artist,release_cat_num,release_date,release_genre,release_label,release_title,track_name,track_url,parent_genre
0,0,DJ SNAKE feat G4SHI,006025 57534641,22 Mar 17,Electro House,Polydor,4 Life (Explicit Habstrakt Remix),4 Life (Habstrakt remix) - (3:30),https://www.junodownload.com/MP3/SF3376865-02-...,Electro House
1,1,CLEAR VIEW feat JESSICA,SB 215-0,10 Sep 08,Progressive House,Songbird Holland,Tell Me,Tell Me - (6:43),https://www.junodownload.com/MP3/SF1354749-02-...,Progressive House
2,2,CLEAR VIEW feat JESSICA,SB 215-0,10 Sep 08,Progressive House,Songbird Holland,Tell Me,Tell Me (Max Graham remix) - (8:49),https://www.junodownload.com/MP3/SF1354749-02-...,Progressive House
3,3,_LINDEN,AED 0027DL,05 Feb 16,Rock,AED,Bones/Broken Glass,Bones - (3:17),https://www.junodownload.com/MP3/SF3007568-02-...,Rock/Indie
4,4,_LINDEN,AED 0027DL,05 Feb 16,Rock,AED,Bones/Broken Glass,Broken Glass - (3:00),https://www.junodownload.com/MP3/SF3007568-02-...,Rock/Indie


Look at the track counts per genre, grouped by the newly created parent genre.

In [8]:
# Number of tracks per parent genre, largest group first.
(
    df.groupby('parent_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,parent_genre,id
0,Minimal House,127504
1,Not Needed,72349
2,Funky House,72220
3,Techno,66415
4,Deep House,65465
5,Progressive House,57294
6,Trance,31910
7,Electro House,30999
8,Drum And Bass,29175
9,Dubstep/Grime,19825


Remove the tracks that were mapped to the 'Not Needed' placeholder genre.

In [9]:
# Select the rows labelled with the 'Not Needed' placeholder, then drop
# them from the working frame by their index labels.
not_needed = df.loc[df['parent_genre'].eq('Not Needed')]
df = df.drop(labels=not_needed.index, axis=0)

# Remaining track counts per parent genre, largest group first.
(
    df.groupby('parent_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,parent_genre,id
0,Minimal House,127504
1,Funky House,72220
2,Techno,66415
3,Deep House,65465
4,Progressive House,57294
5,Trance,31910
6,Electro House,30999
7,Drum And Bass,29175
8,Dubstep/Grime,19825
9,Breakbeat,15450


### Sample 1,000 songs for each genre

I want to take a sample of 1000 songs for each genre.

In [10]:
# Leave the 'Pop' and 'UK Garage' parent genres out of the sampling pool.
df_trimmed = df[~df['parent_genre'].isin(['Pop', 'UK Garage'])]

# Track counts per remaining parent genre, largest group first.
(
    df_trimmed.groupby('parent_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,parent_genre,id
0,Minimal House,127504
1,Funky House,72220
2,Techno,66415
3,Deep House,65465
4,Progressive House,57294
5,Trance,31910
6,Electro House,30999
7,Drum And Bass,29175
8,Dubstep/Grime,19825
9,Breakbeat,15450


In [11]:
SAMPLE_SIZE = 1000  # number of tracks drawn from every parent genre
RANDOM_SEED = 42    # fixed seed so a fresh kernel reproduces the same sample

grouped = df_trimmed.groupby('parent_genre', as_index=False)

# Draw the sample genre by genre and stack the pieces into one frame.
# Iterating over the groups yields complete sub-frames (the parent_genre
# column included), so no helper index columns have to be dropped afterwards.
# DataFrame.sample raises ValueError if a genre has fewer than SAMPLE_SIZE
# tracks, which keeps the "equal sample per genre" guarantee strict.
equal_sample_df = pd.concat(
    [
        genre_tracks.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
        for _genre, genre_tracks in grouped
    ],
    ignore_index=True,
)

In [12]:
# Check the sample: number of tracks per parent genre, largest group first.
(
    equal_sample_df.groupby('parent_genre')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,parent_genre,id
0,Trance,1000
1,Techno,1000
2,Dancehall/Ragga,1000
3,Deep House,1000
4,Disco,1000
5,Downtempo,1000
6,Drum And Bass,1000
7,Dubstep/Grime,1000
8,Electro House,1000
9,Euro Dance,1000


### Save data

In [13]:
# Persist the per-genre sample to disk as a pickle file.
equal_sample_df.to_pickle(path='final_data.pkl')