In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
df = pd.read_csv('./data/data_w_genres.csv')
print(df.shape)
df.head()

(32539, 16)


Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Cats"" 1981 Original London Cast",0.5985,0.4701,267072.0,0.376203,0.010261,0.28305,-14.4343,0.20915,114.1288,0.35832,38.2,5,1,10,['show tunes']
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,31.538462,5,1,26,[]
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.571429,0,1,7,[]
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.407407,0,1,27,[]
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,42.0,5,1,7,[]


### Numerical variables

In [3]:
df_num_vars = df.select_dtypes(include=np.number).columns
df_num_vars

Index(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity', 'key', 'mode', 'count'],
      dtype='object')

### Non-numerical variables

In [4]:
df_nonnum_vars = df.select_dtypes(exclude=np.number).columns
df_nonnum_vars

Index(['artists', 'genres'], dtype='object')

In [5]:
# let's tokenize the genres into lists
def tokenize_str(text):
    regex_rule = re.compile("[ '\[\]]")
    text = re.sub(regex_rule, "", text)
    return text.split(",")

df['genres'] = df['genres'].apply(tokenize_str)

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
df_genres = pd.DataFrame(mlb.fit_transform(df['genres']), columns=mlb.classes_, index=df.index)
df_genres.drop('',axis=1, inplace=True)
df_genres.columns

Index(['"australianchildrensmusic"', '"blacknroll"', '"britishchildrensmusic"',
       '"canadianchildrensmusic"', '"canzonedautore"', '"childrenschoir"',
       '"childrensfolk"', '"childrensmusic"', '"childrensstory"',
       '"deathnroll"',
       ...
       'yugoslavnewwave', 'yugoslavrock', 'zambianpop', 'zen', 'zhongguofeng',
       'zimurbangroove', 'zolo', 'zouk', 'zurichindie', 'zydeco'],
      dtype='object', length=3231)

In [7]:
# merge with the original dataframe
df = df.drop('genres', axis=1).merge(df_genres, left_index=True, right_index=True)

In [8]:
df = df.set_index('artists', verify_integrity=True)
df.head()

Unnamed: 0_level_0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,...,yugoslavnewwave,yugoslavrock,zambianpop,zen,zhongguofeng,zimurbangroove,zolo,zouk,zurichindie,zydeco
artists,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Cats"" 1981 Original London Cast",0.5985,0.4701,267072.0,0.376203,0.010261,0.28305,-14.4343,0.20915,114.1288,0.35832,...,0,0,0,0,0,0,0,0,0,0
"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,...,0,0,0,0,0,0,0,0,0,0
"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,...,0,0,0,0,0,0,0,0,0,0
"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,...,0,0,0,0,0,0,0,0,0,0
"""Joseph And The Amazing Technicolor Dreamcoat"" 1991 London Cast",0.510714,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# all the columns are numeric now
df.select_dtypes(exclude=np.number).columns

Index([], dtype='object')

In [10]:
df.shape

(32539, 3245)

### Numerical variables

In [11]:
df_num_vars

Index(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity', 'key', 'mode', 'count'],
      dtype='object')

### Export the dataframe

In [12]:
#df.to_csv('./data/data_genre_encoded.csv')