In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Conv1D, MaxPooling1D, Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.utils import np_utils
import matplotlib.pyplot as plt
import keras

# Data analysis of Music4All dataset

### listening_history.csv:
    The listening history for each user in the database. There are three attributes in this file: user, song and timestamp.

### id_tags.csv:
    This file contains the id for each song and the tags associated with it. Multiple tags are separated by commas.

### id_genres.csv:
    This file contains the id for each song and the genres associated with it. Multiple genres are separated by commas.

### id_information.csv:
    This file contains the id of the song along with basic information about it, namely artist, song name and album name.

### id_metadata.csv:
    This file contains the id of the song and its attributes obtained through the official Spotify API, including its Spotify id.

### id_lang.csv:
    This file contains the id of the song and the language of the lyrics.

# listening_history.csv
5 109 592 records:
- 14 127 unique users
- 99 596 unique songs
- 122 340 unique timestamps

In [2]:
# Load the per-user play log; the file is tab-separated despite its .csv extension.
listening_history = pd.read_csv(
    'data/listening_history.csv',
    sep='\t',
)

In [3]:
# Structural overview of the play log: row count, column names, dtypes and memory footprint.
listening_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109592 entries, 0 to 5109591
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   user       object
 1   song       object
 2   timestamp  object
dtypes: object(3)
memory usage: 116.9+ MB


In [4]:
# Per-column dtypes (the same information as the Dtype column printed by info()).
listening_history.dtypes

user         object
song         object
timestamp    object
dtype: object

In [5]:
# Summary statistics; because every column is object-typed, describe() reports
# count / unique / top / freq rather than numeric moments.
listening_history.describe()

Unnamed: 0,user,song,timestamp
count,5109592,5109592,5109592
unique,14127,99596,122340
top,user_N9OKtRH0,32m5suoC94ytD8Ed,2019-02-08 20:35
freq,500,82871,175


In [6]:
# Peek at the first five rows of the play log.
listening_history.head(n=5)

Unnamed: 0,user,song,timestamp
0,user_007XIjOr,DaTQ53TUmfP93FSr,2019-02-20 12:28
1,user_007XIjOr,dGeyvi5WCOjDU7da,2019-02-20 12:35
2,user_007XIjOr,qUm54NYOjeFhmKYx,2019-02-20 12:48
3,user_007XIjOr,FtnuMT1DlevSR2n5,2019-02-20 12:52
4,user_007XIjOr,LHETTZcSZLeaVOGh,2019-02-20 13:09


In [7]:
# Peek at the last five rows of the play log.
listening_history.tail(n=5)

Unnamed: 0,user,song,timestamp
5109587,user_zzWscYTy,BBiswLufo26YQCT7,2019-01-10 15:57
5109588,user_zzWscYTy,5ZHgff3sjETIiedr,2019-01-10 16:21
5109589,user_zzWscYTy,m4O1iLh6fC43xjRy,2019-01-10 16:48
5109590,user_zzWscYTy,mvUaP8k67qOFfA65,2019-01-10 21:13
5109591,user_zzWscYTy,BBiswLufo26YQCT7,2019-01-10 21:16


In [8]:
# Disabled exploration (not executed): per-user summary statistics of the play log.
# NOTE(review): this cell contains only commented-out code — re-enable it or delete the cell.
# listening_history.groupby('user').describe()

In [9]:
# Disabled exploration (not executed): per-song summary statistics of the play log.
# NOTE(review): this cell contains only commented-out code — re-enable it or delete the cell.
# listening_history.groupby('song').describe()

In [10]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
listening_history.isna().sum()

user         0
song         0
timestamp    0
dtype: int64

## Date range
Date *from* **2013-12-30 22:42** *to* **2019-03-26 11:57**

In [11]:
# Order the play log from oldest to newest. The timestamp column is stored as
# strings; assuming every row follows the 'YYYY-MM-DD HH:MM' layout seen in
# head(), lexicographic order coincides with chronological order.
listening_history_by_timestamp_asc = listening_history.sort_values('timestamp')

In [12]:
# First row of the time-ordered log, i.e. the earliest recorded play.
listening_history_by_timestamp_asc.head(n=1)

Unnamed: 0,user,song,timestamp
1331567,user_Fq3E54HF,tzpXiIYf6EeS89Ws,2013-12-30 22:42


In [13]:
# Last row of the time-ordered log, i.e. the most recent recorded play.
listening_history_by_timestamp_asc.tail(n=1)

Unnamed: 0,user,song,timestamp
4636367,user_u4aT6q6m,ZCXukVDuFnyzOeuP,2019-03-26 11:57


## The most played songs by users

In [14]:
# Disabled alternative to the value_counts() cell below (not executed): ranks songs
# by number of plays via groupby/count, sorted in descending order.
# listening_history.groupby('song').count().sort_values(by='user', ascending=False)

In [15]:
# Number of plays per song, most-played first.
listening_history['song'].value_counts()

32m5suoC94ytD8Ed    82871
YwURIu6eZDCJyuEf    33737
vkOg9ZAiUgUT87k6    30719
7Gsb7yKtAJNCvWez    27880
KKmk2QJLVj2Aqsjg    25640
                    ...  
PPc87NauCUbIkPWN        1
D2WHFtLqbsJF22hW        1
8ijHH2lE89Z0sWgV        1
l5x0RHb9W4nwjTk2        1
Nv44mPsz174VuZ7Q        1
Name: song, Length: 99596, dtype: int64

### The most played song information

In [16]:
# Load the five per-song lookup tables (all tab-separated) in one pass; the
# target names on the left line up with the paths below, in order.
id_information, id_tags, id_genres, id_metadata, id_lang = [
    pd.read_csv(table_path, sep='\t')
    for table_path in (
        'data/id_information.csv',
        'data/id_tags.csv',
        'data/id_genres.csv',
        'data/id_metadata.csv',
        'data/id_lang.csv',
    )
]

In [17]:
# Artist / title / album for the most-played song id (top of value_counts() above).
id_information.loc[id_information['id'].eq('32m5suoC94ytD8Ed')]

Unnamed: 0,id,artist,song,album_name
5460,32m5suoC94ytD8Ed,Ariana Grande,7 rings,"thank u, next"


In [18]:
# Tags attached to the most-played song id.
id_tags.loc[id_tags['id'].eq('32m5suoC94ytD8Ed')]

Unnamed: 0,id,tags
5460,32m5suoC94ytD8Ed,"trap,rap,pop,rnb"


In [19]:
# Genres attached to the most-played song id.
id_genres.loc[id_genres['id'].eq('32m5suoC94ytD8Ed')]

Unnamed: 0,id,genres
5460,32m5suoC94ytD8Ed,"rap,pop"


In [20]:
# Spotify-derived attributes for the most-played song id.
id_metadata.loc[id_metadata['id'].eq('32m5suoC94ytD8Ed')]

Unnamed: 0,id,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
5460,32m5suoC94ytD8Ed,6ocbgoVGwYJhOv1GgI9NsF,88.0,2019,0.778,0.317,1.0,0.0,0.327,140.048,178627


In [21]:
# Lyrics language recorded for the most-played song id.
id_lang.loc[id_lang['id'].eq('32m5suoC94ytD8Ed')]

Unnamed: 0,id,lang
5460,32m5suoC94ytD8Ed,en


## Users with the most songs played

In [22]:
# Number of plays per user, heaviest listeners first.
listening_history['user'].value_counts()

user_N9OKtRH0    500
user_JuUy0Pi6    500
user_Q8VSlThn    500
user_NAE2bHHI    500
user_Vrd41io2    500
                ... 
user_8EtNkBPW     13
user_KSotRo4O     11
user_wDUNVLHd      8
user_M2G5DnkG      4
user_4e4K55C0      3
Name: user, Length: 14127, dtype: int64

In [26]:
# Per-user play counts as a frame (one count column per remaining attribute),
# sorted so the heaviest listeners come first. The statement had been commented
# out while its output table was still saved under the cell, so a fresh
# Restart & Run All could not reproduce that table; it is restored here.
listening_history.groupby('user').count().sort_values(by='song', ascending=False)

Unnamed: 0_level_0,song,timestamp
user,Unnamed: 1_level_1,Unnamed: 2_level_1
user_N9OKtRH0,500,500
user_JuUy0Pi6,500,500
user_Q8VSlThn,500,500
user_NAE2bHHI,500,500
user_Vrd41io2,500,500
...,...,...
user_8EtNkBPW,13,13
user_KSotRo4O,11,11
user_wDUNVLHd,8,8
user_M2G5DnkG,4,4


In [32]:
# Play counts per song for user_N9OKtRH0, one of the users with 500 recorded plays.
(
    listening_history
    .loc[listening_history['user'].eq('user_N9OKtRH0')]
    .groupby('song')
    .count()
)

Unnamed: 0_level_0,user,timestamp
song,Unnamed: 1_level_1,Unnamed: 2_level_1
28x5L80PQrU2SzWC,436,436
5RxsEogLBdXW2U8v,5,5
5stZqMOVZDKmzPt5,6,6
6PTU0EEwfcdwK9r6,5,5
DNxXlUUNqX4saEBc,5,5
EQeRaQuzupJSeJYX,5,5
GzacORlsoC9wIvKs,5,5
KKmk2QJLVj2Aqsjg,1,1
KnMItrBYhHFixzJT,5,5
TXi2DCIQzKFHA1uL,5,5


In [30]:
# Catalogue entry for the song with 436 plays in user_N9OKtRH0's per-song counts.
id_information.loc[id_information['id'].eq('28x5L80PQrU2SzWC')]

Unnamed: 0,id,artist,song,album_name
3853,28x5L80PQrU2SzWC,Spice Girls,Say You'll Be There,Spice
