In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn

# Display settings: None lifts the limit, so rows, columns and long cell values
# (e.g. genre lists) are shown without truncation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Locations of the JSON Lines inputs, relative to the notebook's working directory.
artists_data_path = "../data/artists.jsonl"
sessions_data_path = "../data/sessions.jsonl"
track_storage_data_path = "../data/track_storage.jsonl"
tracks_data_path = "../data/tracks.jsonl"
users_data_path = "../data/users.jsonl"

# Read every file (one JSON record per line) into its own DataFrame,
# in the same order as the paths are declared above.
(
    artists_data_frame,
    sessions_data_frame,
    track_storage_data_frame,
    tracks_data_frame,
    users_data_frame,
) = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        artists_data_path,
        sessions_data_path,
        track_storage_data_path,
        tracks_data_path,
        users_data_path,
    )
)

Pominęliśmy analizę danych o przechowywaniu utworów, ponieważ nie wnoszą one żadnych informacji do analizy w naszym temacie projektu.

# Analiza artystów

In [28]:
# Preview the first five artist records.
artists_data_frame.head(5)

Unnamed: 0,id,name,genres
0,7b6Ui7JVaBDEfZB9k6nHL0,The Local Train,"[desi pop, hindi indie, indian indie, indian rock, new delhi indie, sufi]"
1,5wJ1H6ud777odtZl5gG507,Vishal Mishra,"[desi pop, modern bollywood]"
2,6AETRCQep85mUtIlX7BaFV,Color Humano,"[argentine rock, rock nacional]"
3,3PN8odvj48fnALdAsCKG3g,Crucis,"[italian progressive rock, rock nacional]"
4,5SAeZRhgmJhgq196BR1Mna,Dúo Salteño,"[folclore salteno, folklore argentino]"


Atrybuty:
* id - identyfikator artysty
* name - nazwa artysty
* genres - gatunki muzyczne, które tworzy

In [29]:
# Print column dtypes, non-null counts and memory usage of the artists table.
artists_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10657 entries, 0 to 10656
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10657 non-null  object
 1   name    10657 non-null  object
 2   genres  10657 non-null  object
dtypes: object(3)
memory usage: 249.9+ KB


### Analiza id

In [31]:
# Compare the number of distinct artist ids with the number of non-null ids.
print(f"unique ids: {artists_data_frame['id'].nunique()}/{artists_data_frame['id'].count()}")

unique ids: 10657/10657


Wszystkie ID są unikatowe

### Analiza name


In [32]:
# Compare the number of distinct artist names with the number of non-null names.
print(f"unique names: {artists_data_frame['name'].nunique()}/{artists_data_frame['name'].count()}")

# List every artist whose name appears more than once
# (keep=False flags all occurrences, not only the repeats).
artists_data_frame.loc[artists_data_frame['name'].duplicated(keep=False)]

unique names: 10643/10657


Unnamed: 0,id,name,genres
358,2xgbz6gqoIF4S6JZoRiH2y,Dalton,[classic danish pop]
390,38YroIcEOM2zsIEGdtmZYm,Bamse,"[barnmusik, bornesange]"
460,0Yk0PoyjQiEyIKgnaJFR0n,Traffic,"[estonian pop, estonian rock]"
1093,3rUSk3kcpl4Nseu1DqFNpJ,KK,"[classic icelandic pop, icelandic pop]"
1384,3lPK4zqijDKAvZkdlmZRfN,Mango,[lithuanian pop]
2076,4ppDL5W65K1HG6EF7yaDLd,Little River Band,"[album rock, classic rock, country rock, folk rock, mellow gold, soft rock]"
2352,53Thxvlr6imD5y3lxouOu4,Phoenix,[romanian rock]
2546,363VbwpX9anvrThJ2qDwBy,TNT,"[glam metal, hard rock]"
2721,1CqOLQmjzVWXQTiIN5Wucs,TNT,"[euphoric hardstyle, hardstyle]"
2757,7c5278WShlxTk3vS9XoTf8,Wings,[rock kapak]


Istnieje 28 artystów, których nazwa jest zduplikowana.
TODO: co z tym zrobić?


### Analiza genres
TODO


# Analiza sesji

In [34]:
# Preview the first five rows of the session event log.
sessions_data_frame.head(5)

Unnamed: 0,timestamp,user_id,track_id,event_type,session_id
0,2023-01-03 05:09:55.000,101,2PmGtDUyJIpYBEtI1hQIVp,PLAY,124
1,2023-01-03 05:14:02.707,101,7hXy7Oc0XfODlcR8ESR9up,PLAY,124
2,2023-01-03 05:15:59.184,101,7hXy7Oc0XfODlcR8ESR9up,SKIP,124
3,2023-01-03 05:15:59.184,101,5atzraDZf6C8dEIFBHw3E2,PLAY,124
4,2023-01-03 05:16:47.572,101,5atzraDZf6C8dEIFBHw3E2,SKIP,124


Atrybuty:
* timestamp - czas wystąpienia zdarzenia
* user_id - identyfikator użytkownika
* track_id - identyfikator utworu
* event_type - typ zdarzenia (PLAY, SKIP, LIKE, ADVERTISEMENT, BUY_PREMIUM)
* session_id - identyfikator sesji


In [35]:
# Print column dtypes, non-null counts and memory usage of the session event log.
sessions_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19348 entries, 0 to 19347
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   timestamp   19348 non-null  datetime64[ns]
 1   user_id     19348 non-null  int64         
 2   track_id    19348 non-null  object        
 3   event_type  19348 non-null  object        
 4   session_id  19348 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 755.9+ KB


### Analiza timestamp

In [36]:
# Summarise the event timestamps numerically (count, mean, min, quartiles, max).
# pandas 1.x needs datetime_is_numeric=True for this; pandas 2.0 removed the
# argument (datetimes are always summarised numerically there) and raises
# TypeError when it is passed, so fall back to the plain call in that case.
try:
    timestamp_summary = sessions_data_frame['timestamp'].describe(datetime_is_numeric=True)
except TypeError:
    timestamp_summary = sessions_data_frame['timestamp'].describe()
timestamp_summary

count                            19348
mean     2023-02-13 09:22:11.073808640
min                2023-01-01 06:00:09
25%      2023-01-23 18:21:12.364000256
50%         2023-02-13 20:04:24.712000
75%      2023-03-07 07:58:05.143000064
max         2023-03-28 15:10:21.062000
Name: timestamp, dtype: object

Zarejestrowane zdarzenia obejmują okres od 2023-01-01 06:00:09 do 2023-03-28 15:10:21.062000.


### Analiza user_id

In [38]:
# Count logged rows (events) per user, then summarise the distribution of those counts.
events_per_user = sessions_data_frame['user_id'].value_counts()
events_per_user.describe()

count      50.000000
mean      386.960000
std       261.935649
min        44.000000
25%       157.750000
50%       339.000000
75%       588.750000
max      1052.000000
Name: user_id, dtype: float64

Średnia liczba zdarzeń na użytkownika wynosi 387, a maksymalna 1052.


### Analiza track_id
TODO


### Analiza event_type

In [40]:
# Frequency of each event type in the session log, most common first.
sessions_data_frame['event_type'].value_counts()

PLAY             10229
SKIP              3761
LIKE              2833
ADVERTISEMENT     2497
BUY_PREMIUM         28
Name: event_type, dtype: int64

### Analiza session_id

In [41]:
# Count logged rows (events) per session, then summarise the distribution of those counts.
events_per_session = sessions_data_frame['session_id'].value_counts()
events_per_session.describe()

count    596.000000
mean      32.463087
std       23.582724
min        1.000000
25%       11.000000
50%       29.000000
75%       50.000000
max       98.000000
Name: session_id, dtype: float64

Średnia liczba zdarzeń na sesję wynosi 32.5


# Analiza danych o utworach


In [42]:
# Preview the first five track records.
tracks_data_frame.head(5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,4LdczeQKU4ojwFJd6QWGnW,Taking Care Of Business - 2005 Remaster,1,177853,0,4AgFg0hHythyklEs4t7XKt,2011-01-01,0.554,0.847,7,-7.237,0.0633,0.00789,2e-06,0.183,0.733,90.655
1,7F20EOraDwU8KvNcyzUKVz,Leef,42,217903,0,5l2Xy4aUoJDRSpsYHyOumD,2015-06-26,0.673,0.727,9,-6.587,0.0401,0.282,0.0,0.178,0.782,121.016
2,5EMGRFIidh98v9eXj6QnOU,Dime Que No,62,266013,0,0h1zs4CTlU9D2QtgPxptUD,2012-01-23,0.535,0.648,0,-5.842,0.0364,0.369,0.0,0.075,0.311,142.486
3,5XFfcpc0V1fqHeiEjJd4y5,I Hate the Capitalist System,28,196387,0,7htyUKqAPLpbTA9nkEhGRf,1973-01-01,0.454,0.115,5,-15.617,0.0405,0.797,0.0,0.103,0.366,86.761
4,4fsmo5mukKd4vafcHN41KP,Hai la sârbă roată,24,223013,0,4a0FNCbvDEoeHYFfc045W4,2008-01-01,0.509,0.699,10,-4.935,0.0372,0.686,1.6e-05,0.267,0.949,174.827


Atrybuty:
* id - identyfikator utworu
* name - nazwa utworu
* popularity - popularność utworu
* duration_ms - długość utworu w milisekundach
* explicit - czy utwór zawiera treści nieodpowiednie dla dzieci
* id_artist - identyfikator artysty
* release_date - data wydania utworu
* danceability - współczynnik "taneczności" utworu
* energy - współczynnik "energii" utworu
* key - główny ton utworu
* loudness - głośność utworu
* speechiness - współczynnik "mowy" utworu
* acousticness - współczynnik "akustyczności" utworu
* instrumentalness - współczynnik "instrumentalności" utworu
* liveness - współczynnik "żywotności" utworu
* valence - współczynnik "pozytywności" utworu
* tempo - tempo utworu

In [43]:
# Print column dtypes, non-null counts and memory usage of the tracks table.
tracks_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21608 entries, 0 to 21607
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                21608 non-null  object 
 1   name              21608 non-null  object 
 2   popularity        21608 non-null  int64  
 3   duration_ms       21608 non-null  int64  
 4   explicit          21608 non-null  int64  
 5   id_artist         21608 non-null  object 
 6   release_date      21608 non-null  object 
 7   danceability      21608 non-null  float64
 8   energy            21608 non-null  float64
 9   key               21608 non-null  int64  
 10  loudness          21608 non-null  float64
 11  speechiness       21608 non-null  float64
 12  acousticness      21608 non-null  float64
 13  instrumentalness  21608 non-null  float64
 14  liveness          21608 non-null  float64
 15  valence           21608 non-null  float64
 16  tempo             21608 non-null  float64

### Analiza id
TODO

### Analiza name
TODO

### Analiza popularity
TODO

### Analiza duration_ms
TODO

### Analiza explicit
TODO

### Analiza id_artist
TODO

### Analiza release_date
TODO

### Analiza danceability
TODO

### Analiza energy
TODO

### Analiza key
TODO

### Analiza loudness
TODO

### Analiza speechiness
TODO

### Analiza acousticness
TODO

### Analiza instrumentalness
TODO

### Analiza liveness
TODO

### Analiza valence
TODO

### Analiza tempo
TODO

# Analiza użytkowników

In [48]:
# Preview the first five user records.
users_data_frame.head(5)

Unnamed: 0,user_id,name,city,street,favourite_genres,premium_user
0,101,Klara Herdzik,Szczecin,pl. Floriana 55/22,"[permanent wave, mandopop, funk]",False
1,102,Szymon Plebanek,Kraków,al. Kalinowa 27/40,"[classic rock, ranchera, lounge]",True
2,103,Szymon Zawal,Gdynia,al. Głogowa 14/10,"[filmi, regional mexican, folk]",False
3,104,Andrzej Berendt,Kraków,ul. Strażacka 14/63,"[tropical, new wave, latin]",True
4,105,Ida Karsznia,Kraków,al. Złota 549,"[soul, tropical, alternative metal]",False


Atrybuty:
* user_id - identyfikator użytkownika
* name - imię i nazwisko użytkownika
* city - miasto zamieszkania użytkownika
* street - ulica zamieszkania użytkownika
* favourite_genres - ulubione gatunki użytkownika
* premium_user - czy użytkownik posiada konto premium
* id - zbędny atrybut (nie występuje w aktualnie wczytanych danych – `info()` pokazuje tylko sześć powyższych kolumn)


In [49]:
# Print column dtypes, non-null counts and memory usage of the users table.
users_data_frame.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           50 non-null     int64 
 1   name              50 non-null     object
 2   city              50 non-null     object
 3   street            50 non-null     object
 4   favourite_genres  50 non-null     object
 5   premium_user      50 non-null     bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 2.1+ KB


### Analiza user_id
TODO

### Analiza name
TODO

### Analiza city
TODO

### Analiza street
TODO

### Analiza favourite_genres
TODO

### Analiza premium_user
TODO

### Analiza id
TODO
