In [340]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)

artists_data_path = "../data/artists.jsonl"
sessions_data_path = "../data/sessions.jsonl"
track_storage_data_path = "../data/track_storage.jsonl"
tracks_data_path = "../data/tracks.jsonl"
users_data_path = "../data/users.jsonl"

artists_data_frame = pd.read_json(artists_data_path, lines=True)
sessions_data_frame = pd.read_json(sessions_data_path, lines=True)
track_storage_data_frame = pd.read_json(track_storage_data_path, lines=True)
tracks_data_frame = pd.read_json(tracks_data_path, lines=True)
users_data_frame = pd.read_json(users_data_path, lines=True)

Pominęliśmy analizę danych o przechowywaniu utworów, ponieważ nie wnoszą one żadnych informacji do analizy w naszym temacie projektu.

# Analiza artystów

In [341]:
artists_data_frame.head(n=5)

Unnamed: 0,id,name,genres
0,7b6Ui7JVaBDEfZB9k6nHL0,The Local Train,"[desi pop, hindi indie, indian indie, indian rock, new delhi indie, sufi]"
1,-1,Vishal Mishra,"[desi pop, modern bollywood]"
2,6AETRCQep85mUtIlX7BaFV,Color Humano,"[argentine rock, rock nacional]"
3,3PN8odvj48fnALdAsCKG3g,Crucis,"[italian progressive rock, rock nacional]"
4,5SAeZRhgmJhgq196BR1Mna,Dúo Salteño,"[folclore salteno, folklore argentino]"


Atrybuty:
* id - identyfikator artysty
* name - nazwa artysty
* genres - gatunki muzyczne, które tworzy

In [342]:
artists_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10657 entries, 0 to 10656
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10657 non-null  object
 1   name    10657 non-null  object
 2   genres  10126 non-null  object
dtypes: object(3)
memory usage: 249.9+ KB


### Analiza id

In [343]:
# find artists with -1 id
artists_data_frame[artists_data_frame['id'] == -1]

Unnamed: 0,id,name,genres
1,-1,Vishal Mishra,"[desi pop, modern bollywood]"
14,-1,Luz Mala,[cumbia villera]
25,-1,Mono & Nikitaman,[german reggae]
61,-1,Rebelion,"[rawstyle, xtra raw]"
99,-1,FSB,"[bulgarian rock, classic bulgarian pop]"
107,-1,LouKass,[bolivian rock]
158,-1,Quatro por Um,"[adoracao, brazilian gospel, rock gospel brasileiro]"
161,-1,Mc Sapao,"[funk carioca, funk das antigas, funk melody, funk ostentacao]"
213,-1,Fred Pellerin,"[country quebecois, indie quebecois]"
229,-1,Gölä,"[mundart, swiss hip hop, swiss indie]"


Istnieje 557 artystów, których id jest ustawione na -1.
TODO co z tym zrobic?

In [344]:
print("unique ids: {}/{}".format(artists_data_frame['id'].nunique(), artists_data_frame['id'].count()))

unique ids: 10101/10657


Nieunikatowe ID wynikają z tego, że istnieje 557 artystów, których ID jest ustawione na -1.

### Analiza name


In [345]:
print("unique names: {}/{}".format(artists_data_frame['name'].nunique(), artists_data_frame['name'].count()))

# find artists with duplicated names
artists_data_frame[artists_data_frame['name'].duplicated(keep=False)]

unique names: 10643/10657


Unnamed: 0,id,name,genres
358,2xgbz6gqoIF4S6JZoRiH2y,Dalton,[classic danish pop]
390,38YroIcEOM2zsIEGdtmZYm,Bamse,"[barnmusik, bornesange]"
460,0Yk0PoyjQiEyIKgnaJFR0n,Traffic,"[estonian pop, estonian rock]"
1093,3rUSk3kcpl4Nseu1DqFNpJ,KK,"[classic icelandic pop, icelandic pop]"
1384,3lPK4zqijDKAvZkdlmZRfN,Mango,[lithuanian pop]
2076,4ppDL5W65K1HG6EF7yaDLd,Little River Band,"[album rock, classic rock, country rock, folk rock, mellow gold, soft rock]"
2352,53Thxvlr6imD5y3lxouOu4,Phoenix,[romanian rock]
2546,363VbwpX9anvrThJ2qDwBy,TNT,"[glam metal, hard rock]"
2721,1CqOLQmjzVWXQTiIN5Wucs,TNT,"[euphoric hardstyle, hardstyle]"
2757,7c5278WShlxTk3vS9XoTf8,Wings,[rock kapak]


Istnieje 28 artystów, których nazwa jest zduplikowana.
TODO co z tym zrobic?


### Analiza genres


In [346]:
# find artists with null genres
artists_data_frame[artists_data_frame['genres'].isnull()]

Unnamed: 0,id,name,genres
24,058bioyBUzSkKCfrqvODLu,Ursprung Buam,
26,3N4ZBIXZfvF0VLgEclgJvX,Enes Begovic,
85,3fxtMTmbY0jgtjHDuXspus,Panayot Panayotov,
122,0cDZ1YAlkkIn0Z7BjiEUj1,Volta Seca,
125,5CorrQPbOid8RKf5rCpxG7,Sempre Livre,
135,2u7kjqj92WlpCSAKiOeZ6I,Banda Mel,
169,28rjo4eRvpmLW0EsCIy6FW,Ana Paula Valadão,
187,4fk1ToCb0m7irMHCaZhIrM,Ministério Adoração e Vida,
190,0hGCbqEUzKxBUuJ7Q8ZFvK,Mcginty,
208,3IYryWetzBVax8oJPkgXMk,Marjo,


Istnieje 531 artystów, których gatunki nie zostały określone.
TODO co z tym zrobic?


# Analiza sesji

In [347]:
sessions_data_frame.head(n=5)

Unnamed: 0,timestamp,user_id,track_id,event_type,session_id
0,2023-01-04 11:46:25.000,101.0,78ybROgQ2V3TySb8ZfPNdg,PLAY,124
1,2023-01-04 11:49:32.533,,2lkoxPYi7o9OlaGP9bVI09,PLAY,124
2,2023-01-06 04:38:08.000,101.0,0juIv1JPqoCrWku2nIfMiA,PLAY,125
3,2023-01-06 04:40:49.544,101.0,0juIv1JPqoCrWku2nIfMiA,LIKE,125
4,2023-01-06 04:44:14.267,101.0,4srmb7zgB7eKWLUL96xftd,PLAY,125


Atrybuty:
* timestamp - czas rozpoczęcia sesji
* user_id - identyfikator użytkownika
* track_id - identyfikator utworu
* event_type - typ zdarzenia (PLAY, LIKE, SKIP)
* session_id - identyfikator sesji


In [348]:
sessions_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3971 entries, 0 to 3970
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   timestamp   3971 non-null   datetime64[ns]
 1   user_id     3776 non-null   float64       
 2   track_id    3776 non-null   object        
 3   event_type  3763 non-null   object        
 4   session_id  3971 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 155.2+ KB


### Analiza timestamp

In [349]:
sessions_data_frame['timestamp'].describe(datetime_is_numeric=True)

count                             3971
mean     2023-02-09 22:20:59.431924736
min                2023-01-01 01:12:53
25%      2023-01-22 12:22:54.168499968
50%      2023-02-08 12:59:38.910000128
75%      2023-02-28 06:16:43.153500160
max         2023-03-24 06:28:07.464000
Name: timestamp, dtype: object

Sesje rozpoczęły się 2023-01-01 01:12:53 i zakończyły 2023-03-24 06:28:08.464000.


### Analiza user_id

In [350]:
# average number of sessions per user
print("avg number of sessions per user {}".format(sessions_data_frame['user_id'].value_counts().mean()))

# min number of sessions per user
print("min number of sessions per user {}".format(sessions_data_frame['user_id'].value_counts().min()))

# max number of sessions per user
print("max number of sessions per user {}".format(sessions_data_frame['user_id'].value_counts().max()))


avg number of sessions per user 75.52
min number of sessions per user 20
max number of sessions per user 148


Średnia liczba sesji na użytkownika wynosi 75.5, a maksymalna 148.


### Analiza track_id



In [351]:
# TODO


### Analiza event_type

In [352]:
sessions_data_frame['event_type'].value_counts()


PLAY             2242
SKIP              825
LIKE              619
BUY_PREMIUM        47
ADVERTISEMENT      30
Name: event_type, dtype: int64

### Analiza session_id

In [353]:
# average number of events per session
print("avg number of events per session {}".format(sessions_data_frame['session_id'].value_counts().mean()))


avg number of events per session 6.2437106918239


Średnia liczba zdarzeń na sesję wynosi 6.2


# Analiza danych o utworach


In [354]:
tracks_data_frame.head(n=5)


Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,4LdczeQKU4ojwFJd6QWGnW,Taking Care Of Business - 2005 Remaster,1.0,177853,0,4AgFg0hHythyklEs4t7XKt,2011-01-01,0.554,0.847,7,-7.237,0.0633,0.00789,2e-06,0.183,0.733,90.655
1,7F20EOraDwU8KvNcyzUKVz,Leef,42.0,217903,0,5l2Xy4aUoJDRSpsYHyOumD,2015-06-26,0.673,0.727,9,-6.587,0.0401,0.282,0.0,0.178,0.782,121.016
2,5EMGRFIidh98v9eXj6QnOU,Dime Que No,,266013,0,0h1zs4CTlU9D2QtgPxptUD,2012-01-23,0.535,0.648,0,-5.842,0.0364,0.369,0.0,0.075,0.311,142.486
3,5XFfcpc0V1fqHeiEjJd4y5,I Hate the Capitalist System,28.0,196387,0,7htyUKqAPLpbTA9nkEhGRf,1973-01-01,0.454,0.115,5,-15.617,0.0405,0.797,0.0,0.103,0.366,86.761
4,4fsmo5mukKd4vafcHN41KP,Hai la sârbă roată,24.0,223013,0,4a0FNCbvDEoeHYFfc045W4,2008-01-01,0.509,0.699,10,-4.935,0.0372,0.686,1.6e-05,0.267,0.949,174.827


Atrybuty:
* id - identyfikator utworu
* name - nazwa utworu
* popularity - popularność utworu
* duration_ms - długość utworu w milisekundach
* explicit - czy utwór zawiera treści nieodpowiednie dla dzieci
* id_artist - identyfikator artysty
* release_date - data wydania utworu
* danceability - współczynnik "taneczności" utworu
* energy - współczynnik "energii" utworu
* key - główny ton utworu
* loudness - głośność utworu
* speechiness - współczynnik "mowy" utworu
* acousticness - współczynnik "akustyczności" utworu
* instrumentalness - współczynnik "instrumentalności" utworu
* liveness - współczynnik "żywotności" utworu
* valence - współczynnik "pozytywności" utworu
* tempo - tempo utworu

In [355]:
tracks_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21608 entries, 0 to 21607
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                20496 non-null  object 
 1   name              20557 non-null  object 
 2   popularity        20505 non-null  float64
 3   duration_ms       21608 non-null  int64  
 4   explicit          21608 non-null  int64  
 5   id_artist         20526 non-null  object 
 6   release_date      21608 non-null  object 
 7   danceability      21608 non-null  float64
 8   energy            21608 non-null  float64
 9   key               21608 non-null  int64  
 10  loudness          21608 non-null  float64
 11  speechiness       21608 non-null  float64
 12  acousticness      21608 non-null  float64
 13  instrumentalness  21608 non-null  float64
 14  liveness          21608 non-null  float64
 15  valence           21608 non-null  float64
 16  tempo             21608 non-null  float6

### Analiza id
TODO

### Analiza name
TODO

### Analiza popularity
TODO

### Analiza duration_ms
TODO

### Analiza explicit
TODO

### Analiza id_artist
TODO

### Analiza release_date
TODO

### Analiza danceability
TODO

### Analiza energy
TODO

### Analiza key
TODO

### Analiza loudness
TODO

### Analiza speechiness
TODO

### Analiza acousticness
TODO

### Analiza instrumentalness
TODO

### Analiza liveness
TODO

### Analiza valence
TODO

### Analiza tempo
TODO

# Analiza użytkowników

In [356]:
users_data_frame.head(n=5)

Unnamed: 0,user_id,name,city,street,favourite_genres,premium_user,id
0,101,Klara Herdzik,Szczecin,pl. Floriana 55/22,"[permanent wave, mandopop, funk]",1.0,
1,102,Szymon Plebanek,Kraków,al. Kalinowa 27/40,,1.0,-1.0
2,103,Szymon Zawal,Gdynia,al. Głogowa 14/10,"[filmi, regional mexican, folk]",1.0,
3,104,Andrzej Berendt,Kraków,ul. Strażacka 14/63,"[tropical, new wave, latin]",1.0,
4,105,Ida Karsznia,Kraków,al. Złota 549,"[soul, tropical, alternative metal]",1.0,


Atrybuty:
* user_id - identyfikator użytkownika
* name - imię i nazwisko użytkownika
* city - miasto zamieszkania użytkownika
* street - ulica zamieszkania użytkownika
* favourite_genres - ulubione gatunki użytkownika
* premium_user - czy użytkownik posiada konto premium
* id - zbędny atrybut


In [357]:
users_data_frame.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           50 non-null     int64  
 1   name              50 non-null     object 
 2   city              50 non-null     object 
 3   street            50 non-null     object 
 4   favourite_genres  48 non-null     object 
 5   premium_user      49 non-null     float64
 6   id                5 non-null      float64
dtypes: float64(2), int64(1), object(4)
memory usage: 2.9+ KB


### Analiza user_id
TODO

### Analiza name
TODO

### Analiza city
TODO

### Analiza street
TODO

### Analiza favourite_genres
TODO

### Analiza premium_user
TODO

### Analiza id
TODO
