In [574]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn

# Lift pandas' display limits: all rows, all columns and full cell contents are rendered.
pd.set_option('max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Every dataset is read as JSON Lines (lines=True: one JSON object per line).
# Each path is declared right next to the frame that is loaded from it.
artists_data_path = "../data/artists.jsonl"
artists_data_frame = pd.read_json(artists_data_path, lines=True)

sessions_data_path = "../data/sessions.jsonl"
sessions_data_frame = pd.read_json(sessions_data_path, lines=True)

track_storage_data_path = "../data/track_storage.jsonl"
track_storage_data_frame = pd.read_json(track_storage_data_path, lines=True)

tracks_data_path = "../data/tracks.jsonl"
tracks_data_frame = pd.read_json(tracks_data_path, lines=True)

users_data_path = "../data/users.jsonl"
users_data_frame = pd.read_json(users_data_path, lines=True)

Pominęliśmy analizę danych o przechowywaniu utworów, ponieważ nie wnoszą one żadnych informacji do analizy w naszym temacie projektu.

# Analiza artystów

In [575]:
# Preview the first five artist rows.
artists_data_frame.head(5)

Unnamed: 0,id,name,genres
0,7b6Ui7JVaBDEfZB9k6nHL0,The Local Train,"[desi pop, hindi indie, indian indie, indian rock, new delhi indie, sufi]"
1,-1,Vishal Mishra,"[desi pop, modern bollywood]"
2,6AETRCQep85mUtIlX7BaFV,Color Humano,"[argentine rock, rock nacional]"
3,3PN8odvj48fnALdAsCKG3g,Crucis,"[italian progressive rock, rock nacional]"
4,5SAeZRhgmJhgq196BR1Mna,Dúo Salteño,"[folclore salteno, folklore argentino]"


Atrybuty:
* id - identyfikator artysty
* name - nazwa artysty
* genres - gatunki muzyczne, które tworzy

In [576]:
# Print the artists frame summary: index range, column dtypes, non-null counts, memory usage.
artists_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10657 entries, 0 to 10656
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10657 non-null  object
 1   name    10657 non-null  object
 2   genres  10126 non-null  object
dtypes: object(3)
memory usage: 249.9+ KB


### Analiza id

In [577]:
# Select the artists whose id equals the integer -1.
artists_data_frame.loc[artists_data_frame['id'].eq(-1)]

Unnamed: 0,id,name,genres
1,-1,Vishal Mishra,"[desi pop, modern bollywood]"
14,-1,Luz Mala,[cumbia villera]
25,-1,Mono & Nikitaman,[german reggae]
61,-1,Rebelion,"[rawstyle, xtra raw]"
99,-1,FSB,"[bulgarian rock, classic bulgarian pop]"
107,-1,LouKass,[bolivian rock]
158,-1,Quatro por Um,"[adoracao, brazilian gospel, rock gospel brasileiro]"
161,-1,Mc Sapao,"[funk carioca, funk das antigas, funk melody, funk ostentacao]"
213,-1,Fred Pellerin,"[country quebecois, indie quebecois]"
229,-1,Gölä,"[mundart, swiss hip hop, swiss indie]"


Istnieje 557 artystów, których id jest ustawione na -1.
TODO: co z tym zrobić?

In [578]:
# Compare the number of distinct artist ids with the number of non-null ids.
artist_ids = artists_data_frame['id']
print("unique ids: {}/{}".format(artist_ids.nunique(), artist_ids.count()))

unique ids: 10101/10657


Nieunikatowe ID wynikają z tego, że istnieje 557 artystów, których ID jest ustawione na -1.

### Analiza name


In [579]:
# Report how many artist names are distinct out of all non-null names.
artist_names = artists_data_frame['name']
print("unique names: {}/{}".format(artist_names.nunique(), artist_names.count()))

# Show every artist whose name occurs more than once
# (keep=False marks all occurrences, not only the repeats).
artists_data_frame.loc[artist_names.duplicated(keep=False)]

unique names: 10643/10657


Unnamed: 0,id,name,genres
358,2xgbz6gqoIF4S6JZoRiH2y,Dalton,[classic danish pop]
390,38YroIcEOM2zsIEGdtmZYm,Bamse,"[barnmusik, bornesange]"
460,0Yk0PoyjQiEyIKgnaJFR0n,Traffic,"[estonian pop, estonian rock]"
1093,3rUSk3kcpl4Nseu1DqFNpJ,KK,"[classic icelandic pop, icelandic pop]"
1384,3lPK4zqijDKAvZkdlmZRfN,Mango,[lithuanian pop]
2076,4ppDL5W65K1HG6EF7yaDLd,Little River Band,"[album rock, classic rock, country rock, folk rock, mellow gold, soft rock]"
2352,53Thxvlr6imD5y3lxouOu4,Phoenix,[romanian rock]
2546,363VbwpX9anvrThJ2qDwBy,TNT,"[glam metal, hard rock]"
2721,1CqOLQmjzVWXQTiIN5Wucs,TNT,"[euphoric hardstyle, hardstyle]"
2757,7c5278WShlxTk3vS9XoTf8,Wings,[rock kapak]


Istnieje 28 artystów, których nazwa jest zduplikowana.
TODO: co z tym zrobić?


### Analiza genres


In [580]:
# Artists whose genres value is missing.
artists_data_frame.loc[lambda frame: frame['genres'].isna()]

Unnamed: 0,id,name,genres
24,058bioyBUzSkKCfrqvODLu,Ursprung Buam,
26,3N4ZBIXZfvF0VLgEclgJvX,Enes Begovic,
85,3fxtMTmbY0jgtjHDuXspus,Panayot Panayotov,
122,0cDZ1YAlkkIn0Z7BjiEUj1,Volta Seca,
125,5CorrQPbOid8RKf5rCpxG7,Sempre Livre,
135,2u7kjqj92WlpCSAKiOeZ6I,Banda Mel,
169,28rjo4eRvpmLW0EsCIy6FW,Ana Paula Valadão,
187,4fk1ToCb0m7irMHCaZhIrM,Ministério Adoração e Vida,
190,0hGCbqEUzKxBUuJ7Q8ZFvK,Mcginty,
208,3IYryWetzBVax8oJPkgXMk,Marjo,


Istnieje 531 artystów, których gatunki nie zostały określone.
TODO: co z tym zrobić?


# Analiza sesji

In [581]:
# Preview the first five rows of the sessions (event log) frame.
sessions_data_frame.head(5)

Unnamed: 0,timestamp,user_id,track_id,event_type,session_id
0,2023-01-04 11:46:25.000,101.0,78ybROgQ2V3TySb8ZfPNdg,PLAY,124
1,2023-01-04 11:49:32.533,,2lkoxPYi7o9OlaGP9bVI09,PLAY,124
2,2023-01-06 04:38:08.000,101.0,0juIv1JPqoCrWku2nIfMiA,PLAY,125
3,2023-01-06 04:40:49.544,101.0,0juIv1JPqoCrWku2nIfMiA,LIKE,125
4,2023-01-06 04:44:14.267,101.0,4srmb7zgB7eKWLUL96xftd,PLAY,125


Atrybuty:
* timestamp - czas wystąpienia zdarzenia
* user_id - identyfikator użytkownika
* track_id - identyfikator utworu
* event_type - typ zdarzenia (PLAY, SKIP, LIKE, BUY_PREMIUM, ADVERTISEMENT)
* session_id - identyfikator sesji


In [582]:
# Print the sessions frame summary: index range, column dtypes, non-null counts, memory usage.
sessions_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3971 entries, 0 to 3970
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   timestamp   3971 non-null   datetime64[ns]
 1   user_id     3776 non-null   float64       
 2   track_id    3776 non-null   object        
 3   event_type  3763 non-null   object        
 4   session_id  3971 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 155.2+ KB


### Analiza timestamp

In [583]:
# Numeric-style summary (count, mean, min, quartiles, max) of the event timestamps.
# pandas < 2.0 needs datetime_is_numeric=True to produce it; pandas >= 2.0 removed the
# keyword (passing it raises TypeError) and always summarises datetimes this way,
# so fall back to the plain call there.
timestamp_column = sessions_data_frame['timestamp']
try:
    timestamp_summary = timestamp_column.describe(datetime_is_numeric=True)
except TypeError:
    timestamp_summary = timestamp_column.describe()
timestamp_summary

count                             3971
mean     2023-02-09 22:20:59.431924736
min                2023-01-01 01:12:53
25%      2023-01-22 12:22:54.168499968
50%      2023-02-08 12:59:38.910000128
75%      2023-02-28 06:16:43.153500160
max         2023-03-24 06:28:07.464000
Name: timestamp, dtype: object

Zarejestrowane zdarzenia obejmują okres od 2023-01-01 01:12:53 do 2023-03-24 06:28:07.464000.


In [584]:
# Rows that have a timestamp but lack at least one of user_id, track_id or event_type.
has_timestamp = sessions_data_frame['timestamp'].notnull()
has_missing_field = sessions_data_frame[['user_id', 'track_id', 'event_type']].isnull().any(axis=1)
sessions_data_frame[has_timestamp & has_missing_field]

Unnamed: 0,timestamp,user_id,track_id,event_type,session_id
1,2023-01-04 11:49:32.533,,2lkoxPYi7o9OlaGP9bVI09,PLAY,124
9,2023-01-06 04:47:01.091,101.0,7fcD9YsAiBbtVyKe9Tqll6,,125
10,2023-01-06 04:47:17.285,101.0,7fcD9YsAiBbtVyKe9Tqll6,,125
15,2023-01-06 04:57:05.043,,0Um0731HrqdBsyGs3WdmJX,PLAY,125
32,2023-02-08 12:57:06.312,101.0,,SKIP,131
52,2023-02-14 16:51:43.190,,0Lh0VkvZ3LJPR5fUs0pigo,LIKE,134
59,2023-02-22 17:35:32.492,101.0,,LIKE,135
70,2023-03-07 16:11:35.000,101.0,,PLAY,138
71,2023-03-10 22:02:57.000,101.0,20IIOqNWjz9UL3EyxDgMfK,,139
75,2023-03-10 22:05:46.984,,2KqeHIAjaeeeUUJMzlv698,PLAY,139


### Analiza user_id

In [585]:
# Distribution of the number of rows (events) recorded per user_id.
# Note: this counts rows, not distinct session_id values; rows with a missing user_id are excluded.
rows_per_user = sessions_data_frame['user_id'].value_counts()
rows_per_user.describe()

count     50.0000
mean      75.5200
std       30.6436
min       20.0000
25%       50.0000
50%       75.0000
75%       95.7500
max      148.0000
Name: user_id, dtype: float64

Średnia liczba zdarzeń (wierszy) na użytkownika wynosi 75.5, a maksymalna 148.


### Analiza track_id



In [586]:
# TODO: the analysis of the track_id column has not been written yet.


### Analiza event_type

In [587]:
# Number of rows for each event type, most frequent first (missing values are not counted).
event_types = sessions_data_frame['event_type']
event_types.value_counts()

PLAY             2242
SKIP              825
LIKE              619
BUY_PREMIUM        47
ADVERTISEMENT      30
Name: event_type, dtype: int64

### Analiza session_id

In [588]:
# Distribution of the number of rows (events) that share the same session_id.
rows_per_session = sessions_data_frame['session_id'].value_counts()
rows_per_session.describe()

count    636.000000
mean       6.243711
std        5.611185
min        1.000000
25%        2.000000
50%        4.000000
75%        9.000000
max       33.000000
Name: session_id, dtype: float64

Średnia liczba zdarzeń na sesję wynosi 6.2


# Analiza danych o utworach


In [589]:
# Preview the first five track rows.
tracks_data_frame.head(5)


Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,4LdczeQKU4ojwFJd6QWGnW,Taking Care Of Business - 2005 Remaster,1.0,177853,0,4AgFg0hHythyklEs4t7XKt,2011-01-01,0.554,0.847,7,-7.237,0.0633,0.00789,2e-06,0.183,0.733,90.655
1,7F20EOraDwU8KvNcyzUKVz,Leef,42.0,217903,0,5l2Xy4aUoJDRSpsYHyOumD,2015-06-26,0.673,0.727,9,-6.587,0.0401,0.282,0.0,0.178,0.782,121.016
2,5EMGRFIidh98v9eXj6QnOU,Dime Que No,,266013,0,0h1zs4CTlU9D2QtgPxptUD,2012-01-23,0.535,0.648,0,-5.842,0.0364,0.369,0.0,0.075,0.311,142.486
3,5XFfcpc0V1fqHeiEjJd4y5,I Hate the Capitalist System,28.0,196387,0,7htyUKqAPLpbTA9nkEhGRf,1973-01-01,0.454,0.115,5,-15.617,0.0405,0.797,0.0,0.103,0.366,86.761
4,4fsmo5mukKd4vafcHN41KP,Hai la sârbă roată,24.0,223013,0,4a0FNCbvDEoeHYFfc045W4,2008-01-01,0.509,0.699,10,-4.935,0.0372,0.686,1.6e-05,0.267,0.949,174.827


Atrybuty:
* id - identyfikator utworu
* name - nazwa utworu
* popularity - popularność utworu
* duration_ms - długość utworu w milisekundach
* explicit - czy utwór zawiera treści nieodpowiednie dla dzieci
* id_artist - identyfikator artysty
* release_date - data wydania utworu
* danceability - współczynnik "taneczności" utworu
* energy - współczynnik "energii" utworu
* key - główny ton utworu
* loudness - głośność utworu
* speechiness - współczynnik "mowy" utworu
* acousticness - współczynnik "akustyczności" utworu
* instrumentalness - współczynnik "instrumentalności" utworu
* liveness - współczynnik "żywotności" utworu
* valence - współczynnik "pozytywności" utworu
* tempo - tempo utworu

In [590]:
# Print the tracks frame summary: index range, column dtypes, non-null counts, memory usage.
tracks_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21608 entries, 0 to 21607
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                20496 non-null  object 
 1   name              20557 non-null  object 
 2   popularity        20505 non-null  float64
 3   duration_ms       21608 non-null  int64  
 4   explicit          21608 non-null  int64  
 5   id_artist         20526 non-null  object 
 6   release_date      21608 non-null  object 
 7   danceability      21608 non-null  float64
 8   energy            21608 non-null  float64
 9   key               21608 non-null  int64  
 10  loudness          21608 non-null  float64
 11  speechiness       21608 non-null  float64
 12  acousticness      21608 non-null  float64
 13  instrumentalness  21608 non-null  float64
 14  liveness          21608 non-null  float64
 15  valence           21608 non-null  float64
 16  tempo             21608 non-null  float64

### Analiza id
TODO

In [591]:
# Tracks whose id is missing.
tracks_data_frame.loc[lambda frame: frame['id'].isna()]

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
7,,"Junge, komm bald wieder",39.0,186667,0,18mGtkaRoIjbOaGT1rykKP,1962-01-01,0.449,0.333,7,-10.311,0.0272,0.807,7e-05,0.0597,0.494,86.612
23,,Æfintýri í Mararþaraborg - 10,3.0,51966,0,1AQJhI2hhTs9vGfmoNxrGF,1974-01-01,0.757,0.07,10,-19.78,0.0628,0.693,0.0,0.203,0.741,111.851
83,,064 - Geisterstadt - Teil 13,37.0,101187,0,3meJIgRw7YleJrmbpbJK6S,1995-08-29,0.559,0.408,5,-16.739,0.514,0.38,0.0,0.362,0.329,61.553
104,,Les petits poissons,28.0,97800,0,2fJR6skAOWHwNlYPCd5Spe,1999-10-07,0.451,0.266,2,-7.493,0.0357,0.889,0.0,0.248,0.751,175.806
156,,Join the Chant,33.0,364467,0,,1987-01-01,0.741,0.758,8,-11.795,0.0642,0.00199,0.285,0.0702,0.597,121.072
161,,Precioso Jesús,59.0,380317,0,2JjoUSP8dUA7UhlWW59fK1,2007-11-27,0.473,0.427,1,-6.911,0.0269,0.332,0.0,0.702,0.21,140.169
188,,Nagisalu Neenu,11.0,271438,0,5Xpg6PBSUOoho2lI9qLjiu,1980-01-01,0.435,0.678,1,-3.486,0.0459,0.899,1.6e-05,0.128,0.532,94.989
191,,,40.0,213160,0,1bZNv4q3OxYq7mmnLha7Tu,1990-01-01,0.808,0.194,4,-15.072,0.0703,0.705,2e-05,0.0755,0.756,120.07
214,,Domov,33.0,192180,1,4qla74GaFZkhFIUQvGZAEN,2015-10-11,0.607,0.58,6,-4.356,0.073,0.392,1.7e-05,0.102,0.401,132.921
227,,Ê Baiana,,181467,0,6UKz2oYWzE0ZBnciztCown,2005-01-01,0.624,0.88,9,-7.224,0.158,0.0806,4.2e-05,0.079,0.892,124.963


### Analiza name
TODO

In [592]:
# Tracks whose name is missing.
tracks_data_frame.loc[lambda frame: frame['name'].isna()]

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
114,7wolKJcoDqcyCIC9sLy2l3,,8.0,275737,0,6IW026WCYU8L1WF79dfwss,1962-01-01,0.461,0.595,5,-7.396,0.0956,0.901,3.4e-05,0.481,0.93,135.173
158,4Y4XzfNAzOqax0YcXxkP0u,,33.0,280693,0,,1998-12-21,0.704,0.859,10,-5.254,0.0786,0.324,0.000971,0.0671,0.619,179.955
174,3yV7OpqJxlFd7yWnOEDOYU,,13.0,207067,0,6GO0U13OFE0XntmksjZ4PY,1983,0.424,0.201,8,-14.562,0.0431,0.809,0.00465,0.319,0.298,96.591
191,,,40.0,213160,0,1bZNv4q3OxYq7mmnLha7Tu,1990-01-01,0.808,0.194,4,-15.072,0.0703,0.705,2e-05,0.0755,0.756,120.07
200,3FSaOL1i67DehIF1gqKkQv,,2.0,178027,0,1YzCsTRb22dQkh9lghPIrp,1948,0.373,0.0474,9,-16.316,0.0387,0.98,1.1e-05,0.0781,0.174,35.32
205,6oPqaiLKPdQj2ecBpMgM07,,35.0,222733,0,4mjPiZmthhrzR8n8D9c9xE,2013-05-01,0.514,0.942,7,-2.378,0.0856,0.0571,0.000239,0.0978,0.755,149.999
210,072nOX5nDAGalF8TC30EFv,,30.0,190400,0,786hGmAEXHUeCdKPAj3JIa,2013-01-01,0.835,0.47,7,-7.529,0.0803,0.212,1.9e-05,0.111,0.815,74.196
216,4PCvrovUYITtRCqV2vv7pu,,36.0,236520,0,4RN2vlFWepLa46qQIU2PHs,1977,0.532,0.242,4,-10.603,0.0426,0.738,2.5e-05,0.108,0.299,78.933
226,4u5MR81TPxZ9odymoLy8bb,,0.0,165897,0,0f8MDDzIc6M4uH1xH0o0gy,1927,0.723,0.116,8,-12.828,0.0968,0.993,0.454,0.0915,0.614,112.362
237,0zoNNZmhsJXVBJEXeTg9vJ,,16.0,225693,0,11wqbbMVEK4dBZbagPMcXl,1997-11-12,0.641,0.402,5,-7.577,0.0304,0.274,0.0,0.186,0.638,126.136


### Analiza popularity
TODO

In [593]:
# Tracks whose popularity is missing.
tracks_data_frame.loc[lambda frame: frame['popularity'].isna()]

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
2,5EMGRFIidh98v9eXj6QnOU,Dime Que No,,266013,0,0h1zs4CTlU9D2QtgPxptUD,2012-01-23,0.535,0.648,0,-5.842,0.0364,0.369,0.0,0.075,0.311,142.486
12,0CJQzybtZV8qql6Z1YtdLg,Workshop (Red Gold And Green),,274040,0,6qaKS0nzGP4hfjl4aFZmEU,1976-04-23,0.765,0.408,2,-10.532,0.0582,0.0172,0.826,0.0762,0.781,145.202
21,1SsgCimc43JJfajCXSGfwY,曼波迷,,165880,0,5OSr1J92YEsVUjPmA20puE,1958-06-30,0.81,0.339,6,-9.63,0.0641,0.843,0.0,0.342,0.917,81.601
115,0w5HJfvdcyw5aYJTyq6TJ8,Caution (Don Not Stop on Tracks),,576800,0,4TMHGUX5WI7OOm53PqSDAT,1968,0.33,0.397,9,-19.499,0.0945,0.491,0.0283,0.487,0.101,144.281
121,6BnQFRnHUkh9BU70bx59yG,Texas,,198147,0,2OFnmSAGGRIx8Xznv0GliR,1999-07-30,0.844,0.889,9,-7.472,0.0357,0.478,2.3e-05,0.215,0.904,106.066
175,76c2rWXt6MlW8lUlHufJ4f,Intill mej,,221600,0,2NVoobCAvqjbOsoEc8sjur,1992-10-28,0.606,0.68,7,-10.641,0.0328,0.375,7.8e-05,0.199,0.735,132.91
193,2BLI4Z9k2GU7HF7Oj4Schg,Everyday,,261960,0,55N838yCXjxLVkBkIM5pnf,2014,0.68,0.228,7,-9.651,0.0349,0.901,0.0,0.0566,0.248,119.863
219,4wendPqEPX9hknmLYLzWp4,COOL & SOUL,,232571,0,1XYuC1vxinTjHCNd5clB8C,2006-07-05,0.491,0.933,5,-6.155,0.0745,0.156,0.000365,0.0844,0.906,133.66
227,,Ê Baiana,,181467,0,6UKz2oYWzE0ZBnciztCown,2005-01-01,0.624,0.88,9,-7.224,0.158,0.0806,4.2e-05,0.079,0.892,124.963
247,0gWqlhTvn7XbkseUhKA3bl,Counting Stars,,257280,0,5Pwc4xIPtQLFEnJriah9YJ,2013-01-01,0.664,0.705,1,-4.972,0.0382,0.0654,0.0,0.118,0.477,122.016


### Analiza duration_ms
TODO

### Analiza explicit
TODO

### Analiza id_artist
TODO

In [594]:
# Tracks whose id_artist is missing.
tracks_data_frame.loc[lambda frame: frame['id_artist'].isna()]

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
26,3Z6DXqQnf2fXU9tfQqSA9j,My All,28.0,254360,0,,1957,0.32,0.0185,8,-26.744,0.036,0.986,0.95,0.105,0.166,88.723
31,49CTqnjw0WX1iEQI0yxBVq,Round Round,29.0,237133,0,,2006-01-01,0.733,0.843,6,-4.351,0.0339,0.00227,6e-06,0.102,0.76,126.591
80,1J1lVuoAJllePJ7lk44qKo,狐の嫁入り行進曲,20.0,304053,0,,1972-09-25,0.534,0.386,7,-11.517,0.0278,0.0348,0.00145,0.0547,0.586,109.266
95,6rPVunStILqsPHOWTv1wvB,The Journey,18.0,166093,0,,1968-12,0.474,0.553,7,-12.893,0.0338,0.0445,0.0339,0.219,0.564,112.98
103,5iKjtfvlHQkmDz1wrKlVv5,Kapitel 02 - Felix reißt aus (Folge 021),33.0,92948,0,,1994,0.67,0.299,1,-17.513,0.948,0.684,0.0,0.325,0.587,94.479
106,451AUl0iDHead9s7143WNe,Dragon,15.0,216364,0,,2018-12-14,0.428,0.886,2,-3.908,0.208,0.0122,0.779,0.272,0.108,132.176
110,44estWiBus1Gw8jo1JqZHp,"Misery And Gin - From The ""Clint Eastwood's Bronco Billy "" Soundtrack",38.0,168653,0,,1980-01-01,0.291,0.404,5,-10.869,0.0299,0.551,1.4e-05,0.203,0.245,144.099
120,26m9FbObHOTlXwcQH9HBH1,Bayou,33.0,295667,0,,1981,0.628,0.722,11,-12.635,0.0389,0.187,0.859,0.0378,0.869,111.968
141,15evaxM7psryct9lWFnYf5,Los Super Capos,39.0,168565,0,,1999-07-27,0.653,0.355,8,-10.072,0.0742,0.703,0.0,0.0547,0.968,94.619
156,,Join the Chant,33.0,364467,0,,1987-01-01,0.741,0.758,8,-11.795,0.0642,0.00199,0.285,0.0702,0.597,121.072


### Analiza release_date
TODO

### Analiza danceability
TODO

### Analiza energy
TODO

### Analiza key
TODO

### Analiza loudness
TODO

### Analiza speechiness
TODO

### Analiza acousticness
TODO

### Analiza instrumentalness
TODO

### Analiza liveness
TODO

### Analiza valence
TODO

### Analiza tempo
TODO

# Analiza użytkowników

In [595]:
# Preview the first five user rows.
users_data_frame.head(5)

Unnamed: 0,user_id,name,city,street,favourite_genres,premium_user,id
0,101,Klara Herdzik,Szczecin,pl. Floriana 55/22,"[permanent wave, mandopop, funk]",1.0,
1,102,Szymon Plebanek,Kraków,al. Kalinowa 27/40,,1.0,-1.0
2,103,Szymon Zawal,Gdynia,al. Głogowa 14/10,"[filmi, regional mexican, folk]",1.0,
3,104,Andrzej Berendt,Kraków,ul. Strażacka 14/63,"[tropical, new wave, latin]",1.0,
4,105,Ida Karsznia,Kraków,al. Złota 549,"[soul, tropical, alternative metal]",1.0,


Atrybuty:
* user_id - identyfikator użytkownika
* name - imię i nazwisko użytkownika
* city - miasto zamieszkania użytkownika
* street - ulica zamieszkania użytkownika
* favourite_genres - ulubione gatunki użytkownika
* premium_user - czy użytkownik posiada konto premium
* id - zbędny atrybut


In [596]:
# Print the users frame summary: index range, column dtypes, non-null counts, memory usage.
users_data_frame.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           50 non-null     int64  
 1   name              50 non-null     object 
 2   city              50 non-null     object 
 3   street            50 non-null     object 
 4   favourite_genres  48 non-null     object 
 5   premium_user      49 non-null     float64
 6   id                5 non-null      float64
dtypes: float64(2), int64(1), object(4)
memory usage: 2.9+ KB


### Analiza user_id
TODO

### Analiza name
TODO

In [597]:
# Users whose name occurs more than once (keep=False marks every occurrence).
users_data_frame.loc[lambda frame: frame['name'].duplicated(keep=False)]

Unnamed: 0,user_id,name,city,street,favourite_genres,premium_user,id


### Analiza city
TODO

### Analiza street
TODO

### Analiza favourite_genres
TODO

In [598]:
# Users whose favourite_genres value is missing.
users_data_frame.loc[lambda frame: frame['favourite_genres'].isna()]

Unnamed: 0,user_id,name,city,street,favourite_genres,premium_user,id
1,102,Szymon Plebanek,Kraków,al. Kalinowa 27/40,,1.0,-1.0
40,141,Apolonia Bacia,Szczecin,al. Kilińskiego 972,,1.0,


### Analiza premium_user
TODO

In [599]:
# Users whose premium_user flag is missing.
users_data_frame.loc[lambda frame: frame['premium_user'].isna()]

Unnamed: 0,user_id,name,city,street,favourite_genres,premium_user,id
14,115,Jan Stempin,Wrocław,ulica Modrzewiowa 37,"[filmi, soul, turkish pop]",,


### Analiza id
TODO
