<a href="https://colab.research.google.com/github/FredArgoX/ChaoticTest_PySpark/blob/main/02_GL_Spark_Music_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Source: [Great Learning](https://olympus.mygreatlearning.com/courses/31729/modules/items/879881?pb_id=581)

# Dataset Extraction

[`Dataset 1`](http://ocelma.net/MusicRecommendationDataset/lastfm-1K.html)

[`Dataset 2`](http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html)

In [37]:
# Install gdown for large files management
!pip install -U gdown



In [38]:
# 1K Dataset download
# Download Music Dataset from: https://drive.google.com/file/d/1P_fmQgzHmDWbHaTmoymYt56aAoeqfLQK/view?usp=sharing
!gdown --id 1P_fmQgzHmDWbHaTmoymYt56aAoeqfLQK -O lastfm-dataset-1K.tar.gz

Downloading...
From (original): https://drive.google.com/uc?id=1P_fmQgzHmDWbHaTmoymYt56aAoeqfLQK
From (redirected): https://drive.google.com/uc?id=1P_fmQgzHmDWbHaTmoymYt56aAoeqfLQK&confirm=t&uuid=54fa1763-d15b-4dbc-942f-a4e6bcf3755d
To: /content/lastfm-dataset-1K.tar.gz
100% 673M/673M [00:11<00:00, 58.3MB/s]


In [39]:
# 360K Dataset download
# Download Music Dataset from: https://drive.google.com/file/d/1gDNKnjzEXjXYB94HV9rWVCvqS9HIxffz/view?usp=sharing
!gdown --id 1gDNKnjzEXjXYB94HV9rWVCvqS9HIxffz -O lastfm-dataset-360K.tar.gz

Downloading...
From (original): https://drive.google.com/uc?id=1gDNKnjzEXjXYB94HV9rWVCvqS9HIxffz
From (redirected): https://drive.google.com/uc?id=1gDNKnjzEXjXYB94HV9rWVCvqS9HIxffz&confirm=t&uuid=7e413102-ea5e-45d3-b037-009e6ed67cc7
To: /content/lastfm-dataset-360K.tar.gz
100% 569M/569M [00:06<00:00, 87.6MB/s]


In [40]:
# Extract the downloaded datasets

# 1K
!tar -xzf lastfm-dataset-1K.tar.gz

# 360K
!tar -xzf lastfm-dataset-360K.tar.gz

# Verify Data Extraction

In [42]:
# 1K
!ls lastfm-dataset-1K

README.txt	    userid-timestamp-artid-artname-traid-traname.tsv
userid-profile.tsv


In [43]:
# 360K
!ls lastfm-dataset-360K

mbox_sha1sum.py  usersha1-artmbid-artname-plays.tsv
README.txt	 usersha1-profile.tsv


In [44]:
import numpy as np
import pandas as pd

In [47]:
# 1K
raw_data = pd.read_csv("lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv", delimiter="\t", header=None, on_bad_lines='skip')

# 1K User Profile
user_profile = pd.read_csv("lastfm-dataset-1K/userid-profile.tsv", sep="\t")

#360K
plays_data = pd.read_csv("lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv", sep="\t")

# Raw Data Exploration (1K)

In [48]:
raw_data.head()

Unnamed: 0,0,1,2,3,4,5
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


In [49]:
raw_data.describe()

Unnamed: 0,0,1,2,3,4,5
count,19098853,19098853,18498005,19098853,16936134,19098643
unique,992,17454730,107295,173921,960402,1083470
top,user_000949,2009-02-26T21:29:15Z,a74b1b7f-71a5-4011-9441-d0b5e4122711,Radiohead,db16d0b3-b8ce-4aa8-a11a-e4d53cc7f8a6,Intro
freq,183103,248,115099,115099,3991,17561


In [50]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19098853 entries, 0 to 19098852
Data columns (total 6 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   0       object
 1   1       object
 2   2       object
 3   3       object
 4   4       object
 5   5       object
dtypes: object(6)
memory usage: 874.3+ MB


# User Profile Exploration (1K)

In [51]:
user_profile.head()

Unnamed: 0,#id,gender,age,country,registered
0,user_000001,m,,Japan,"Aug 13, 2006"
1,user_000002,f,,Peru,"Feb 24, 2006"
2,user_000003,m,22.0,United States,"Oct 30, 2005"
3,user_000004,f,,,"Apr 26, 2006"
4,user_000005,m,,Bulgaria,"Jun 29, 2006"


In [52]:
user_profile.columns

Index(['#id', 'gender', 'age', 'country', 'registered'], dtype='object')

In [53]:
for col in user_profile.columns:
  print(col)

#id
gender
age
country
registered


In [54]:
user_profile["age"]

Unnamed: 0,age
0,
1,
2,22.0
3,
4,
...,...
987,
988,
989,
990,


In [55]:
user_profile["age"].max()

103.0

In [56]:
user_profile.describe()

Unnamed: 0,age
count,286.0
mean,25.367133
std,8.314952
min,3.0
25%,21.0
50%,23.0
75%,28.0
max,103.0


In [57]:
user_profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   #id         992 non-null    object 
 1   gender      884 non-null    object 
 2   age         286 non-null    float64
 3   country     907 non-null    object 
 4   registered  984 non-null    object 
dtypes: float64(1), object(4)
memory usage: 38.9+ KB


In [58]:
user_profile["gender"].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
m,502
f,382


In [59]:
user_profile["country"].value_counts()

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
United States,228
United Kingdom,126
Poland,50
Germany,36
Norway,35
...,...
Belarus,1
Tunisia,1
Iceland,1
Northern Mariana Islands,1


In [60]:
user_profile["registered"].value_counts()

Unnamed: 0_level_0,count
registered,Unnamed: 1_level_1
"Dec 6, 2005",6
"Dec 4, 2005",5
"Jan 22, 2006",5
"Dec 19, 2006",5
"Apr 18, 2006",5
...,...
"Jul 2, 2005",1
"May 22, 2006",1
"Oct 30, 2006",1
"Jan 5, 2007",1


In [61]:
user_profile.columns

Index(['#id', 'gender', 'age', 'country', 'registered'], dtype='object')

In [62]:
user_profile[:10]

Unnamed: 0,#id,gender,age,country,registered
0,user_000001,m,,Japan,"Aug 13, 2006"
1,user_000002,f,,Peru,"Feb 24, 2006"
2,user_000003,m,22.0,United States,"Oct 30, 2005"
3,user_000004,f,,,"Apr 26, 2006"
4,user_000005,m,,Bulgaria,"Jun 29, 2006"
5,user_000006,,24.0,Russian Federation,"May 18, 2006"
6,user_000007,f,,United States,"Jan 22, 2006"
7,user_000008,m,23.0,Slovakia,"Sep 28, 2006"
8,user_000009,f,19.0,United States,"Jan 13, 2007"
9,user_000010,m,19.0,Poland,"May 4, 2006"


# Plays Analytics

In [63]:
plays_data.head()

Unnamed: 0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
1,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
2,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
3,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
4,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers,691


In [64]:
plays_data.shape

(17535654, 4)

In [65]:
plays_data.columns = ["user_id", "artist_id", "artist_name", "plays"]

In [66]:
plays_data.head()

Unnamed: 0,user_id,artist_id,artist_name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
1,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
2,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
3,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
4,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers,691


In [67]:
plays_data.describe()

Unnamed: 0,plays
count,17535650.0
mean,215.1931
std,614.4813
min,0.0
25%,35.0
50%,94.0
75%,224.0
max,419157.0


In [68]:
plays_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17535654 entries, 0 to 17535653
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   user_id      object
 1   artist_id    object
 2   artist_name  object
 3   plays        int64 
dtypes: int64(1), object(3)
memory usage: 535.1+ MB


In [69]:
plays_data["plays"]

Unnamed: 0,plays
0,1099
1,897
2,717
3,706
4,691
...,...
17535649,12
17535650,11
17535651,11
17535652,10


In [70]:
plays_data["plays"].max()

419157

In [71]:
plays_data.columns

Index(['user_id', 'artist_id', 'artist_name', 'plays'], dtype='object')

In [72]:
plays_data["user_id"]

Unnamed: 0,user_id
0,00000c289a1829a808ac09c00daf10bc3c4e223b
1,00000c289a1829a808ac09c00daf10bc3c4e223b
2,00000c289a1829a808ac09c00daf10bc3c4e223b
3,00000c289a1829a808ac09c00daf10bc3c4e223b
4,00000c289a1829a808ac09c00daf10bc3c4e223b
...,...
17535649,"sep 20, 2008"
17535650,"sep 20, 2008"
17535651,"sep 20, 2008"
17535652,"sep 20, 2008"


In [73]:
plays_data["user_id"].nunique()

358868

In [74]:
plays_data["user_id"].unique()

array(['00000c289a1829a808ac09c00daf10bc3c4e223b',
       '00001411dc427966b17297bf4d69e7e193135d89',
       '00004d2ac9316e22dc007ab2243d6fcb239e707d', ...,
       'ffff9af9ae04d263dae91cb838b1f3a6725f5ffb',
       'ffff9ef87a7d9494ada2f9ade4b9ff637c0759ac', 'sep 20, 2008'],
      dtype=object)

In [75]:
plays_data["artist_name"]

Unnamed: 0,artist_name
0,die Ärzte
1,melissa etheridge
2,elvenking
3,juliette & the licks
4,red hot chili peppers
...,...
17535649,turbostaat
17535650,cuba missouri
17535651,little man tate
17535652,sigur rós


In [76]:
plays_data["artist_name"].nunique()

292363

In [77]:
plays_data["artist_name"].unique()

array(['die Ärzte', 'melissa etheridge', 'elvenking', ...,
       'sacred buddha', 'cat lunette', 'suzanina'], dtype=object)

In [78]:
plays_data.head()

Unnamed: 0,user_id,artist_id,artist_name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
1,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
2,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
3,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
4,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers,691


In [79]:
plays_data.drop("artist_name", axis=1, inplace=True)

In [80]:
plays_data.head()

Unnamed: 0,user_id,artist_id,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,1099
1,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,897
2,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,717
3,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,706
4,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,691


# Artist Data

In [82]:
artist_data = plays_data[["artist_id", "plays"]]

In [83]:
artist_data.head()

Unnamed: 0,artist_id,plays
0,f2fb0ff0-5679-42ec-a55c-15109ce6e320,1099
1,b3ae82c2-e60b-4551-a76d-6620f1b456aa,897
2,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,717
3,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,706
4,8bfac288-ccc5-448d-9573-c33ea2aa5c30,691


In [85]:
artist_aggr = artist_data\
              .groupby("artist_id", as_index=False)\
              .sum()

In [86]:
artist_aggr.head()

Unnamed: 0,artist_id,plays
0,00010eb3-ebfe-4965-81ef-0ac64cd49fde,517
1,0001cd84-2a11-4699-8d6b-0abf969c5f06,4991
2,0002260a-b298-48cc-9895-52c9425796b7,69
3,00026532-1fe3-45fb-a0df-34aec04a1319,321
4,00026d14-39c6-4f2d-b556-093233b5e714,709


In [87]:
artist_data.shape

(17535654, 2)

In [88]:
artist_aggr.shape

(160112, 2)

In [89]:
artist_aggr["artist_id"].nunique()

160112

In [90]:
artist_aggr.columns

Index(['artist_id', 'plays'], dtype='object')

# Clutering on Artist Data

In [95]:
from sklearn.cluster import KMeans

km1 = KMeans(n_clusters=10)
km1

In [96]:
artist_count = artist_aggr[["plays"]]

In [97]:
artist_count

Unnamed: 0,plays
0,517
1,4991
2,69
3,321
4,709
...,...
160107,36985
160108,2157
160109,30634
160110,166


In [98]:
km1.fit(artist_count)

In [104]:
label1 = km1.labels_
label1

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [106]:
# Unique labels in label1
np.unique(label1)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)

In [107]:
artist_aggr["cluster_id"] = label1
artist_aggr

Unnamed: 0,artist_id,plays,cluster_id
0,00010eb3-ebfe-4965-81ef-0ac64cd49fde,517,0
1,0001cd84-2a11-4699-8d6b-0abf969c5f06,4991,0
2,0002260a-b298-48cc-9895-52c9425796b7,69,0
3,00026532-1fe3-45fb-a0df-34aec04a1319,321,0
4,00026d14-39c6-4f2d-b556-093233b5e714,709,0
...,...,...,...
160107,fffed9ff-98c6-458a-8379-47e7fb4ba6ec,36985,0
160108,ffff01cd-0ae0-46c7-867b-d17d8d38cff8,2157,0
160109,ffff3742-4ae3-4e13-a29c-d4c164985a5b,30634,0
160110,ffff44bd-e5a5-4e87-8700-35481264e37d,166,0


In [108]:
artist_aggr[artist_aggr["cluster_id"] == 7]

Unnamed: 0,artist_id,plays,cluster_id
688,0110e63e-0a9b-4818-af8e-41e180c20b9a,2367133,7
817,013fa897-86db-41d3-8e9f-386c8a34f4e6,3338687,7
971,01809552-4f87-45b0-afff-2c6f0730a3be,3285873,7
1314,020bfbb4-05c3-4c86-b372-17825c262094,2250415,7
3360,05755bf1-380c-487f-983f-d1a02401fa28,4047899,7
...,...,...,...
156285,f9ef7a22-4262-4596-a2a8-1d19345b8e50,2461628,7
156604,fa7b9055-3703-473a-8a09-adf2fe031a24,2168716,7
156747,fabb37f8-eb2a-4cc1-a72a-b56935bbb72d,2114826,7
158338,fd429857-5ace-4609-ae54-1502c3bdac11,3025734,7
