<a href="https://colab.research.google.com/github/FredArgoX/ChaoticTest_PySpark/blob/main/02_GL_Spark_Music_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Source: [Great Learning](https://olympus.mygreatlearning.com/courses/31729/modules/items/879881?pb_id=581)

# Dataset Extraction

Taken from: http://ocelma.net/MusicRecommendationDataset/lastfm-1K.html

In [1]:
# Install gdown for large files management
!pip install -U gdown



In [2]:
# Download Music Dataset from: https://drive.google.com/file/d/1P_fmQgzHmDWbHaTmoymYt56aAoeqfLQK/view?usp=sharing
!gdown --id 1P_fmQgzHmDWbHaTmoymYt56aAoeqfLQK -O lastfm-dataset-1K.tar.gz

Downloading...
From (original): https://drive.google.com/uc?id=1P_fmQgzHmDWbHaTmoymYt56aAoeqfLQK
From (redirected): https://drive.google.com/uc?id=1P_fmQgzHmDWbHaTmoymYt56aAoeqfLQK&confirm=t&uuid=28fb0515-88e0-4e58-8b14-467341a3e894
To: /content/lastfm-dataset-1K.tar.gz
100% 673M/673M [00:08<00:00, 77.4MB/s]


In [3]:
# Extract the downloaded dataset
!tar -xzf lastfm-dataset-1K.tar.gz

In [4]:
import numpy as np
import pandas as pd

In [5]:
raw_data = pd.read_csv("lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv", delimiter="\t", header=None, on_bad_lines='skip')

In [6]:
raw_data.head()

Unnamed: 0,0,1,2,3,4,5
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


In [7]:
raw_data.describe()

Unnamed: 0,0,1,2,3,4,5
count,19098853,19098853,18498005,19098853,16936134,19098643
unique,992,17454730,107295,173921,960402,1083470
top,user_000949,2009-02-26T21:29:15Z,a74b1b7f-71a5-4011-9441-d0b5e4122711,Radiohead,db16d0b3-b8ce-4aa8-a11a-e4d53cc7f8a6,Intro
freq,183103,248,115099,115099,3991,17561


In [8]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19098853 entries, 0 to 19098852
Data columns (total 6 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   0       object
 1   1       object
 2   2       object
 3   3       object
 4   4       object
 5   5       object
dtypes: object(6)
memory usage: 874.3+ MB


# Basic Data Exploration for User Profile

In [11]:
user_profile = pd.read_csv("lastfm-dataset-1K/userid-profile.tsv", sep="\t")

In [12]:
user_profile.head()

Unnamed: 0,#id,gender,age,country,registered
0,user_000001,m,,Japan,"Aug 13, 2006"
1,user_000002,f,,Peru,"Feb 24, 2006"
2,user_000003,m,22.0,United States,"Oct 30, 2005"
3,user_000004,f,,,"Apr 26, 2006"
4,user_000005,m,,Bulgaria,"Jun 29, 2006"


In [13]:
user_profile.columns

Index(['#id', 'gender', 'age', 'country', 'registered'], dtype='object')

In [14]:
for col in user_profile.columns:
  print(col)

#id
gender
age
country
registered


In [15]:
user_profile["age"]

Unnamed: 0,age
0,
1,
2,22.0
3,
4,
...,...
987,
988,
989,
990,


In [16]:
user_profile["age"].max()

103.0

In [17]:
user_profile.describe()

Unnamed: 0,age
count,286.0
mean,25.367133
std,8.314952
min,3.0
25%,21.0
50%,23.0
75%,28.0
max,103.0


In [18]:
user_profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   #id         992 non-null    object 
 1   gender      884 non-null    object 
 2   age         286 non-null    float64
 3   country     907 non-null    object 
 4   registered  984 non-null    object 
dtypes: float64(1), object(4)
memory usage: 38.9+ KB


In [19]:
user_profile["gender"].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
m,502
f,382


In [20]:
user_profile["country"].value_counts()

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
United States,228
United Kingdom,126
Poland,50
Germany,36
Norway,35
...,...
Belarus,1
Tunisia,1
Iceland,1
Northern Mariana Islands,1


In [21]:
user_profile["registered"].value_counts()

Unnamed: 0_level_0,count
registered,Unnamed: 1_level_1
"Dec 6, 2005",6
"Dec 4, 2005",5
"Jan 22, 2006",5
"Dec 19, 2006",5
"Apr 18, 2006",5
...,...
"Jul 2, 2005",1
"May 22, 2006",1
"Oct 30, 2006",1
"Jan 5, 2007",1


In [22]:
user_profile.columns

Index(['#id', 'gender', 'age', 'country', 'registered'], dtype='object')

# Analytics