<a href="https://colab.research.google.com/github/Adil1979/AI_DS_Projects/blob/master/recommend_music.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Recommendation Music**

install pyspark

In [1]:
from google.colab import drive


In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 74kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 20.7MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=e3ea4aa8a2821dde440309825688d5fb05b3b1026877ec489a1fa534a42ed575
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


Importing the modules

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, desc , col, max
from pyspark.ml.feature import  StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

Creating the spark session


In [6]:
spark  = SparkSession.builder.appName('lastfm').getOrCreate()

# Loading the dataset

In [7]:
file_path = '/content/drive/MyDrive/Pyspark/listenings.csv'
df_listenings = spark.read.format('csv').option('header', True).option('inferSchema', True).load(file_path)
df_listenings.show()

+-----------+-------------+--------------------+---------------+--------------------+
|    user_id|         date|               track|         artist|               album|
+-----------+-------------+--------------------+---------------+--------------------+
|000Silenced|1299680100000|           Price Tag|       Jessie J|         Who You Are|
|000Silenced|1299679920000|Price Tag (Acoust...|       Jessie J|           Price Tag|
|000Silenced|1299679440000|Be Mine! (Ballad ...|          Robyn|            Be Mine!|
|000Silenced|1299679200000|            Acapella|          Kelis|            Acapella|
|000Silenced|1299675660000|   I'm Not Invisible|      The Tease|   I'm Not Invisible|
|000Silenced|1297511400000|Bounce (Feat NORE...|       MSTRKRFT|         Fist of God|
|000Silenced|1294498440000|Don't Stop The Mu...|        Rihanna|Addicted 2 Bassli...|
|000Silenced|1292438340000|               ObZen|      Meshuggah|               ObZen|
|000Silenced|1292437740000|   Yama's Messengers|      


# Cleaning tables 

In [8]:
df_listenings = df_listenings.drop('date')
df_listenings.show()

+-----------+--------------------+---------------+--------------------+
|    user_id|               track|         artist|               album|
+-----------+--------------------+---------------+--------------------+
|000Silenced|           Price Tag|       Jessie J|         Who You Are|
|000Silenced|Price Tag (Acoust...|       Jessie J|           Price Tag|
|000Silenced|Be Mine! (Ballad ...|          Robyn|            Be Mine!|
|000Silenced|            Acapella|          Kelis|            Acapella|
|000Silenced|   I'm Not Invisible|      The Tease|   I'm Not Invisible|
|000Silenced|Bounce (Feat NORE...|       MSTRKRFT|         Fist of God|
|000Silenced|Don't Stop The Mu...|        Rihanna|Addicted 2 Bassli...|
|000Silenced|               ObZen|      Meshuggah|               ObZen|
|000Silenced|   Yama's Messengers|         Gojira|The Way of All Flesh|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For No...|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For

In [9]:
df_listenings = df_listenings.na.drop()
df_listenings.show()

+-----------+--------------------+---------------+--------------------+
|    user_id|               track|         artist|               album|
+-----------+--------------------+---------------+--------------------+
|000Silenced|           Price Tag|       Jessie J|         Who You Are|
|000Silenced|Price Tag (Acoust...|       Jessie J|           Price Tag|
|000Silenced|Be Mine! (Ballad ...|          Robyn|            Be Mine!|
|000Silenced|            Acapella|          Kelis|            Acapella|
|000Silenced|   I'm Not Invisible|      The Tease|   I'm Not Invisible|
|000Silenced|Bounce (Feat NORE...|       MSTRKRFT|         Fist of God|
|000Silenced|Don't Stop The Mu...|        Rihanna|Addicted 2 Bassli...|
|000Silenced|               ObZen|      Meshuggah|               ObZen|
|000Silenced|   Yama's Messengers|         Gojira|The Way of All Flesh|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For No...|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For

In [11]:
row_numbers = df_listenings.count()
column_numbers = len(df_listenings.columns)
print(row_numbers, column_numbers)

13758905 4



# Let's Perform some aggregation
to see how many times each user has listened to specific track


In [12]:
df_listenings_agg = df_listenings.select('user_id', 'track').groupBy('user_id', 'track').agg(count('*')).orderBy('user_id')
df_listenings_agg.show()

+-------+--------------------+--------+
|user_id|               track|count(1)|
+-------+--------------------+--------+
| --Seph|Chelsea Hotel - L...|       1|
| --Seph|               Leloo|       1|
| --Seph|          Paris 2004|       7|
| --Seph|     The Way We Were|       1|
| --Seph|        Window Blues|       1|
| --Seph|   Summa for Strings|       1|
| --Seph|         The Embrace|       1|
| --Seph|       Life On Mars?|       1|
| --Seph|Hungarian Rhapsod...|       1|
| --Seph| Air on the G String|       1|
| --Seph|Vestido Estampado...|       1|
| --Seph|Belina (Original ...|       1|
| --Seph|Hungarian Dance No 5|       1|
| --Seph|       Phantom Pt II|       1|
| --Seph|              Monday|       1|
| --Seph| White Winter Hymnal|       3|
| --Seph|Airplanes [feat H...|       1|
| --Seph|  California Waiting|       1|
| --Seph|      Hour for magic|       2|
| --Seph|Virus (Luke Fair ...|       1|
+-------+--------------------+--------+
only showing top 20 rows



In [13]:
row_numbers = df_listenings_agg.count()
column_numbers = len(df_listenings_agg.columns)
print(row_numbers, column_numbers)

9930128 3


In [14]:
df_listenings_agg = df_listenings_agg.limit(20000)

# Let's convert the user id and track columns into unique integers




In [16]:
indexer = [StringIndexer(inputCol=col, outputCol=col+'_index').fit(df_listenings_agg) for col in list(set(df_listenings_agg.columns) - set(['count']))]
pipeline = Pipeline(stages=indexer)
data = pipeline.fit(df_listenings_agg).transform(df_listenings_agg)
data.show()

+-------+--------------------+--------+-----------+-------------+--------------+
|user_id|               track|count(1)|track_index|user_id_index|count(1)_index|
+-------+--------------------+--------+-----------+-------------+--------------+
| --Seph| White Winter Hymnal|       3|       59.0|         69.0|           2.0|
| --Seph|Virus (Luke Fair ...|       1|    15896.0|         69.0|           0.0|
| --Seph|Airplanes [feat H...|       1|      519.0|         69.0|           0.0|
| --Seph|Belina (Original ...|       1|     3278.0|         69.0|           0.0|
| --Seph|              Monday|       1|      334.0|         69.0|           0.0|
| --Seph|Hungarian Dance No 5|       1|     7558.0|         69.0|           0.0|
| --Seph|       Life On Mars?|       1|     1161.0|         69.0|           0.0|
| --Seph|  California Waiting|       1|      197.0|         69.0|           0.0|
| --Seph|       Phantom Pt II|       1|     1377.0|         69.0|           0.0|
| --Seph|   Summa for String

In [18]:
data = data.select('user_id_index', 'track_index', 'count(1)').orderBy('user_id_index')

In [19]:
data.show()

+-------------+-----------+--------+
|user_id_index|track_index|count(1)|
+-------------+-----------+--------+
|          0.0|    10943.0|       1|
|          0.0|    11628.0|       2|
|          0.0|     1349.0|       1|
|          0.0|      381.0|       1|
|          0.0|     8692.0|       1|
|          0.0|     6899.0|       1|
|          0.0|    14044.0|       1|
|          0.0|    15513.0|       1|
|          0.0|    11978.0|       2|
|          0.0|    15176.0|       1|
|          0.0|     8305.0|       1|
|          0.0|    13722.0|       1|
|          0.0|    10620.0|       1|
|          0.0|     4424.0|       1|
|          0.0|    16732.0|       1|
|          0.0|    10630.0|       1|
|          0.0|    12169.0|       1|
|          0.0|     4117.0|       1|
|          0.0|    10336.0|       1|
|          0.0|    16829.0|       1|
+-------------+-----------+--------+
only showing top 20 rows



# Train and Test data

In [26]:
(training, test) = data.randomSplit([0.5,0.5])

# Let's Create our Model

In [27]:
USERID = 'user_id_index'
TRACK = 'track_index'
COUNT = 'count(1)'

als = ALS(maxIter=5, regParam=0.01, userCol=USERID, itemCol=TRACK, ratingCol=COUNT)
model = als.fit(training)

predictions = model.transform(test)


# Generate top 10 Track recommendations for each user

In [28]:
recs = model.recommendForAllUsers(10)

In [29]:
recs.show()

+-------------+--------------------+
|user_id_index|     recommendations|
+-------------+--------------------+
|           31|[{11941, 39.95844...|
|           85|[{11941, 23.73517...|
|          137|[{4461, 10.181214...|
|           65|[{4461, 8.182498}...|
|           53|[{11941, 23.28130...|
|          133|[{11941, 21.76717...|
|           78|[{14301, 13.15504...|
|          108|[{14826, 14.95829...|
|           34|[{9500, 11.131841...|
|          101|[{182, 7.6971474}...|
|          115|[{11941, 12.69868...|
|          126|[{273, 13.0139885...|
|           81|[{11941, 33.07206...|
|           28|[{9500, 14.682959...|
|           76|[{12061, 10.56286...|
|           26|[{1738, 10.175392...|
|           27|[{11941, 31.72511...|
|           44|[{1693, 8.793795}...|
|          103|[{11941, 26.95999...|
|           12|[{1738, 12.177852...|
+-------------+--------------------+
only showing top 20 rows



In [30]:
recs.take(1)

[Row(user_id_index=31, recommendations=[Row(track_index=11941, rating=39.958438873291016), Row(track_index=12192, rating=7.958272457122803), Row(track_index=14826, rating=7.8038649559021), Row(track_index=17141, rating=6.939486026763916), Row(track_index=11878, rating=6.936768054962158), Row(track_index=9557, rating=6.936768054962158), Row(track_index=8496, rating=6.936768054962158), Row(track_index=10339, rating=6.936768054962158), Row(track_index=14236, rating=6.479747295379639), Row(track_index=1688, rating=6.220320701599121)])]