# Install Java, Spark, and Findspark
This installs Apache Spark 2.4.4, Java 8, and [Findspark](https://github.com/minrk/findspark), a library that makes it easy for Python to find Spark.

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [103]:
!pip install py4j
!pip install pyspark



# Set Environment Variables
Set the locations where Spark and Java are installed.

In [0]:
import os

# Record the install locations of Java and Spark in the process environment.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
})

# Start a SparkSession
This will start a local Spark session.

In [0]:
import os
import time

In [0]:
# spark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction, explode, desc
from pyspark.sql.types import StringType, ArrayType
from pyspark.mllib.recommendation import ALS

In [0]:
# data science imports
import math
import numpy as np
import pandas as pd

# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [0]:
import os

import findspark

# Use the absolute SPARK_HOME exported earlier in the notebook. Passing the bare
# directory name made the lookup depend on the current working directory and
# replaced the absolute SPARK_HOME with a relative one. The old relative path is
# kept only as a fallback for when the variable is not set.
findspark.init(os.environ.get("SPARK_HOME", "spark-2.4.4-bin-hadoop2.7"))

from pyspark.sql import SparkSession

# Local session using all available cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [109]:
from google.colab import drive

# Directory under which Google Drive is exposed inside the Colab VM.
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Copy the visible-evaluation triplets file from the mounted Drive into the current working directory.
!cp /content/gdrive/My\ Drive/MscsDs/msdchallenge/kaggle_visible_evaluation_triplets.txt .

In [0]:
# Recursively copy the MSongsDB-master PythonSrc directory from Drive into the current working directory.
!cp -r /content/gdrive/My\ Drive/MscsDs/MSongsDB-master/PythonSrc .

In [0]:
# ! cd /content/gdrive/My\ Drive/MscsDs/ && tar -zxvf millionsongsubset_full.tar.gz

In [112]:
# Read the tab-separated triplets file; column names are supplied explicitly.
users = pd.read_csv(
    './kaggle_visible_evaluation_triplets.txt',
    sep='\t',
    names=['user_id', 'song_ids', 'play_count'],
)
users.head()

Unnamed: 0,user_id,song_ids,play_count
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1


In [113]:
from pandas import DataFrame

# Build a frame restricted to the three triplet columns and write it out as CSV.
triplet_columns = ['user_id', 'song_ids', 'play_count']
temp = DataFrame(users, columns=triplet_columns)
# to_csv is given a path here, so export_csv just holds its return value.
export_csv = temp.to_csv(r'./triplets.csv', index=None, header=True)
print(temp)

                                          user_id  ... play_count
0        fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  ...          1
1        fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  ...          1
2        fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  ...          1
3        fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  ...          1
4        fd50c4007b68a3737fe052d5a4f78ce8aa117f3d  ...          1
...                                           ...  ...        ...
1450928  5e650759ebf89012044c6d52121eeada8b0ec814  ...          1
1450929  5e650759ebf89012044c6d52121eeada8b0ec814  ...          2
1450930  5e650759ebf89012044c6d52121eeada8b0ec814  ...          2
1450931  5e650759ebf89012044c6d52121eeada8b0ec814  ...          2
1450932  5e650759ebf89012044c6d52121eeada8b0ec814  ...          3

[1450933 rows x 3 columns]


In [114]:
from sklearn.preprocessing import LabelEncoder

# `frame` is an alias of `temp` (not a copy): the columns added below appear on both names.
frame = temp

# Encode the string user ids as integer codes (used as the ALS user column further down).
lb_make = LabelEncoder()
frame["user_id_code"] = lb_make.fit_transform(frame["user_id"])
# Only the last expression of a cell is rendered, so this intermediate preview
# must be shown explicitly; `display` is provided by the IPython kernel.
display(frame[["user_id", "user_id_code"]].head(11))

# Same encoding for the song ids (used as the ALS item column further down).
lb_make2 = LabelEncoder()
frame["song_id_code"] = lb_make2.fit_transform(frame["song_ids"])
frame[["song_ids", "song_id_code"]].head(11)


Unnamed: 0,song_ids,song_id_code
0,SOBONKR12A58A7A7E0,10546
1,SOEGIYH12A6D4FC0E3,28684
2,SOFLJQZ12A6D4FADA6,36622
3,SOHTKMO12AB01843B0,51861
4,SODQZCY12A6D4F9D11,24663
5,SOXLOQG12AF72A2D55,148312
6,SOUVUHC12A67020E3B,133075
7,SOUQERE12A58A75633,131771
8,SOIPJAX12A8C141A2D,57332
9,SOEFCDJ12AB0185FA0,28367


In [116]:
# Inspect the triplets together with the two encoded id columns.
frame

Unnamed: 0,user_id,song_ids,play_count,user_id_code,song_id_code
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1,108811,10546
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1,108811,28684
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1,108811,36622
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1,108811,51861
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1,108811,24663
...,...,...,...,...,...
1450928,5e650759ebf89012044c6d52121eeada8b0ec814,SOVLNXV12A6D4F706E,1,40490,136574
1450929,5e650759ebf89012044c6d52121eeada8b0ec814,SOVDSJC12A58A7A271,2,40490,134757
1450930,5e650759ebf89012044c6d52121eeada8b0ec814,SOBRHVR12A8C133F35,2,40490,11310
1450931,5e650759ebf89012044c6d52121eeada8b0ec814,SOMGVYU12A8C1314FF,2,40490,80823


In [0]:
# Write the frame (now including the encoded id columns) to ./triplets.csv,
# the same path used by the earlier export.
export_csv = frame.to_csv(
    r'./triplets.csv',
    index=None,
    header=True,
)

In [0]:
# spark imports
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction, explode, desc
from pyspark.sql.types import StringType, ArrayType

In [0]:
# An earlier cell already created a session with master "local[*]".
# Builder.getOrCreate() returns an existing session as-is, so the master,
# app name and driver settings below would be silently ignored. Stop that
# session first so a new SparkContext is built from this configuration.
SparkSession.builder.getOrCreate().stop()

# NOTE(review): spark.driver.memory cannot be changed once the driver JVM is
# running, so it probably still has no effect here unless this is the first
# session created in the kernel. 96g also looks far above what a Colab VM
# offers — confirm the intended values.
spark = (
    SparkSession.builder
    .appName("Music Recommender System")
    .config("spark.driver.maxResultSize", "96g")
    .config("spark.driver.memory", "96g")
    .config("spark.executor.memory", "8g")
    .config("spark.master", "local[12]")
    .getOrCreate()
)
# get spark context
sc = spark.sparkContext

In [0]:
# Load the exported triplets into a Spark DataFrame, taking column names from
# the header row and letting Spark infer the column types.
csv_reader = spark.read.format('csv').options(header=True, inferSchema=True)
df = csv_reader.load('./triplets.csv')
df.show(10, truncate=False)

+----------------------------------------+------------------+----------+------------+------------+
|user_id                                 |song_ids          |play_count|user_id_code|song_id_code|
+----------------------------------------+------------------+----------+------------+------------+
|fd50c4007b68a3737fe052d5a4f78ce8aa117f3d|SOBONKR12A58A7A7E0|1         |108811      |10546       |
|fd50c4007b68a3737fe052d5a4f78ce8aa117f3d|SOEGIYH12A6D4FC0E3|1         |108811      |28684       |
|fd50c4007b68a3737fe052d5a4f78ce8aa117f3d|SOFLJQZ12A6D4FADA6|1         |108811      |36622       |
|fd50c4007b68a3737fe052d5a4f78ce8aa117f3d|SOHTKMO12AB01843B0|1         |108811      |51861       |
|fd50c4007b68a3737fe052d5a4f78ce8aa117f3d|SODQZCY12A6D4F9D11|1         |108811      |24663       |
|fd50c4007b68a3737fe052d5a4f78ce8aa117f3d|SOXLOQG12AF72A2D55|1         |108811      |148312      |
|d7083f5e1d50c264277d624340edaaf3dc16095b|SOUVUHC12A67020E3B|1         |92390       |133075      |
|d7083f5e1

In [0]:
# indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in list(set(df.columns)-set(['overall'])) ]
# pipeline = Pipeline(stages=indexer)
# transformed = pipeline.fit(df).transform(df)
# transformed.show(truncate=False)

In [0]:
# user = list(transformed.select('user_id_index').distinct().sort(transformed["user_id_index"])) # Get our unique customers
# song = list(transformed.select('song_ids_index').distinct().sort(transformed["song_ids_index"])) # Get our unique products that were purchased
# playCount = list(transformed.select('play_count')) # All of our purchases

# # rows = transformed.select('user_id_index').astype('category', categories = user).cat.codes 
# # # Get the associated row indices
# # cols = transformed.song_ids_index.astype('category', categories = song).cat.codes 
# # Get the associated column indices
# # playCount_sparse = sparse.csr_matrix((playCount, (user, song)), shape=(len(user), len(song)))


In [0]:
# 80/20 train/test split; the fixed seed makes the split repeatable across runs.
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

In [0]:
# ALS matrix factorisation on the integer-encoded ids, with play_count as the rating.
# coldStartStrategy="drop" removes rows whose prediction would be NaN.
# The fixed seed makes the factor initialisation repeatable across runs.
# NOTE(review): play counts look like implicit feedback; implicitPrefs is left at
# its default here — confirm that this is intended.
als = ALS(
    maxIter=5,
    regParam=0.09,
    rank=25,
    userCol="user_id_code",
    itemCol="song_id_code",
    ratingCol="play_count",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=42,
)
model = als.fit(training)

In [0]:
# Score the held-out split with the fitted model and preview 50 predictions.
predictions = model.transform(test)

# RMSE evaluator comparing the "prediction" column with play_count.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="play_count",
    predictionCol="prediction",
)

predictions.show(50, truncate=False)

+----------------------------------------+------------------+----------+------------+------------+----------+
|user_id                                 |song_ids          |play_count|user_id_code|song_id_code|prediction|
+----------------------------------------+------------------+----------+------------+------------+----------+
|8982c3890ef7a2978292a3c986e61cb914723261|SOAAOQV12A58A7FDFA|1         |59044       |148         |1.0275742 |
|19faa693156a63d3956f9bea38bd1ef594249144|SOABVCK12A6D4F8A92|2         |11215       |471         |0.5101284 |
|ba6186342cce189527238d6961d782a78982ae8f|SOABVCK12A6D4F8A92|2         |80079       |471         |0.30069754|
|cae426aea0ddb4132d197c6ca6ff8281d75a4954|SOADGAX12A8151B896|2         |87115       |833         |1.0532596 |
|cab90d247ebe4fa99117b686e40acbfd58ca2ca0|SOAKBZR12AC468D1DD|1         |87050       |2659        |1.1064874 |
|8439c1a5fbd468d956fceb15a7495c08bc51c54c|SOAOGOV12A8C13ACED|7         |56813       |3749        |0.71650064|
|55b972d3b

In [0]:
# rmse = evaluator.evaluate(predictions)

In [0]:
# print("Root-mean-square error = " + str(rmse))

In [0]:
# user_recs=model.recommendForAllUsers(10).show(10, truncate = False)

+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id_index|recommendations                                                                                                                                                                                              |
+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|148          |[[132559, 128.63263], [147680, 83.976524], [66419, 82.88117], [125845, 76.72285], [82226, 76.19292], [39411, 76.14007], [162509, 75.64509], [99909, 63.91176], [98323, 63.563046], [117480, 62.751663]]      |
|463          |[[50694, 81.836334], [39411, 64.48308], [147680, 64.23534], [122151, 61.286102], [91642, 56.23076

In [0]:
# Top-10 recommendations for every user, collected into a pandas DataFrame.
all_user_recs = model.recommendForAllUsers(10)
recs = all_user_recs.toPandas()

In [0]:
# Inspect the per-user recommendation table.
recs

Unnamed: 0,user_id_code,recommendations
0,148,"[(86930, 164.18919372558594), (130976, 143.074..."
1,463,"[(140084, 59.36147689819336), (20335, 54.54523..."
2,471,"[(140084, 346.9989013671875), (20335, 337.6782..."
3,496,"[(86930, 102.5612564086914), (20335, 101.52572..."
4,833,"[(39155, 280.4085388183594), (130976, 217.9136..."
...,...,...
109984,108934,"[(86930, 162.45037841796875), (3934, 160.69415..."
109985,108946,"[(140084, 271.0825500488281), (20335, 264.0805..."
109986,108961,"[(140084, 114.94924926757812), (20335, 99.6657..."
109987,109144,"[(20335, 165.97198486328125), (86930, 129.8025..."


In [0]:
# (rows, columns) of the recommendation table.
recs.shape

(109989, 2)

In [0]:
# Song metadata CSV stored on the mounted Drive.
msd_csv_path = '/content/gdrive/My Drive/MscsDs/MillionSongSubset/msd.csv'
song_df = pd.read_csv(msd_csv_path)

In [0]:
# Preview the first rows of the song metadata.
song_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,song_ids,song_titles,song_durations,song_realease_years,artist_names,song_hotness,song_tempo,song_energies,song_danceability,song_bars,song_beats,song_time_signatures,song_tatum,song_modes,song_keys,artist_familiarty,artist_hotness
0,0,0,SONHTWP12A8C142744,Vals '89,162.58567,0,Manolis Famellos & I Podilates,,94.301,0.0,0.0,[ 1.45502 3.39383 5.32826 7.25953 9.1...,[ 0.16441 0.80395 1.45502 2.09102 2.7...,3,[ 0.16441 0.48418 0.80395 1.13522 1.4...,1,0,0.22873,0.0
1,1,1,SOPIHMB12AC468E0DD,Ghost of the Ocean (Live),214.25587,0,Uriah Heep,0.2502,149.921,0.0,0.0,[ 1.20218 2.80457 4.39266 5.98514 7.5...,[ 0.38752 0.79653 1.20218 1.60244 2.0...,4,[ 0.2523 0.38752 0.52273 ... 212.97102 2...,1,7,0.673239,0.479205
2,2,2,SODRPJT12AC468DEF1,The frog song,169.87383,0,Alain-François,,136.044,0.0,0.0,[ 1.37397 4.48312 7.56804 10.65675 13.7...,[ 0.48609 0.93058 1.37397 1.81958 2.2...,7,[ 0.26328 0.48609 0.70778 0.93058 1.1...,1,2,0.380317,0.300918
3,3,3,SOLLOTO12AB01804C6,Deep Blue Sea (Daniel Rossen home recording),351.50322,2008,Grizzly Bear,,85.003,0.0,0.0,[ 0.72449 3.55798 6.37712 9.20176 12.0...,[ 0.72449 1.43405 2.1445 2.85228 3.5...,4,[1.8653000e-01 3.6881000e-01 5.4665000e-01 ......,1,9,0.760636,0.547244
4,4,4,SOCNYGS12AB01832B8,La vérité,252.44689,2007,Annie Blanchard,0.0,113.287,0.0,0.0,[2.2334000e-01 2.3324800e+00 4.4484000e+00 6.5...,[2.2334000e-01 7.5000000e-01 1.2775800e+00 1.8...,4,[2.2334000e-01 3.9890000e-01 5.7445000e-01 ......,1,5,0.479735,0.35246


In [119]:
# Build both lookups once. The original filtered the full `frame` and `song_df`
# with a boolean mask for every recommended item.

# song_id_code -> song id, taking the first row for each code.
song_id_by_code = (
    frame.drop_duplicates('song_id_code')
    .set_index('song_id_code')['song_ids']
    .to_dict()
)
# Metadata for song ids that occur exactly once in song_df; keep=False drops
# every duplicated id, matching the original `shape[0] == 1` condition.
unique_song_meta = (
    song_df.drop_duplicates('song_ids', keep=False)
    .set_index('song_ids')[['song_titles', 'artist_names']]
)

count = 0
song_id_found = set()
for recValue in recs.values:
    # recValue[0] is the user code, recValue[1] that user's recommendations.
    for item in recValue[1]:
        song_id = song_id_by_code[item.song_id_code]
        # Print a song only once, and only if metadata is available for it.
        if song_id not in song_id_found and song_id in unique_song_meta.index:
            rowDetails = unique_song_meta.loc[song_id]
            print(" Recommendation to user ", recValue[0], " : ")
            print("   Song: ", rowDetails['song_titles'])
            print("   Artist: ", rowDetails['artist_names'])
            song_id_found.add(song_id)
            count += 1

    # Checked once per user: stop after the user that brings the total above one.
    if count > 1:
        break

 Recommendation to user  148  : 
   Song:  Seed Will Grow
   Artist:  Ms. Dynamite / Kymani Marley
 Recommendation to user  1088  : 
   Song:  Till There Was You (John Creamer & Stephane K Remix)
   Artist:  Rachael Starr


In [118]:
# Encoded user id to report on; defined once so the heading and the filter cannot drift apart.
user_code = 148

print(f" Recommendation to user {user_code} : ")
for recom in recs.loc[recs['user_id_code'] == user_code, 'recommendations']:
    for item in recom:
        # Map the encoded song id back to the original song id.
        song_id = frame.loc[frame['song_id_code'] == item.song_id_code, 'song_ids'].values[0]
        print("\n   Song ID: ", song_id)
        # Look the song up once; show title/artist only when exactly one metadata row matches.
        rowDetails = song_df.loc[song_df['song_ids'] == song_id]
        if rowDetails.shape[0] == 1:
            print("     Song: ", rowDetails.song_titles.values[0])
            print("     Artist: ", rowDetails.artist_names.values[0])

 Recommendation to user 148 : 

   Song ID:  SONGFCZ12AB01859AD

   Song ID:  SOUMPJA12AAF3B3FD0

   Song ID:  SOUEXOI12AB0189BA4

   Song ID:  SODALYQ12AB017EED4

   Song ID:  SOWAWSK12A8C1383B2

   Song ID:  SOAOXTG12A67021AA7
     Song:  Seed Will Grow
     Artist:  Ms. Dynamite / Kymani Marley

   Song ID:  SODJYMH12A8AE46EA5

   Song ID:  SOTFIZA12A8C13D80F

   Song ID:  SORJSQI12A6701D62D

   Song ID:  SOXTUWG12AB018A2E2


In [117]:
# Encoded user id to report on; defined once so the heading and the filter cannot drift apart.
user_code = 108811

print(f" Recommendation to user {user_code} : ")
for recom in recs.loc[recs['user_id_code'] == user_code, 'recommendations']:
    for item in recom:
        # Map the encoded song id back to the original song id.
        song_id = frame.loc[frame['song_id_code'] == item.song_id_code, 'song_ids'].values[0]
        print("\n   Song ID: ", song_id)
        # Look the song up once; show title/artist only when exactly one metadata row matches.
        rowDetails = song_df.loc[song_df['song_ids'] == song_id]
        if rowDetails.shape[0] == 1:
            print("     Song: ", rowDetails.song_titles.values[0])
            print("     Artist: ", rowDetails.artist_names.values[0])

 Recommendation to user 108811 : 

   Song ID:  SONGFCZ12AB01859AD

   Song ID:  SODALYQ12AB017EED4

   Song ID:  SOUMPJA12AAF3B3FD0

   Song ID:  SOFVLYV12A8C145D8F

   Song ID:  SOTFIZA12A8C13D80F

   Song ID:  SOUEXOI12AB0189BA4

   Song ID:  SORJSQI12A6701D62D

   Song ID:  SOWAWSK12A8C1383B2

   Song ID:  SOJLDIQ12A58A7E506

   Song ID:  SOLLPYO12A8C13B38C
