In [1]:
sc

In [2]:
#restore session notebook
#import dill
#dill.load_session("notebook_env.db")

In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from time import time

In [5]:
#lines = spark.read.csv("gs://dataset-rs/dataset-last.fm/dataset_lastfm.csv", header="true",inferSchema="true").rdd
lines = spark.read.csv("/home/aleja/Documentos/datasets/meu-lastfm/dataset_lastfm_v3_ConvertIds.csv", header="true",inferSchema="true").rdd

lines.take(2)

[Row(_c0=0, artist_name='Kenny Rogers', name='fernandomaiscal', track_name='All I Ever Need Is You (feat. Dottie West)', track_playcount='110', new_userId='0', new_songId=90406, new_artistId=31083),
 Row(_c0=1, artist_name='Geraldo Azevedo', name='fernandomaiscal', track_name='Dia Branco', track_playcount='53', new_userId='0', new_songId=242791, new_artistId=2956)]

In [6]:
#Create dataframe
ratings = spark.createDataFrame(lines) 
#ratings = ratings.withColumn("topTracks_playcount", col("topTracks_playcount").cast('int'))

#50% Sample of data 
df_ratings = ratings.sample(False,fraction=0.5, seed=1)
#Drop NaN data 
df_ratings = df_ratings.na.drop()
df_ratings.select('new_userId').count()

1677020

In [7]:
df_ratings.show(5)

+---+------------+---------------+--------------------+---------------+----------+----------+------------+
|_c0| artist_name|           name|          track_name|track_playcount|new_userId|new_songId|new_artistId|
+---+------------+---------------+--------------------+---------------+----------+----------+------------+
|  0|Kenny Rogers|fernandomaiscal|All I Ever Need I...|            110|         0|     90406|       31083|
|  2| John Lennon|fernandomaiscal|    Gimme Some Truth|             49|         0|     17860|       11512|
|  3| David Bowie|fernandomaiscal|  Station to Station|             48|         0|    322953|       12565|
|  6| David Bowie|fernandomaiscal|       Life on Mars?|             42|         0|    318742|       12565|
|  8|Led Zeppelin|fernandomaiscal|Since I've Been L...|             40|         0|     96039|       34497|
+---+------------+---------------+--------------------+---------------+----------+----------+------------+
only showing top 5 rows



In [8]:
#changes the names of rows
df_ratings =  df_ratings.selectExpr("new_userId as userId","name as name", "new_songId as songId","track_name as track_name","new_artistId as artistId", "artist_name as artist_name","track_playcount as track_playcount")


In [9]:
# transform the type of data
df_ratings = df_ratings.withColumn("userId", col("userId").cast("int"))
df_ratings = df_ratings.withColumn("songId", col("songId").cast("int"))
df_ratings = df_ratings.withColumn("artistId", col("artistId").cast("int"))
df_ratings = df_ratings.withColumn("track_playcount", col("track_playcount").cast("int"))

In [10]:
## How many distinct users in data ?

uniqueUsers = df_ratings.select('userId').distinct().count()
print("Total n. of users: ", uniqueUsers)

## How many distinct artists in data ?

uniqueArtists  = df_ratings.select("artistId").distinct().count()
print("Total n. of artists: ", uniqueArtists)

## How many distinct music in data ?

uniqueSongs  = df_ratings.select("songId").distinct().count()
print("Total n. of songs: ", uniqueSongs)


Total n. of users:  16054
Total n. of artists:  33876
Total n. of songs:  274431


In [11]:
#select users to play a song more than 10 times and less to 300 
raw_plays_df_2more_plays = df_ratings.filter(df_ratings.track_playcount >= 10).distinct()

tot_entries_2more = raw_plays_df_2more_plays.count()
print('Total enties with two or more plays: {0}'.format(tot_entries_2more))

raw_plays_df_2more_plays = raw_plays_df_2more_plays.filter(raw_plays_df_2more_plays.userId < (uniqueUsers)) \
                                                   .select('userId',"name", 'songId',"track_name",'artistId',"artist_name", 'track_playcount').orderBy('track_playcount',ascending=False) 
raw_plays_df_2more_plays.cache() 


Total enties with two or more plays: 1428544


DataFrame[userId: int, name: string, songId: int, track_name: string, artistId: int, artist_name: string, track_playcount: int]

In [12]:

df_ratings_less100 = raw_plays_df_2more_plays.filter(raw_plays_df_2more_plays.track_playcount <= 300).distinct()
df_ratings_less100.count()


1385888

### create music data

In [13]:
music_data =  df_ratings_less100.selectExpr("songId","track_name","artistId","artist_name")
music_data

DataFrame[songId: int, track_name: string, artistId: int, artist_name: string]

In [14]:
sub_rating_data = df_ratings_less100.select("userId","songId","track_playcount")
sub_rating_data = sub_rating_data.na.drop()
sub_rating_data.orderBy(col('userId'),col('songId')).show(10)

+------+------+---------------+
|userId|songId|track_playcount|
+------+------+---------------+
|     0|  2000|             16|
|     0|  2048|             22|
|     0|  2745|             16|
|     0|  3450|             23|
|     0|  4927|             30|
|     0|  9298|             23|
|     0| 10458|             22|
|     0| 11642|             23|
|     0| 13482|             13|
|     0| 13993|             26|
+------+------+---------------+
only showing top 10 rows



### Train and test data

In [15]:
(training, test) = sub_rating_data.randomSplit([0.8, 0.2])
als = ALS(rank=15, maxIter=10, regParam=0.01, alpha=0.5, implicitPrefs=True, userCol="userId", itemCol="songId", ratingCol="track_playcount", coldStartStrategy="drop")
model = als.fit(training)

In [16]:
##Predictions-  test model
predictions = model.transform(test)
predictions.printSchema()
predictions.show()


root
 |-- userId: integer (nullable = true)
 |-- songId: integer (nullable = true)
 |-- track_playcount: integer (nullable = true)
 |-- prediction: float (nullable = true)

+------+------+---------------+-----------+
|userId|songId|track_playcount| prediction|
+------+------+---------------+-----------+
| 12054|   148|             66|0.046013635|
|   701|   148|             35|0.009146986|
|  8135|   471|             75|0.115864806|
|  4078|   833|             21| 0.49627587|
|  4126|   833|             78|  0.6293227|
|  6732|   833|             52|  0.5320421|
|  3885|   833|             70| 0.73477376|
|  4669|   833|             85| 0.32634944|
|  3814|   833|             10| 0.19078287|
|  4561|   833|            100| 0.77257174|
|  6779|   833|             57|  0.6971461|
| 13442|   833|             88|  0.2924518|
|  4808|   833|             33|  0.6063319|
|  4578|   833|             73|  0.5835113|
|  8749|   833|             45| 0.49052268|
|  4406|   833|             85|  0.

In [17]:
#p1 =  predictions
#t = p1.groupBy("new_userId","new_songId").count()
#t.orderBy("new_userId","count", ascending=False).show()



In [18]:
evaluator = RegressionEvaluator(labelCol="track_playcount", predictionCol="prediction")
rmse = evaluator.evaluate(predictions.na.drop(), {evaluator.metricName :"rmse"})
mae =  evaluator.evaluate(predictions.na.drop(), {evaluator.metricName :"mae"})
print("Root-mean-square error = " + str(rmse))
print("mae = " + str(mae))



Root-mean-square error = 80.04082692826245
mae = 61.026013159053655


## Función para recomendación

### Recomendação personalizada para um Usuario

In [19]:
from pyspark.sql.functions import lit



def recommendMusic(model, user, nbRecommendations):
     # Create a Spark DataFrame with the specified user and all the songs ratings in DataFrame
    dataSet = df_ratings_less100.select("songId").distinct().withColumn("userId", lit(user))

    # Create a Spark DataFrame with the movies that have already been rated by this user
    musicAlreadyRated = df_ratings_less100.filter(df_ratings_less100.userId == user).select("songId", "userId")
    #subtrama
    #sub_musicAlreadyRated =  musicAlreadyRated.sample(False,fraction=0.5, seed=1)
    #musicInclude = musicAlreadyRated.subtract(sub_musicAlreadyRated)
    #musicInclude_2 = musicInclude.join(music_data, musicInclude.new_songId == music_data.new_songId).distinct().select(musicInclude.new_songId, music_data.topTracks_name, music_data.topTracks_artist_name)
    #print ("music rated that was include to predictions:")
    #musicInclude_2.show(50)
    # Apply the recommender system to the data set without the already rated movies to predict ratings
    predictions = model.transform(dataSet.subtract(musicAlreadyRated)).dropna().select("songId", "prediction").orderBy("prediction", ascending=False).limit(nbRecommendations)
    
    # Join with the ratings DataFrame to get the music titles and genres
    recommendations = predictions.join(music_data, predictions.songId == music_data.songId).distinct().select(predictions.songId, music_data.track_name, music_data.artist_name, predictions.prediction).orderBy("prediction", ascending=False)
    recommendations.show(truncate=False)

### musicas escutadas pelo usuario

In [20]:
### songs listened - user
def songs_listened(userId, Nsongs):
    musicAlreadyRated = df_ratings_less100.filter(df_ratings_less100.userId == userId).select("songId", "userId","track_playcount").distinct()
    music_listened =  musicAlreadyRated.join(music_data, musicAlreadyRated.songId == music_data.songId).select(musicAlreadyRated.songId, music_data.track_name, music_data.artist_name,musicAlreadyRated.track_playcount).distinct()
    music_listened.orderBy("track_playcount",ascending=False).show(Nsongs)

### Dado um item, probabilidade que um Usuario goste 

In [21]:
def rankUserforItem(model, itemID, friendID):
    x = df_ratings_less100.select("userId").distinct().withColumn("songId", lit(itemID))
    #print ("Os DADOS {}" .format(x.show(truncate=False)))
    
    predictions = model.transform(x).dropna().select("userId","songId", "prediction").orderBy("prediction", ascending=False)
    
    recommendations =  predictions.join(music_data, predictions.songId ==  music_data.songId).distinct().select(predictions.userId, predictions.songId, music_data.track_name, music_data.artist_name, predictions.prediction).orderBy("prediction", ascending=False)
    rankUser = recommendations.filter(recommendations.userId==friendID)
    
    rankUser.show()
    
    
    
    

In [22]:
u=191
print ("list of songs listened by user {}:" .format(u))

songs_listened(u,500)

list of songs listened by user 191:
+------+--------------------+--------------------+---------------+
|songId|          track_name|         artist_name|track_playcount|
+------+--------------------+--------------------+---------------+
|209206|       Don't Tell Me|       Avril Lavigne|            282|
|238962|Give You What You...|       Avril Lavigne|            280|
|351286|       We Can't Stop|         Miley Cyrus|            257|
|218194|       Wrecking Ball|         Miley Cyrus|            243|
|  9723|           I Will Be|       Avril Lavigne|            235|
|393892|         Hello Kitty|       Avril Lavigne|            234|
|339901|          I Love You|       Avril Lavigne|            233|
|277261|      Fall to Pieces|       Avril Lavigne|            222|
|163260|                 Hot|       Avril Lavigne|            213|
|271399|         Losing Grip|       Avril Lavigne|            210|
|184500|         Rock n Roll|       Avril Lavigne|            210|
|240562|            Tomorr

In [23]:
u = 191
print ("Recommendations for user {}:".format(u))
recommendMusic(model,u,30)

Recommendations for user 191:
+------+-----------------------------+-------------+----------+
|songId|track_name                   |artist_name  |prediction|
+------+-----------------------------+-------------+----------+
|115256|Wish You Were Here           |Avril Lavigne|1.0621458 |
|26119 |Smile                        |Avril Lavigne|1.0516471 |
|138724|Wildest Dreams               |Taylor Swift |1.0504868 |
|372503|Complicated                  |Avril Lavigne|1.0440081 |
|168119|Out of the Woods             |Taylor Swift |1.0168552 |
|198203|All Too Well                 |Taylor Swift |1.0070193 |
|76893 |Come & Get It                |Selena Gomez |1.0058436 |
|344682|New Romantics                |Taylor Swift |1.0038327 |
|5204  |All You Had to Do Was Stay   |Taylor Swift |1.0008509 |
|268798|22                           |Taylor Swift |0.9994861 |
|232801|Shake It Off                 |Taylor Swift |0.99816144|
|348898|The Heart Wants What It Wants|Selena Gomez |0.9902732 |
|3067  |Sk

In [24]:
i= 90242
f=191

print("Rank of User= {} for item= {}" .format(f,i))
rankUserforItem(model,i,f) 

Rank of User= 191 for item= 90242
+------+------+----------+-----------+----------+
|userId|songId|track_name|artist_name|prediction|
+------+------+----------+-----------+----------+
|   191| 90242|     Hello|Evanescence|0.17921805|
+------+------+----------+-----------+----------+



In [25]:
u = 14600
print ("list of songs listened by user {}:" .format(u))

songs_listened(u,1000)

list of songs listened by user 14600:
+------+--------------------+--------------------+---------------+
|songId|          track_name|         artist_name|track_playcount|
+------+--------------------+--------------------+---------------+
|154307|         Boys Better|   The Dandy Warhols|             94|
| 69647|        Kuolemajärvi|      Kotiteollisuus|             75|
|308158|         Mummokeikka|       Vaasankatu SS|             63|
|290417|      Happy Birthday|The Birthday Mass...|             44|
|176554|     Vieraan sanomaa|      Kotiteollisuus|             44|
|331711|           Video Kid|The Birthday Mass...|             43|
|197645| Hyvää tulevaisuutta|      Kotiteollisuus|             37|
|154675|                Over|The Birthday Mass...|             36|
| 60353|                Syli|      Kotiteollisuus|             34|
|264755|        Hesari Blues|       Vaasankatu SS|             33|
|151673|        Kummitusjuna|      Kotiteollisuus|             33|
| 61079|    Under the St

In [26]:
print ("Recommendations for user {}:".format(u))
recommendMusic(model,u,20)

Recommendations for user 14600:
+------+-------------------------------+--------------+-----------+
|songId|track_name                     |artist_name   |prediction |
+------+-------------------------------+--------------+-----------+
|95473 |The Bitter End                 |Placebo       |0.11987309 |
|95473 |Bitter End                     |Placebo       |0.11987309 |
|12456 |Song to Say Goodbye            |Placebo       |0.10068031 |
|196783|Lullaby                        |The Cure      |0.09785267 |
|130985|Wicked Game                    |Him           |0.097195685|
|343743|This Picture                   |Placebo       |0.09571357 |
|147624|Every You Every Me             |Placebo       |0.09450346 |
|7072  |Teardrop                       |Massive Attack|0.0928891  |
|264903|For What It's Worth            |Placebo       |0.09212646 |
|102867|Nemo                           |Nightwish     |0.09131986 |
|119678|Infra-Red                      |Placebo       |0.0899719  |
|364045|Special 

In [27]:
#save notebook session 
#import dill
#dill.dump_session("notebook_env.db")