<h2>Importación de librerías</h2>

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext

import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.sql import Row

# Ask PySpark to pull in the databricks spark-xml package (XML -> DataFrame).
# This has to be in the environment before the SparkContext is created.
import os
from os import environ
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell'
)

from pyspark.sql.functions import *
from pyspark.sql.types import *


<h2>Configuración de Spark</h2>

In [2]:
# Spark master URL. Defaults to the standalone cluster used by this notebook;
# set the SPARK_MASTER_URL environment variable (e.g. to "local") to run the
# notebook against a different master without editing the cell.
masterUrl = os.environ.get("SPARK_MASTER_URL", "spark://192.168.2.132:7077")

conf = SparkConf()\
        .setAppName("tfmX")\
        .setMaster(masterUrl)

# getOrCreate returns the already-running context when this cell is executed
# again; calling SparkContext(conf=conf) a second time raises an error
# because only one SparkContext may be active at a time.
sc = SparkContext.getOrCreate(conf=conf)

sqlContext = SQLContext(sc)

<h2>Carga del fichero JSON con los datos normalizados</h2>

In [3]:
# Location of the JSON file holding the scaled feature vectors.
fullJsonFile = '../TFM Files/CNT2.2 - dataScaled/dataScaled.json'

# Read the JSON file into a DataFrame and preview the first rows.
jsonReader = sqlContext.read
scaledData = jsonReader.json(fullJsonFile)

scaledData.show()


+---------+--------------------+--------------------+
|master_id|      scaledFeatures|               title|
+---------+--------------------+--------------------+
|    81137|[1, [0.1334085359...|     Greenbank Drive|
|    81640|[1, [0.1056811235...|Dancing On The Couch|
|    82039|[1, [0.1103023589...|We're Rocking The...|
|    82494|[1, [0.1334085359...|  It Takes Me Higher|
|    83907|[1, [0.0825749465...|        Bitter Sweet|
|    87426|[1, [0.1103023589...|        Sussex Drive|
|    90803|[1, [0.1241660651...|Hollow Man (Origi...|
|    94241|[1, [0.1056811235...|Chicas, Chicas, C...|
|    97214|[1, [0.1195448297...|          Extensions|
|    98178|[1, [0.1103023589...|          Alex Party|
|    98328|[1, [0.1195448297...|          Barry Blue|
|    98999|[1, [0.1380297713...|            Somebody|
|    99399|[1, [0.1056811235...|  The Invincible Sex|
|   100143|[1, [0.1334085359...|    Don't You Try It|
|   100288|[1, [0.0964386527...|        Turn-A-Round|
|   102367|[1, [0.1287873005

In [4]:
# Look up the master_id of the release titled "Per Un Amico":
# filter on the title first, then keep only the id column.
titleMatches = scaledData["title"] == "Per Un Amico"
perUnAmicoFrame = scaledData.filter(titleMatches).select("master_id")
perUnAmicoFrame.show()


+---------+
|master_id|
+---------+
|     1355|
+---------+



In [5]:
# Bring the matching master_id rows to the driver as a list of Row objects.
idColumn = perUnAmicoFrame.select("master_id")
perUnAmicoIndex = idColumn.collect()
# Display the id stored in the first matching row.
perUnAmicoIndex[0]["master_id"]

1355

<p>Cálculo de la similitud del coseno entre la publicación seleccionada y el resto de elementos.</p>

In [6]:
# Cosine similarity between the selected release and every release in the data set.

releaseID = perUnAmicoIndex[0][0]

selectedRelease = scaledData.where(col("master_id") == releaseID).first()
if selectedRelease is None:
    # first() returns None when no row matches; fail here with a clear message
    # instead of a TypeError raised later inside the Spark workers.
    raise ValueError("No release found with master_id {}".format(releaseID))

# scaledFeatures is a struct whose element at index 1 is the list of feature
# values. Index the struct directly: wrapping the whole struct in np.asarray
# builds a ragged array, which recent NumPy versions reject with a ValueError.
# The reference vector is extracted once here rather than once per row.
selectedFeatures = [list(selectedRelease["scaledFeatures"][1])]

# One Row per release: its id, its cosine similarity to the selected release
# (converted to a plain Python float with .item()) and its title.
rdd_similarity = scaledData.rdd.map(
    lambda x: Row(id=x.master_id,
                  sim=cosine_similarity(selectedFeatures,
                                        [x["scaledFeatures"][1]])[0][0].item(),
                  title=x.title))


<p>Generamos el DataFrame con la similitud de cada elemento respecto a la publicación seleccionada.</p>

In [7]:
# Build the DataFrame holding the similarity results.

# Explicit schema: nullable integer id, non-nullable float similarity,
# non-nullable string title.
schema = StructType()\
    .add("id", IntegerType(), True)\
    .add("sim", FloatType(), False)\
    .add("title", StringType(), False)

df_similarity = sqlContext.createDataFrame(rdd_similarity, schema=schema)

df_similarity.show()


+------+------------+--------------------+
|    id|         sim|               title|
+------+------------+--------------------+
| 81137|  0.93130213|     Greenbank Drive|
| 81640|  0.08699941|Dancing On The Couch|
| 82039| -0.03178424|We're Rocking The...|
| 82494|  0.38232934|  It Takes Me Higher|
| 83907|   0.8960064|        Bitter Sweet|
| 87426|   0.5762373|        Sussex Drive|
| 90803|   0.7572512|Hollow Man (Origi...|
| 94241|  0.20213285|Chicas, Chicas, C...|
| 97214|   0.4721324|          Extensions|
| 98178|  0.72252065|          Alex Party|
| 98328|  0.73610693|          Barry Blue|
| 98999|-0.082419455|            Somebody|
| 99399| -0.18022282|  The Invincible Sex|
|100143|  0.66346323|    Don't You Try It|
|100288|  0.61651397|        Turn-A-Round|
|102367|  0.91712916|          Sick Of It|
|102515|   0.9024957|Sir Duke / He's M...|
|102858| -0.16368663|             Cha Cha|
|103368| -0.44199383|          Rusty Acid|
|105384|   0.0793337|       Morning Light|
+------+---

In [8]:
import numpy as np

# Round the similarity to 5 decimals, drop the selected release itself and
# sort from most to least similar.
# `round` here is the Spark column function brought in by the star import of
# pyspark.sql.functions, not Python's builtin.
roundedSim = round(df_similarity["sim"], 5)
isOtherRelease = col("id") != releaseID

df_similarity = (
    df_similarity
    .withColumn("sim", roundedSim)
    .where(isOtherRelease)
    .orderBy("sim", ascending=False)
)



In [9]:
# Show the ten most similar releases without truncating long titles.
print("Las 10 publicaciones recomendadas son: ")
df_similarity.show(n=10, truncate=False)

Las 10 publicaciones recomendadas son: 
+------+-------+--------------------------------------------------+
|id    |sim    |title                                             |
+------+-------+--------------------------------------------------+
|313950|0.99697|Snuffy / Wells Fargo / Rhodomagnetics / Count Down|
|23151 |0.99418|Marrakech                                         |
|53118 |0.99382|Ars Longa Vita Brevis                             |
|22420 |0.99291|Station To Station                                |
|61688 |0.99159|Ornette On Tenor                                  |
|248430|0.99045|Soprano Sax                                       |
|155249|0.9904 |Bottom Heavy 2008                                 |
|47579 |0.99009|Rhapsody And Blues                                |
|14612 |0.98976|Saturday Night (Remix '94)                        |
|47901 |0.98944|Sunlight                                          |
+------+-------+--------------------------------------------------+
only sho