<h2>Importación de librerías</h2>

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext

# Carga del paquete databricks:spark-xml para pasar directamente de XML a dataframe
import os
from os import environ
environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell'

from pyspark.sql.functions import *
from pyspark.sql.types import *


<h2>Configuración de Spark</h2>

In [2]:
#conf = SparkConf()\
#        .setAppName("tfm01")\
#        .setMaster("local")

conf = SparkConf()\
        .setAppName("tfm03")\
        .setMaster("spark://192.168.2.132:7077")
    
sc = SparkContext(conf=conf)

sqlContext = SQLContext(sc)

<h2>Carga del fichero JSON</h2> 

In [3]:
#Definición de los archivos XML 
mastersReleasesJoinedFile = '../TFM Files/CNT1.3 - MastersReleasesJoined/MastersReleasesJoined.json'



dfMRJ = sqlContext.read.format("json")\
                        .load(mastersReleasesJoinedFile)
    
dfMRJ.count()

69955

In [4]:
dfMRJ = dfMRJ.select("master_id", 
                       "title", 
                       "year", 
                       "formats", 
                       "duration", 
                       "number_tracks", 
                       "artists",  
                       "country",   
                       "companies", 
                       "genres", 
                       "styles")\
                .orderBy("master_id")

<h4>Creación de columnas por formato. </h4>
<p>Como hay publicaciones con varios formatos, se ha decicido crear una columna por cada tipo diferente de formato e indicar con un 1 si se ha publicado en ese formato y con un 0 si no se ha hecho así. Se ha tomado esta decisión basándose en el algoritmo OneHotEncoder. En este caso, al tratarse de más de un valor, se utiliza la idea del MultilabelBinarizer implementado entre otros en Sklearn.</p>

In [5]:
# Expandimos la lista 

dfFormatSep = dfMRJ.withColumn("formatsSep", explode(col("formats")))
dfFormatSep = dfFormatSep.drop("formats")

dfFormatSep.select("formatsSep").distinct().show(30) 
dfFormatSep.select("formatsSep").distinct().count() 

+-----------------+
|       formatsSep|
+-----------------+
|       Betacam SP|
|            Vinyl|
|8-Track Cartridge|
|         Cassette|
|          Box Set|
|     Reel-To-Reel|
|              CDV|
|              DVD|
|          Acetate|
|        Lathe Cut|
|           MiniDV|
|             DVDr|
|          Blu-ray|
|          Shellac|
|              CDr|
|        All Media|
|             SACD|
|           Hybrid|
|             File|
|        Laserdisc|
|              VHS|
|    SelectaVision|
|          Betamax|
|     Memory Stick|
|       Flexi-disc|
|              DCC|
|         Minidisc|
|               CD|
+-----------------+



28

<p>Al haber 28 formatos diferentes, se van a agrupar en categorías más genéricas para evitar una gran cantidad de columnas.</p>

In [6]:
@udf("boolean")
def checkCondition(colValue, colType):
    if colType == 1:
        selectedCondList = audioCond
    else:
        selectedCondList = videoCond
    
    for cond in selectedCondList:
        if colValue == cond:
            return True
    return False

In [7]:
# audioCond son los valores que se tomarán como Audio
# videoCond los valores que se tomarán como video

audioCond = ["CD","CDr","Minidisc","SACD","DCC","Vinyl","Cassette","CDV","Laserdisc", "Flexi-disc"] 
videoCond = ["Box Set", "All Media", "File", "Hybrid"]

checkCondition_udf = udf(lambda c, n: checkCondition(c, n), BooleanType())

dfFormatSep = dfFormatSep.withColumn("formatsSep", 
                            when(checkCondition(dfFormatSep["formatsSep"], lit(1)), "Audio")\
                            .otherwise(when(checkCondition(dfFormatSep["formatsSep"], lit(2)), "Video")\
                            .otherwise("Other")))

dfFormatSep.select("formatsSep").distinct().show() 

+----------+
|formatsSep|
+----------+
|     Video|
|     Other|
|     Audio|
+----------+



In [8]:
dfFormatSep.show()

+---------+--------------------+----+--------+-------------+--------------------+--------------------+--------------------+------------+--------------------+----------+
|master_id|               title|year|duration|number_tracks|             artists|             country|           companies|      genres|              styles|formatsSep|
+---------+--------------------+----+--------+-------------+--------------------+--------------------+--------------------+------------+--------------------+----------+
|      128|         Cosmic Cars|2000|       9|            2|          [Cybotron]|               Spain|      Hispavox, S.A.|[Electronic]| [House, Deep House]|     Audio|
|      132|              Stress|2000|      24|            2|      [Emmanuel Top]|              France|Music Line Intern...|[Electronic]|        [Tech House]|     Audio|
|      156|It's A Bit Compli...|2000|      34|           11|          [Art Brut]|              Russia| S.B.A./GALA Records|[Electronic]|[Future Jazz, Dee..

<h5>Aplicación de MultilabelBinarizer para que se creen las columnas de los diferentes formatos.</h5>

In [9]:
# Creamos la tabla pivote
dfFormatPiv = dfFormatSep.groupby(dfFormatSep["master_id"]).pivot("formatsSep").count()
dfFormatPiv = dfFormatPiv.na.fill(0)
dfFormatPiv.show()

#Comprobamos que no hay ids repetidos

from pyspark.sql import Window

w = Window.partitionBy('master_id')
dfFormatExplodedRepeated = dfFormatPiv.select('*', count('master_id').over(w).alias('dupeCount'))\
    .where('dupeCount > 1')\
    .drop('dupeCount')

print(dfFormatExplodedRepeated.count())

+---------+-----+-----+-----+
|master_id|Audio|Other|Video|
+---------+-----+-----+-----+
|   419628|    1|    0|    0|
|   101519|    1|    0|    0|
|   776612|    1|    0|    0|
|    19141|    1|    0|    0|
|   364400|    1|    0|    0|
|   521238|    1|    0|    0|
|    23506|    1|    0|    0|
|    38996|    1|    0|    0|
|   889632|    1|    0|    0|
|   417135|    1|    0|    0|
|  1104088|    1|    0|    0|
|   721566|    1|    0|    0|
|  1350398|    1|    0|    0|
|    15846|    1|    0|    0|
|    34340|    1|    0|    0|
|    99489|    1|    0|    0|
|   640076|    1|    1|    0|
|    62526|    1|    0|    0|
|    71575|    1|    0|    0|
|   103478|    1|    0|    0|
+---------+-----+-----+-----+
only showing top 20 rows

0


In [10]:
dfJoined = dfFormatSep.alias("dfFormatSep")\
                .join(other = dfFormatPiv.alias("dfFormPiv"), on="master_id", how="inner")\
                .drop("formats")            

dfJoined.show()

+---------+--------------------+----+--------+-------------+--------------------+--------------------+--------------------+------------+--------------------+----------+-----+-----+-----+
|master_id|               title|year|duration|number_tracks|             artists|             country|           companies|      genres|              styles|formatsSep|Audio|Other|Video|
+---------+--------------------+----+--------+-------------+--------------------+--------------------+--------------------+------------+--------------------+----------+-----+-----+-----+
|      128|         Cosmic Cars|2000|       9|            2|          [Cybotron]|               Spain|      Hispavox, S.A.|[Electronic]| [House, Deep House]|     Audio|    1|    0|    0|
|      132|              Stress|2000|      24|            2|      [Emmanuel Top]|              France|Music Line Intern...|[Electronic]|        [Tech House]|     Audio|    1|    0|    0|
|      156|It's A Bit Compli...|2000|      34|           11|     

In [11]:
dfJoined = dfJoined.drop("formatsSep")
dfJoined = dfJoined.dropDuplicates()

In [12]:
dfJoined = dfJoined.cache()

<h4>Creación de columnas por Género. </h4>
<p>De la misma manera que se ha hecho con el formato, se procederá a separar las publicaciones por género.</p>

In [13]:
# Expandimos la lista 

dfGenresSep = dfJoined.withColumn("genresSep", explode(col("genres")))
dfGenresSep.select("genresSep").distinct().show() 

+--------------------+
|           genresSep|
+--------------------+
|      Stage & Screen|
|               Latin|
|    Brass & Military|
|           Non-Music|
|         Funk / Soul|
|                Rock|
|                Jazz|
|             Hip Hop|
|          Children's|
|           Classical|
|               Blues|
|Folk, World, & Co...|
|                 Pop|
|              Reggae|
|          Electronic|
+--------------------+



<h5>Aplicación de MultilabelBinarizer para que se creen las columnas de los diferentes formatos.</h5>

In [14]:
# Creamos la tabla pivote
dfGenresPiv = dfGenresSep.groupby(dfGenresSep["master_id"]).pivot("genresSep").count()
dfGenresPiv = dfGenresPiv.na.fill(0)
dfGenresPiv.show()

#Comprobamos que no hay ids repetidos

from pyspark.sql import Window

w = Window.partitionBy('master_id')
dfGenresExplodedRepeated = dfGenresPiv.select('*', count('master_id').over(w).alias('dupeCount'))\
    .where('dupeCount > 1')\
    .drop('dupeCount')

print(dfGenresExplodedRepeated.count())

+---------+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+
|master_id|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|
+---------+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+
|      128|    0|               0|         0|        0|         1|                     0|          0|      0|   0|    0|        0|  0|     0|   0|             0|
|      132|    0|               0|         0|        0|         1|                     0|          0|      0|   0|    0|        0|  0|     0|   0|             0|
|      156|    0|               0|         0|        0|         1|                     0|          0|      0|   0|    0|        0|  0|     0|   0|             0|
|      164|    0|           

In [15]:
dfJoined = dfJoined.alias("dfJoined")\
                .join(other = dfGenresPiv.alias("dfGenresPiv"), on="master_id", how="inner")\
                .drop("genres")

dfJoined.show()

+---------+--------------------+----+--------+-------------+--------------------+-----------+--------------------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+
|master_id|               title|year|duration|number_tracks|             artists|    country|           companies|              styles|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|
+---------+--------------------+----+--------+-------------+--------------------+-----------+--------------------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+
|     1677|                  M6|1997|      18|            2|          [Maurizio]|    Ger

In [16]:
dfJoined.orderBy("master_id").show()

+---------+--------------------+----+--------+-------------+--------------------+--------------------+--------------------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+
|master_id|               title|year|duration|number_tracks|             artists|             country|           companies|              styles|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|
+---------+--------------------+----+--------+-------------+--------------------+--------------------+--------------------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+
|      128|         Cosmic Cars|2000|       9|            2| 

In [17]:
dfJoined.count()

69955

<h3>Creación de tabla intermedia</h3>

In [18]:
dfJoined = dfJoined.cache()

<h4>Columna Estilo.</h4>

In [19]:
# Expandimos la lista 

dfStyleSep = dfMRJ.withColumn("stylesSep", explode(col("styles")))
dfStyleSep.select("stylesSep").distinct().show() 
print(dfStyleSep.select("stylesSep").distinct().count())

+-----------------+
|        stylesSep|
+-----------------+
|        Synth-pop|
|        Jumpstyle|
|       Folk Metal|
|         Ranchera|
|   Piedmont Blues|
|          Beatbox|
|   Electric Blues|
|Minneapolis Sound|
|       Acid House|
|         Acoustic|
|    Jazzy Hip-Hop|
|        Hip-House|
|   Rhythmic Noise|
|        Education|
|        Post-Punk|
|              MPB|
|             Surf|
|           Thrash|
| Avant-garde Jazz|
|        Chillwave|
+-----------------+
only showing top 20 rows

416


<p>Al haber 416 estilos diferentes, generar una columna para cada estilo no es la estrategia más apropiada. En este caso, se utilizará el algoritmo String Indexer, este algoritmo asigna un índice a cada valor.</p>

In [20]:
from pyspark.ml.feature import StringIndexer

dfStyleSep = dfJoined.withColumn("stylesConcat", concat_ws(", ", "styles"))

indexer = StringIndexer(inputCol="stylesConcat", outputCol="stylesIndex")
indexed = indexer.fit(dfStyleSep).transform(dfStyleSep)
indexed.show()

+---------+--------------------+----+--------+-------------+--------------------+-----------+--------------------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+--------------------+-----------+
|master_id|               title|year|duration|number_tracks|             artists|    country|           companies|              styles|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|        stylesConcat|stylesIndex|
+---------+--------------------+----+--------+-------------+--------------------+-----------+--------------------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+--------------------+--

In [21]:
# <p>Al haber 416 estilos diferentes, generar una columna para cada estilo no es la estrategia más apropiada. En este caso, se utilizará el algoritmo HashingTF, IDF. Este algoritmo genera un corpus de palabras y asigna un valor númerico en base a la frecuencia de las palabras dentro del corpus.</p>

# from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# dfStyleSep = dfJoined.withColumn("stylesConcat", concat_ws(", ", "styles"))

# tokenizer = Tokenizer(inputCol="stylesConcat", outputCol="stylesWords")
# wordsData = tokenizer.transform(dfStyleSep)

# hashingTF = HashingTF(inputCol="stylesWords", outputCol="stylesWordsTF", numFeatures=20)
# featurizedData = hashingTF.transform(wordsData)

# idf = IDF(inputCol="stylesWordsTF", outputCol="stylesWordFeatures")
# idfModel = idf.fit(featurizedData)
# rescaledData = idfModel.transform(featurizedData)

# rescaledData.show()

# # dfStyleSep = rescaledData.select("master_id", "stylesWordFeatures")
# # dfStyleSep.show()

# dfJoined = rescaledData.drop("styles", "stylesConcat", "stylesWords", "stylesWordsTF")
# dfJoined.show()

In [22]:
dfJoined = indexed.drop("styles", "stylesConcat")

In [23]:
dfJoined.count()

69955

In [24]:
dfJoined = dfJoined.cache()

<h4>Columna Artistas.</h4>

In [25]:
# Expandimos la lista 

dfArtistsSep = dfJoined.withColumn("artistsSep", explode(col("artists")))
dfArtistsSep.select("artistsSep").distinct().show() 
print(dfArtistsSep.select("artistsSep").distinct().count())

+--------------------+
|          artistsSep|
+--------------------+
|          Snoop Dogg|
|     Johannes Brahms|
|      The Black Keys|
|       Cockney Rebel|
|        Emilio Pujol|
|      The Chameleons|
|           Los Lobos|
|         Justin Vali|
|         Exaltasamba|
|             Mr. Joy|
|Don Johnson Big Band|
|     Ladislav Slovák|
|  Larsen-Feiten Band|
|      Angelo Ferreri|
|           Maxim Rad|
|        Grady Gaines|
|             Goa Gil|
|          Eurythmics|
|    Brooklyn Express|
|Von Hertzen Brothers|
+--------------------+
only showing top 20 rows

38620


<p>En este caso, tenemos 38620 Artistas diferentes, por lo que habrá que tomar otra estrategia distinta a la tomada anteriormente, ya que cada artista debe representarse por un valor.</p>
<p>Utilizaremos StringIndexer, este algoritmo asigna un valor númerico a cada artista.</p>

In [26]:
from pyspark.ml.feature import StringIndexer

dfArtistsSep = dfJoined.withColumn("artistsConcat", concat_ws(", ", "artists"))

indexer = StringIndexer(inputCol="artistsConcat", outputCol="artistsIndex")
indexed = indexer.fit(dfArtistsSep).transform(dfArtistsSep)
indexed.show()

+---------+--------------------+----+--------+-------------+--------------------+-----------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+--------------------+------------+
|master_id|               title|year|duration|number_tracks|             artists|    country|           companies|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|stylesIndex|       artistsConcat|artistsIndex|
+---------+--------------------+----+--------+-------------+--------------------+-----------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+--------------------+------------+
|     1677|  

In [27]:
dfJoined = indexed.drop("artists", "artistsConcat")

In [28]:
tablaIntermedia3 = dfJoined.select("*")

In [29]:
dfJoined.count()

69955

In [30]:
dfJoined = dfJoined.cache()

<h4>Columna Companies.</h4>

In [31]:
dfJoined.select("companies").distinct().show() 
print(dfJoined.select("companies").distinct().count())


+--------------------+
|           companies|
+--------------------+
|WARNER MUSIC BRAS...|
|    Rough Trade Inc.|
|  Membran Music Ltd.|
|  Tesco Distribution|
| BMG The Netherlands|
|            PRT Ltd.|
|       Sutra Records|
|Sundesire Media Worx|
|RedX Entertainmen...|
|                 Vme|
|      Apostrophe (2)|
|Trans Canada Records|
|Virgin Vision Dis...|
|      Benson Records|
|                   K|
|                B.W.|
|Starland Music, Inc.|
|Milletlerarası Mü...|
|Universal Music D...|
|       DiscMedi Blau|
+--------------------+
only showing top 20 rows

7831


<p>De forma semejante a la columna de los artistas, se utilizará el StringIndexer para asignar un valor a cada valor.</p>

In [32]:

indexer = StringIndexer(inputCol="companies", outputCol="companiesIndex")
indexed = indexer.fit(dfJoined).transform(dfJoined)
indexed.show()


+---------+--------------------+----+--------+-------------+-----------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+
|master_id|               title|year|duration|number_tracks|    country|           companies|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|stylesIndex|artistsIndex|companiesIndex|
+---------+--------------------+----+--------+-------------+-----------+--------------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+
|     1677|                  M6|1997|      18|            2|    Germany|                 SRD| 

In [33]:
dfJoined = indexed.drop("companies")

In [34]:
tablaIntermedia4 = dfJoined.select("*")

In [35]:
dfJoined = dfJoined.cache()

In [36]:
dfJoined.count()

69955

<h4>Columna Country.</h4>

In [37]:
# Expandimos la lista 

dfMRJ.select("country").distinct().show() 
print(dfMRJ.select("country").distinct().count())

+--------------------+
|             country|
+--------------------+
|              Russia|
|         UK & Europe|
|              Sweden|
|         Philippines|
|            Malaysia|
|           Singapore|
|              Turkey|
|             Germany|
|              Europe|
| UK, Europe & Israel|
|         Ivory Coast|
|              Jordan|
|              France|
|              Greece|
|              Taiwan|
|Germany, Austria,...|
|            Slovakia|
|             Reunion|
|           Argentina|
|Congo, Democratic...|
+--------------------+
only showing top 20 rows

128


<p>Vamos a los valores de la tabla columna, los países .</p>

In [38]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="country", outputCol="countryIndex")
indexed = indexer.fit(dfJoined).transform(dfJoined)
indexed.show()

+---------+--------------------+----+--------+-------------+-----------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+
|master_id|               title|year|duration|number_tracks|    country|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|stylesIndex|artistsIndex|companiesIndex|countryIndex|
+---------+--------------------+----+--------+-------------+-----------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+
|     1677|                  M6|1997|      18|            2|    Germany|    1|    0|    0|    0|               0|     

In [39]:
# tokenizer = Tokenizer(inputCol="country", outputCol="countryWords")
# wordsData = tokenizer.transform(dfJoined)

# hashingTF = HashingTF(inputCol="countryWords", outputCol="countryWordsTF")
# featurizedData = hashingTF.transform(wordsData)

# idf = IDF(inputCol="countryWordsTF", outputCol="countryWordsFeatures")
# idfModel = idf.fit(featurizedData)
# rescaledData = idfModel.transform(featurizedData)

# rescaledData.show()

# # dfCountrySep = rescaledData.select("master_id", "stylesWordFeatures")
# # dfCountrySep.show()

# dfJoined = rescaledData.drop("country", "countryWords", "countryWordsTF")

In [40]:
dfJoined = indexed.drop("country")

In [41]:
dfJoined.show()

+---------+--------------------+----+--------+-------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+
|master_id|               title|year|duration|number_tracks|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|stylesIndex|artistsIndex|companiesIndex|countryIndex|
+---------+--------------------+----+--------+-------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+
|     1677|                  M6|1997|      18|            2|    1|    0|    0|    0|               0|         0|        0|         1|                     

In [42]:
dfJoined = dfJoined.cache()

<h4>Limpieza de datos de las demás columnas.</h4>

In [43]:
# Buscamos los lanzamientos cuya duración sea de 0 (falta de datos) y los rellenamos con la media de las duraciones

mean = int(dfJoined.groupBy().avg("duration").take(1)[0][0])
print("Media de la duración", mean)

print("Numero de filas con duración 0: ", dfJoined.select("master_id","duration").where(dfJoined["duration"] == 0).count())

dfJoined = dfJoined.withColumn("duration", when(dfJoined["duration"] == 0, mean).otherwise(dfJoined["duration"]))

print("Numero de filas con duración 0: ", dfJoined.select("master_id","duration").where(dfJoined["duration"] == 0).count())

Media de la duración 42
Numero de filas con duración 0:  14
Numero de filas con duración 0:  0


In [44]:
# También rellenamos con la media los posibles valores null.

dfJoined.select("duration").distinct().orderBy("duration").show()

dfJoined = dfJoined.na.fill(mean)

dfJoined.select("duration").distinct().orderBy("duration").show()

+--------+
|duration|
+--------+
|    null|
|       1|
|       2|
|       3|
|       4|
|       5|
|       6|
|       7|
|       8|
|       9|
|      10|
|      11|
|      12|
|      13|
|      14|
|      15|
|      16|
|      17|
|      18|
|      19|
+--------+
only showing top 20 rows

+--------+
|duration|
+--------+
|       1|
|       2|
|       3|
|       4|
|       5|
|       6|
|       7|
|       8|
|       9|
|      10|
|      11|
|      12|
|      13|
|      14|
|      15|
|      16|
|      17|
|      18|
|      19|
|      20|
+--------+
only showing top 20 rows



In [45]:
dfJoined = dfJoined.cache()

In [46]:
dfJoined.show()

+---------+--------------------+----+--------+-------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+
|master_id|               title|year|duration|number_tracks|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|stylesIndex|artistsIndex|companiesIndex|countryIndex|
+---------+--------------------+----+--------+-------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+
|     1677|                  M6|1997|      18|            2|    1|    0|    0|    0|               0|         0|        0|         1|                     

<h3>Creación del fichero con los datos de la manera que nos interesa para calcular la similitud.</h3>

In [53]:
csvFileName = "../TFM Files/CNT2.1 - dataNotScaled"

dfJoined\
    .repartition(1)\
    .coalesce(1)\
    .write\
    .format("csv")\
    .options(sep=",", inferSchema="true", header="true")\
    .save(csvFileName)

<h1>Generación del dataset con los valores normalizados.</h1>

In [56]:
#Definición del archivo json 

mastersReleasesJoinedFile = '../TFM Files/CNT2.1 - dataNotScaled/dataNotScaled.csv'

dfMRJ = sqlContext.read.load(mastersReleasesJoinedFile,
                     format="csv", sep=",", inferSchema="true", header="true")

dfMRJ.show()
    
# dfMRJ.count()

+---------+--------------------+----+--------+-------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+
|master_id|               title|year|duration|number_tracks|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|stylesIndex|artistsIndex|companiesIndex|countryIndex|
+---------+--------------------+----+--------+-------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+
|     1840|    Cursed / Optimum|1998|      17|            2|    1|    0|    0|    0|               0|         0|        0|         1|                     

In [57]:
dfDropped = dfMRJ.drop("master_id", "title")

featuresCols = dfDropped.columns
print(featuresCols)

['year', 'duration', 'number_tracks', 'Audio', 'Other', 'Video', 'Blues', 'Brass & Military', "Children's", 'Classical', 'Electronic', 'Folk, World, & Country', 'Funk / Soul', 'Hip Hop', 'Jazz', 'Latin', 'Non-Music', 'Pop', 'Reggae', 'Rock', 'Stage & Screen', 'stylesIndex', 'artistsIndex', 'companiesIndex', 'countryIndex']


In [58]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline

# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)

pipeline = Pipeline(stages=[vectorAssembler, scaler])

# Compute summary statistics by fitting the StandardScaler
pipelineModel = pipeline.fit(dfMRJ)

# Normalize each feature to have unit standard deviation.
scaledData = pipelineModel.transform(dfMRJ)
scaledData.show()

+---------+--------------------+----+--------+-------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+--------------------+--------------------+
|master_id|               title|year|duration|number_tracks|Audio|Other|Video|Blues|Brass & Military|Children's|Classical|Electronic|Folk, World, & Country|Funk / Soul|Hip Hop|Jazz|Latin|Non-Music|Pop|Reggae|Rock|Stage & Screen|stylesIndex|artistsIndex|companiesIndex|countryIndex|            features|      scaledFeatures|
+---------+--------------------+----+--------+-------------+-----+-----+-----+-----+----------------+----------+---------+----------+----------------------+-----------+-------+----+-----+---------+---+------+----+--------------+-----------+------------+--------------+------------+--------------------+--------------------+
|     1840|    Cursed / Opti

In [59]:
dfCleanedScaled = scaledData.select("master_id", "title", "scaledFeatures")

dfCleanedScaled.show()

+---------+--------------------+--------------------+
|master_id|               title|      scaledFeatures|
+---------+--------------------+--------------------+
|     1840|    Cursed / Optimum|[0.11954482973390...|
|     2812|            Showcase|[0.12416606513307...|
|     3391|          Fairytales|[0.09643865273808...|
|     5418|                Sign|[0.12878730053223...|
|     6424|Papua New Guinea ...|[0.10105988813724...|
|     6834|          Tiger Trap|[0.12878730053223...|
|     7544|The Sound Of Goodbye|[0.13340853593140...|
|     7546|Different Stages ...|[0.13340853593140...|
|     8305|At Home With The ...|[0.09643865273808...|
|    10889|        Virus Meadow|[0.12878730053223...|
|    11593|                奇跡|[0.12878730053223...|
|    13206|   Piano In The Dark|[0.11492359433474...|
|    13658|        Fire And Ice|[0.12416606513307...|
|    14259|     Szenario Europa|[0.11030235893557...|
|    14805|      Cocktail Mixxx|[0.12878730053223...|
|    16781|Take (My Breath A..

In [61]:
jsonFileName = "../TFM Files/CNT2.2 - dataScaled"

dfCleanedScaled\
    .repartition(1)\
    .coalesce(1)\
    .write\
    .format("json")\
    .save(jsonFileName)