In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import os

# Create the Spark session used by every cell below, or reuse one that is
# already active in this kernel.
spark = (
    SparkSession.builder
    .appName("RecomendadorLibros")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/07 21:12:16 WARN Utils: Your hostname, Gyro, resolves to a loopback address: 127.0.1.1; using 172.19.135.164 instead (on interface eth0)
25/12/07 21:12:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 21:12:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/07 21:12:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Carga la matriz para conseguir los nombres de los libros

In [2]:
# Read the book-similarity matrix from Parquet; the path is relative to the
# directory the kernel was started in.
df_sim = spark.read.parquet("notebooks/matriz_similitud_libros.parquet")

# Preview the first five rows without truncating the long file URIs.
df_sim.show(n=5, truncate=False)

                                                                                

+--------------------------------------------------+--------------------------------------------------------------------------------------------+-------------------+
|libro1                                            |libro2                                                                                      |similitud          |
+--------------------------------------------------+--------------------------------------------------------------------------------------------+-------------------+
|file:///home/robc/SistDist/books/Bleak%20House.txt|file:///home/robc/SistDist/books/The%20Republic.txt                                         |0.06913458548336301|
|file:///home/robc/SistDist/books/Bleak%20House.txt|file:///home/robc/SistDist/books/Dracula.txt                                                |0.04213809987545557|
|file:///home/robc/SistDist/books/Bleak%20House.txt|file:///home/robc/SistDist/books/Little%20Women.txt                                         |0.07950533276063317|
|fil

### Limpia los nombres de los libros

In [17]:
def extraer_nombre(ruta):
    """Return a human-readable file name extracted from a book path or URI.

    The last path component is taken first and then percent-decoded, so an
    encoded slash (``%2F``) inside a file name cannot change which component
    is selected.

    Args:
        ruta: Path or ``file://`` URI of a book, e.g.
            ``"file:///home/user/books/Bleak%20House.txt"``. May be ``None``,
            which is what a Spark UDF receives for a null cell.

    Returns:
        The decoded base name (e.g. ``"Bleak House.txt"``), or ``None`` when
        ``ruta`` is ``None``.
    """
    # Imported locally so the function stays self-contained when it is
    # serialized and shipped to workers as a UDF.
    from urllib.parse import unquote

    # Propagate nulls instead of letting os.path.basename raise TypeError.
    if ruta is None:
        return None

    nombre = os.path.basename(ruta)
    # Decode every percent-escape (%20, %27, UTF-8 sequences, ...), not only
    # spaces, so the displayed name matches the real file name.
    return unquote(nombre)

In [18]:
# Wrap the plain-Python cleaner as a Spark UDF that returns a string column.
extraer_nombre_udf = udf(f=extraer_nombre, returnType=StringType())

In [19]:
# Add a cleaned display-name column for each side of every book pair.
df_sim = (
    df_sim
    .withColumn("libro1_nombre", extraer_nombre_udf(col("libro1")))
    .withColumn("libro2_nombre", extraer_nombre_udf(col("libro2")))
)

In [21]:
def recomendar_libros(libro_base, top_n=10):
    """Return the names of the books most similar to ``libro_base``.

    Reads the module-level ``df_sim`` DataFrame, keeps the rows whose
    ``libro1_nombre`` equals ``libro_base``, sorts them by ``similitud`` in
    descending order and collects at most ``top_n`` rows to the driver.

    NOTE(review): only rows where the book appears as ``libro1`` are
    considered; presumably the matrix stores every pair in both
    directions -- verify against the job that writes the Parquet file.

    Args:
        libro_base: Cleaned book name to match against ``libro1_nombre``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list with the ``libro2_nombre`` value of each selected row, most
        similar first. Empty when no row matches ``libro_base``.
    """
    mejores = (
        df_sim
        .filter(col("libro1_nombre") == libro_base)
        .orderBy(col("similitud").desc())
        .limit(top_n)
    )

    libros_recomendados = []
    for fila in mejores.collect():
        libros_recomendados.append(fila["libro2_nombre"])
    return libros_recomendados

### Selecciona los primeros 20 nombres limpios

In [22]:
# Show up to 20 distinct cleaned book names, untruncated, so a valid
# reference title can be picked for the recommender.
(
    df_sim
    .select("libro1_nombre")
    .distinct()
    .show(n=20, truncate=False)
)

+------------------------------------------------------------------+
|libro1_nombre                                                     |
+------------------------------------------------------------------+
|A Study in Scarlet.txt                                            |
|Narrative of the Life of Frederick Douglass, an American Slave.txt|
|Frankenstein; Or, The Modern Prometheus.txt                       |
|The Odyssey.txt                                                   |
|The Tragical History of Doctor Faustus.txt                        |
|Bleak House.txt                                                   |
|The Works of Edgar Allan Poe — Volume 2.txt                       |
|White Nights and Other Stories.txt                                |
|Thus Spake Zarathustra A Book for All and None.txt                |
|Pride and Prejudice.txt                                           |
|The Picture of Dorian Gray.txt                                    |
|Moby Multiple Language Lists of C

### Recomienda los libros parecidos

In [23]:
# Reference book; recomendar_libros compares this value against the
# libro1_nombre column.
libro_ejemplo = "The Republic.txt"
recomendados = recomendar_libros(libro_base=libro_ejemplo, top_n=10)
print(f"Libros recomendados para {libro_ejemplo}: {recomendados}")

Libros recomendados para The Republic.txt: ['Beyond Good and Evil.txt', 'On Liberty.txt', 'Society in America, Volume 1 (of 2).txt', 'How to Observe Morals and Manners.txt', 'Second Treatise of Government.txt', 'The Works of Edgar Allan Poe — Volume 2.txt', 'Walden, and On The Duty Of Civil Disobedience.txt', 'The Souls of Black Folk.txt', "Gulliver's Travels into Several Remote Nations of the World.txt", 'The Adventures of Roderick Random.txt']
