# Preparación de librerías Spark

In [None]:
# Load external packages programmatically.
# Maven coordinates have the form group:artifact:version, where the artifact
# name carries the Scala version suffix:
#                 com.databricks:spark-xml_2.11:0.4.1
import os

packages = "com.databricks:spark-xml_2.11:0.4.1"
# Assemble the submit arguments that request the package above.
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(["--packages", packages, "pyspark-shell"])

In [None]:
# Start (or reuse) a local PySpark session and expose its SparkContext as `sc`.
import pyspark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("xml2json_dblp")
    .master("local[*]")
    .config("spark.driver.cores", 1)
    .getOrCreate()
)
sc = spark.sparkContext
# Last expression of the cell: displays the context in the notebook.
sc

In [None]:
from pyspark.sql.functions import lit,col,udf,explode
from pyspark.sql.types import StringType,StructType,StructField,ArrayType, IntegerType

# MongoDB

## Leemos el xml para cada tipo de publicación

In [None]:
# Explicit schema applied to every publication type read from dblp.xml:
# a string key, an array of author structs, a title struct and an integer year.
schemaDBLP = StructType([
    StructField("_key", StringType(), True),
    StructField(
        "author",
        ArrayType(StructType([StructField("_VALUE", StringType())]))
    ),
    StructField(
        "title",
        StructType([StructField("_VALUE", StringType())])
    ),
    StructField("year", IntegerType(), True)
])

In [None]:
# Read dblp.xml with the 'incollection' row tag, applying the explicit schema.
df_incollection = (
    spark.read.format('xml')
    .schema(schemaDBLP)
    .option('rowTag', 'incollection')
    .option('rootTag', 'incollection')
    .load('dblp.xml')
)

In [None]:
# Read dblp.xml with the 'inproceedings' row tag, applying the explicit schema.
df_inproceedings = (
    spark.read.format('xml')
    .schema(schemaDBLP)
    .option('rowTag', 'inproceedings')
    .option('rootTag', 'inproceedings')
    .load('dblp.xml')
)

In [None]:
# Read dblp.xml with the 'article' row tag, applying the explicit schema.
df_article = (
    spark.read.format('xml')
    .schema(schemaDBLP)
    .option('rowTag', 'article')
    .option('rootTag', 'article')
    .load('dblp.xml')
)

## Eliminamos registros vacíos

In [None]:
# Discard rows whose "_key" is null.
df_incollection = df_incollection.dropna(subset=["_key"])

In [None]:
# Discard rows whose "_key" is null.
df_inproceedings = df_inproceedings.dropna(subset=["_key"])

In [None]:
# Discard rows whose "_key" is null.
df_article = df_article.dropna(subset=["_key"])

## Añadimos columna con el tipo de publicación

In [None]:
# Append a constant "type" column tagging every row as an incollection.
df_incollection = df_incollection.select('*', lit('incollection').alias('type'))
df_incollection.printSchema()

In [None]:
# Append a constant "type" column tagging every row as an inproceedings.
df_inproceedings = df_inproceedings.select('*', lit('inproceedings').alias('type'))
df_inproceedings.printSchema()

In [None]:
# Append a constant "type" column tagging every row as an article.
df_article = df_article.select('*', lit('article').alias('type'))
df_article.printSchema()

## Cambiamos "title" de struct a string

In [None]:
# Replace the "title" struct by its "_VALUE" string, keeping the column order.
df_incollection = df_incollection.select(
    '_key', 'author', col('title._VALUE').alias('title'), 'year', 'type'
)

In [None]:
# Replace the "title" struct by its "_VALUE" string, keeping the column order.
df_inproceedings = df_inproceedings.select(
    '_key', 'author', col('title._VALUE').alias('title'), 'year', 'type'
)

In [None]:
# Replace the "title" struct by its "_VALUE" string, keeping the column order.
df_article = df_article.select(
    '_key', 'author', col('title._VALUE').alias('title'), 'year', 'type'
)

## Eliminamos la etiqueta "_VALUE" del campo "author"

In [None]:
def tuples2list(lista):
    """Return a list holding the first field of every element of ``lista``.

    Each element is indexed with ``[0]``, so it must be a tuple-like value.
    ``None`` is passed through unchanged so missing inputs stay missing.
    """
    if lista is None:
        return None
    return [item[0] for item in lista]
# Wrap tuples2list as a Spark UDF whose declared return type is an array of
# strings; the second ArrayType argument (False) declares the elements non-null.
tuples2list_udf = udf(tuples2list, ArrayType(StringType(),False))

In [None]:
# Flatten the array of author structs into a plain array of strings.
df_incollection = df_incollection.withColumn("author", tuples2list_udf(col("author")))
df_incollection.printSchema()

In [None]:
# Preview the first two rows.
df_incollection.head(2)

In [None]:
# Flatten the array of author structs into a plain array of strings.
df_inproceedings = df_inproceedings.withColumn("author", tuples2list_udf(col("author")))
df_inproceedings.printSchema()

In [None]:
# Preview the first two rows.
df_inproceedings.head(2)

In [None]:
# Flatten the array of author structs into a plain array of strings.
df_article = df_article.withColumn("author", tuples2list_udf(col("author")))
df_article.printSchema()

In [None]:
# Preview the first two rows.
df_article.head(2)

## Grabamos en .json

In [None]:
# Collapse to a single partition and write the rows as JSON to 'json_incollection'.
df_incollection.coalesce(1).write.json('json_incollection')

In [None]:
# Collapse to a single partition and write the rows as JSON to 'json_inproceedings'.
df_inproceedings.coalesce(1).write.json('json_inproceedings')

In [None]:
# Collapse to a single partition and write the rows as JSON to 'json_article'.
df_article.coalesce(1).write.json('json_article')

# Neo4j

## Unimos los tres dataframes

In [None]:
# Stack the three publication DataFrames into one, in the same order as before:
# incollection, then inproceedings, then article.
df_union = df_incollection.union(df_inproceedings)
df_union = df_union.union(df_article)

In [None]:
# Total number of rows across the three publication types.
df_union.count()

In [None]:
# Print the schema of the combined DataFrame.
df_union.printSchema()

### CASO 1. Grabamos csv para carga en neo4j

In [None]:
def list2neo4jstr(lista):
    """Join a list of strings into a single ';'-delimited string.

    The elements are joined directly instead of post-processing ``str(lista)``.
    The repr-based approach embedded Python quote characters in the output
    and rewrote commas that belong to an element (e.g. "Smith, John") as
    delimiters.

    Args:
        lista: list of strings, or ``None``.

    Returns:
        The elements joined with ';' (an empty string for an empty list), or
        ``None`` when ``lista`` is ``None``. ``None`` elements are skipped.
    """
    if lista is None:
        return None
    return ";".join(name for name in lista if name is not None)

# Wrap list2neo4jstr as a Spark UDF returning a string column.
list2neo4jstr_udf = udf(list2neo4jstr, StringType())

In [None]:
# Replace the author array by its single-string form for the one-file CSV.
# The column must be taken from df_union: `df` is only being defined by this
# statement, so the previous `df.author` raised NameError on a fresh run.
df = df_union.withColumn("author", list2neo4jstr_udf(df_union.author))

In [None]:
# Collapse to a single partition and write a CSV with a header row.
df.coalesce(1).write.csv("neo4j_input.csv", header=True)

### CASO 2. Grabamos 3 csv para importar en neo4j

In [None]:
# Node table for publications: key and year, with the typed column headers
# "collectionId:ID(Collection)" and "year:INT".
df_colecciones = df_union.select("_key","year")
df_colecciones = df_colecciones.withColumnRenamed("_key", "collectionId:ID(Collection)")
# The source column is "year"; the previous "yar" typo made this rename a
# silent no-op, so the ":INT" header was never produced.
df_colecciones = df_colecciones.withColumnRenamed("year","year:INT")
df_colecciones.printSchema()
df_colecciones.count()

In [None]:
# Relationship table: one row per (publication key, author) pair, obtained by
# exploding the author array and naming both columns in the same select.
df_relaciones = df_union.select(
    col("_key").alias(":START_ID(Collection)"),
    explode(col("author")).alias(":END_ID(Author)")
)
df_relaciones.printSchema()
df_relaciones.count()

In [None]:
# Node table for authors: the distinct author values of the relationship
# table, exposed under the "authorId:ID(Author)" header.
df_autores = (
    df_relaciones
    .select(col(":END_ID(Author)").alias("authorId:ID(Author)"))
    .distinct()
)
df_autores.printSchema()
df_autores.count()

In [None]:
# Collapse to a single partition and write a CSV with a header row.
df_colecciones.coalesce(1).write.csv("colecciones.csv", header=True)

In [None]:
# Collapse to a single partition and write a CSV with a header row.
df_relaciones.coalesce(1).write.csv("relaciones.csv", header=True)

In [None]:
# Collapse to a single partition and write a CSV with a header row.
df_autores.coalesce(1).write.csv("autores.csv", header=True)

# ANEXO 1. Consultas con Spark SQL

In [None]:
# Build one row per (publication, author) pair and expose it to Spark SQL.
# The array to explode is df_union.author. The previously used `df.author`
# belongs to a different DataFrame and holds the ';'-joined string, which
# explode() cannot process.
df2 = df_union.withColumn("author_separado", explode(df_union.author))
df3 = df2.select('_key','author_separado','title','year','type')
# createOrReplaceTempView supersedes registerTempTable (deprecated since Spark 2.0).
df3.createOrReplaceTempView('publicaciones')

#### Listado de todas las publicaciones de un autor determinado

In [None]:
# List author and title for every row of 'publicaciones' whose author is
# 'Javier M. Moguerza', then display the result.
a = spark.sql("""
SELECT author_separado as author,
       title           as titulo       
       FROM publicaciones 
WHERE author_separado ='Javier M. Moguerza'
""")
a.show()

#### Número de publicaciones de un autor determinado

In [None]:
# Count the rows of 'publicaciones' whose author is 'Javier M. Moguerza',
# then display the result.
a = spark.sql("""
SELECT author_separado as author,
       count(*)        as num_publicaciones 
       FROM publicaciones 
WHERE author_separado ='Javier M. Moguerza'
GROUP BY author_separado
""")
a.show()

#### Número de artículos en revista para el año 2017

In [None]:
# Count the articles published in 2017.
# The query reads the view 'authorjuntos', which was never registered, so it
# is created here from df_union (one row per publication, authors kept
# together in one array), which keeps count(*) a count of publications.
df_union.createOrReplaceTempView('authorjuntos')
a = spark.sql(""" SELECT count(*) as Num_articles
                  FROM authorjuntos 
                  WHERE year='2017' AND type='article'
               """)
a.show()

#### Número de autores ocasionales, es decir, que tengan menos de 5 publicaciones en total

In [None]:
# Count the authors that have fewer than 5 rows in 'publicaciones'.
# The inner query counts rows per author; the outer query counts how many of
# those authors fall below the threshold.
# NOTE(review): the ORDER BY inside the subquery should not affect the outer
# count - confirm it can be dropped.
e = spark.sql(""" SELECT count(*) as num_autores_ocasionales
                  FROM (SELECT  author_separado as author
                              , count(*)        as num_publicaciones
                          FROM publicaciones
                          GROUP BY author_separado
                          ORDER BY 2 asc
                         )
                  WHERE num_publicaciones < 5  
               """)
e.show()

#### Edad de los 5 autores con un periodo de publicación más largo

In [None]:
# Top 5 authors by "age", defined here as max(year) - min(year) over the
# author's rows in 'publicaciones'.
# NOTE(review): the ORDER BY inside the subquery should not affect the outer
# ordering, which is by the computed age descending - confirm it can be dropped.
f = spark.sql(""" SELECT author,
                        maxyear - minyear as edad_autor
                  FROM(  
                        SELECT   author_separado as author
                                ,max(year) as maxyear 
                                ,min(year) as minyear 
                        FROM publicaciones                         
                        GROUP BY author_separado
                        ORDER BY 1,2,3
                       ) 
                  ORDER BY 2 desc
                  LIMIT 5
               """)
f.show()

#### Número de autores novatos, es decir, que tengan una edad menor de 5 años

In [None]:
# Count the authors whose "age" (max(year) - min(year) over their rows in
# 'publicaciones') is below 5.
# NOTE(review): the ORDER BY clauses inside the nested subqueries should not
# affect the final count - confirm they can be dropped.
f = spark.sql(""" SELECT count(*) as Num_autores_novatos
                  FROM(
                          SELECT author,
                                 maxyear - minyear as edad_autor
                          FROM(  
                                 SELECT   author_separado as author
                                         ,max(year) as maxyear 
                                         ,min(year) as minyear 
                                 FROM publicaciones                                  
                                 GROUP BY author_separado
                                 ORDER BY 1,2,3
                               ) 
                          ORDER BY 2 desc
                       )   
                 WHERE edad_autor < 5
               """)
f.show()

# Paramos el contexto de Spark

In [None]:
# Stop the SparkContext obtained from the session.
sc.stop()