In [0]:
# Notebook parameters as (widget name, default value) pairs. The "{}" in the
# last two defaults is a str.format placeholder.
widget_defaults = (
  ("raw_documents", "/FileStore/tables/cfuentes/input"),
  ("english_documents", "/FileStore/tables/cfuentes/english_articles.parquet"),
  ("origin_path", "/FileStore/tables/cfuentes/input/{}.json"),
  ("error_destination_path", "/FileStore/tables/cfuentes/error/{}.json"),
)

# Register one text widget per parameter, in declaration order.
for widget_name, default_value in widget_defaults:
  dbutils.widgets.text(widget_name, default_value)

In [0]:
%python

# Location of the raw JSON documents, taken from the "raw_documents" widget.
path = dbutils.widgets.get("raw_documents")

# Read the documents as multi-line JSON, inferring every primitive value as a
# string. The option key must be exactly "primitivesAsString": with a trailing
# space Spark does not recognise the key and silently ignores the setting.
jsonDocs = (
  spark.read
  .option("primitivesAsString", "true")
  .option("multiLine", "true")
  .json(path)
)

# Number of documents loaded (shown as the cell output).
jsonDocs.count()

In [0]:
from langdetect import detect
from pyspark.sql.functions import udf
import numpy as np
from pyspark.sql.types import StringType

def language_detection(text):
  """Return the language detected for `text`, or None when it cannot be detected.

  Args:
    text: Text to classify. None, non-string and blank values are accepted.

  Returns:
    The value returned by `detect(text)`, or None if `text` is not a
    non-blank string or if detection raises an exception.
  """
  # Nothing to classify: answer None directly instead of relying on the
  # detector to raise for unusable input.
  if not isinstance(text, str) or not text.strip():
    return None
  try:
    return detect(text)
  except Exception:
    # Best effort: any detection failure maps to None. Unlike a bare
    # `except:`, this lets KeyboardInterrupt and SystemExit propagate.
    return None

# Spark UDF wrapper around language_detection, declared to return a string.
langdetect_udf = udf(f=language_detection, returnType=StringType())

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType

def get_name_list(author_list):
  """Build one display name per author as "first middle... last".

  Args:
    author_list: Iterable of objects exposing `first`, `middle` (a list of
      middle names, possibly empty or None) and `last` attributes, or None.

  Returns:
    A list with one space-separated name per author, in input order, or None
    when `author_list` is None. Name components that are None or empty are
    left out, so the result never carries stray leading, trailing or doubled
    spaces.
  """
  # A missing author array stays missing instead of raising TypeError.
  if author_list is None:
    return None

  names = []
  for author in author_list:
    # `middle` may be None or empty; either way it contributes no parts.
    parts = [author.first, *(author.middle or []), author.last]
    names.append(" ".join(part for part in parts if part))
  return names
  

# Spark UDF wrapper around get_name_list, declared to return an array of strings.
get_name_list_udf = udf(f=get_name_list, returnType=ArrayType(StringType()))

In [0]:
from pyspark.sql.functions import array_join

# Join the `text` arrays of the abstract and of the body into single
# space-separated strings.
abstract_text = array_join(jsonDocs["abstract"]["text"], " ")
body_text = array_join(jsonDocs["body_text"]["text"], " ")

# Project each document down to its id, joined abstract, the language detected
# from the joined body text, and the list of author names.
jsonDocs = jsonDocs.select(
  "paper_id",
  abstract_text.alias("abstract_text"),
  langdetect_udf(body_text).alias("language"),
  get_name_list_udf("metadata.authors").alias("authors"),
)


In [0]:
# Keep only the documents whose detected language is 'en'.
jsonDocsW = jsonDocs.where(jsonDocs["language"] == "en")

# Number of documents kept (shown as the cell output).
jsonDocsW.count()

In [0]:
# Output location, taken from the "english_documents" widget.
o_path = dbutils.widgets.get("english_documents")

# Write the filtered documents as Parquet, overwriting any existing output.
jsonDocsW.write.parquet(o_path, mode="overwrite")

In [0]:
# Path templates ("{}" is replaced by the paper id) for where a source file
# currently lives and where rejected files are moved to.
origin_path = dbutils.widgets.get("origin_path")
error_destination_path = dbutils.widgets.get("error_destination_path")

# Every document that is not 'en' is rejected, including those whose language
# is NULL because detection failed. A plain `language != 'en'` evaluates to
# NULL for such rows, so the filter would drop them and their files would
# never be moved.
rejected_docs = jsonDocs.filter("language IS NULL OR language != 'en'")
error_list = [r[0] for r in rejected_docs.select('paper_id').toLocalIterator()]

# Move each rejected document's source file to the error location.
for error_id in error_list:
  dbutils.fs.mv(origin_path.format(error_id), error_destination_path.format(error_id))

In [0]:
# End the notebook run, passing "Ok" as its exit value.
dbutils.notebook.exit("Ok")