In [97]:
!pip install pyspark



In [98]:
import os
import sys
import re
import string
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, trim
from pyspark.sql.functions import lower, col, explode

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [99]:
def init_spark():
  spark = SparkSession.builder.appName("BigData").getOrCreate()
  sc = spark.sparkContext
  return spark,sc

In [100]:
def load_data_from_json(spark, file_name):
    return spark.read.json(file_name)

In [101]:
sc = init_spark()[0]
arxiv_dataset = load_data_from_json(sc, "part1.json")
arxiv_dataset.take(5)

[Row(abstract='  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboson are contrasted with those produced from QCD processes at the LHC, showing\nthat enhanced sensitivity to the signal can be obtained with judicious\nselection 

## **Useful functions**

In [102]:
def remove_math_formula(df, field_name):
    return df.withColumn(field_name, regexp_replace(col(field_name), "\$.*?\$", ""))

def convert_to_lowercase(df, field_name):
    return df.withColumn(field_name, lower(col(field_name)))

def remove_punctuation_except_dot(df, field_name):
    pattern = "[^a-zA-Z0-9\s\.,]"
    cleaned_df = df.withColumn(field_name, regexp_replace(col(field_name), pattern, ""))
    return cleaned_df

def remove_empty_newlines(df, field_name):
    return df.withColumn(field_name, regexp_replace(col(field_name), "\n", " "))

def remove_extra_spaces(df, field_name):
    df = df.withColumn(field_name, trim(regexp_replace(col(field_name), " +", " ")))
    return df

def remove_text_in_braces(df, field_name):
    pattern = r'[\(\{\[].*?[\)\}\]]'
    cleaned_df = df.withColumn(field_name, regexp_replace(col(field_name), pattern, ''))
    return cleaned_df

def remove_punctuation(df, field_name):
    pattern = r'[\\"\^]'
    cleaned_df = df.withColumn(field_name, regexp_replace(col(field_name), pattern, ''))
    return cleaned_df

def remove_text_between_parentheses(strings_list):
    result_list = []
    for string in strings_list:
        modified_string = re.sub(r'\([^()]*\)', '', string)
        result_list.append(modified_string)
    return result_list

def remove_substring_from_list(input_list, substring_to_remove):
    return [element.replace(substring_to_remove, "") for element in input_list]

def split_function(sep, authors_list):
  for i in range(len(authors_list)):
      element = authors_list[i]
      author = element.split(sep)
      authors_list[i] = author
  authors_list = set([item for sublist in authors_list for item in sublist])
  return list(authors_list)

def delete_new_line_and_spaces(author_list):
  for i in range(len(author_list)):
    if author_list[i].startswith("\n "):
      author_list[i] = author_list[i].replace("\n ", '', 1)
  for i in range(len(author_list)):
    author_list[i] = author_list[i].strip()
  for author in author_list:
    if author == '':
      author_list.remove(author)
  return author_list

def delete_duplicate(author_list):
  distinct_list = []
  for author in author_list:
    if author not in distinct_list:
      distinct_list.append(author)
  return distinct_list

def delete_letter(author_list):
  for author in author_list:
    if len(author) == 1:
      author_list.remove(author)
    elif len(author) == 2 and author[-1] == ".":
      author_list.remove(author)
  return author_list

def remove_characters_before_substring(original_list, target_substring):
    return [remove_characters(element, target_substring) for element in original_list]

def remove_characters(original_string, target_substring):
    index = original_string.lower().find(target_substring.lower())
    if index != -1:
        return original_string[index + len(target_substring):]
    else:
        return original_string

def delete_element(input_list, substr):
  for element in input_list:
    if substr.lower() in element.lower():
      input_list.remove(element)
  return input_list

## **Article categories**

In [103]:
categories_df = arxiv_dataset.select("categories")
distinct_categories_df = categories_df.distinct()
categories_list = distinct_categories_df.rdd.flatMap(lambda x: x).collect()

for i in range(len(categories_list)):
    element = categories_list[i]
    categories = element.split()
    categories_list[i] = categories

categories_list = set([item for sublist in categories_list for item in sublist])

print(categories_list)

{'math.AT', 'econ.EM', 'math.CT', 'math.MP', 'physics.flu-dyn', 'cs.NA', 'hep-ph', 'physics.acc-ph', 'physics.plasm-ph', 'math-ph', 'nlin.AO', 'math.OA', 'hep-lat', 'cs.NI', 'stat.ME', 'cs.CV', 'math.KT', 'math.RA', 'math.DG', 'cs.CG', 'cs.NE', 'math.ST', 'astro-ph.EP', 'physics.class-ph', 'math.GT', 'cs.PL', 'math.GR', 'math.FA', 'q-fin.PM', 'q-bio.QM', 'cs.AI', 'cs.RO', 'math.AG', 'quant-ph', 'math.MG', 'cond-mat.str-el', 'cs.CY', 'cond-mat.other', 'q-bio.GN', 'stat.AP', 'physics.comp-ph', 'math.HO', 'physics.chem-ph', 'physics.ed-ph', 'physics.data-an', 'cond-mat.dis-nn', 'nlin.CG', 'q-bio.NC', 'nlin.CD', 'cond-mat.quant-gas', 'math.SG', 'math.QA', 'cs.AR', 'q-fin.RM', 'nlin.SI', 'math.IT', 'math.NT', 'math.CO', 'physics.ins-det', 'math.NA', 'stat.TH', 'stat.ML', 'q-bio.PE', 'cs.SC', 'math.LO', 'gr-qc', 'nlin.PS', 'cs.DS', 'astro-ph', 'cs.LO', 'cs.HC', 'physics.gen-ph', 'physics.atom-ph', 'physics.ao-ph', 'q-bio.BM', 'physics.space-ph', 'cs.CR', 'physics.atm-clus', 'stat.CO', 'astro

## **Articles authors**

In [104]:
authors_df = arxiv_dataset.select("authors")
distinct_authors_df = authors_df.distinct()
authors_list = distinct_authors_df.rdd.flatMap(lambda x: x).collect()

authors_list = remove_text_between_parentheses(authors_list)
authors_list = remove_text_between_parentheses(authors_list)

authors_list = remove_substring_from_list(authors_list, "et al")
authors_list = remove_characters_before_substring(authors_list, "Collaboration:")

authors_list = split_function(",", authors_list)
authors_list = split_function(" and ", authors_list)

authors_list = delete_new_line_and_spaces(authors_list)
authors_list = delete_duplicate(authors_list)
authors_list = delete_letter(authors_list)
authors_list = delete_element(authors_list, "Collaboration")

print(authors_list)
print(len(authors_list))

['Xiaolin Li', 'M. F. Bode', 'W. Mike Snow', 'Nadine Nettelmann', 'Alex Golovin', 'Catalina Curceanu', 'Ian Doust', 'M. Hewitson', 'Jounghun Lee', 'L. Oberauer', 'I. S. Hahn', 'Seung Woo Hong', 'Zhihong Lu', 'Jing Zeng', 'Zinaida A. Lykova', 'H.J. Kimble', 'Chen Avin', 'G. Newsham', 'J.B. Jones', 'Jeff Schneider', 'I.I. Mazin', "T\\^ania Tom\\'e", 'A. C. Aguilar', 'C. Montanari', 'Roman Krcmar', 'Tudor S. Ratiu', 'Peng Dong', 'V.R.R. Medicherla', 'Ding-fang Zeng', 'Lauro Tomio', 'Joshua J. Waterfall', 'R. Mukherjee', "I.Ya. Aref'eva", 'Anders Eriksson', 'Slavek M. Rucinski', 'Chia-Cheng Chen', 'Luca Fanelli', 'Stefan Bornholdt', 'T. K. Sridharan', 'Y. Kantor', 'D. W. Lunney', 'V.S. Timoteo', 'Phongpichit Channuie', 'A. Rogalev', 'Nino Zangh\\`i', 'N. I. Zheludev', 'Akimichi Takemura', 'A. Retico', 'Shinji Miyoki', 'Rupert L. Frank', 'Stephen Brierley', 'T. R.\n  Schibli', 'Dazhi Liu', 'Arthur Jaffe', 'V. Bozza', 'O. Oliveira', 'P. Fulde', 'Jonghoon Lee', 'J. M. Barr', 'Yunchang Shin', 

## **Find prolific authors for a specific category**

In [105]:
for category in categories_list:
  filtered_df = arxiv_dataset.filter(col("categories").like(f"%{category}%"))
  authors_per_cathegory = filtered_df.select("authors").distinct().rdd.flatMap(lambda x: x).collect()

  profilic_authors = {}
  for author in authors_per_cathegory:
      for author_name in authors_list:
          if author_name.lower() in author.lower():
              if author_name in profilic_authors:
                  profilic_authors[author_name] += 1
              else:
                  profilic_authors[author_name] = 1
  max_author = max(profilic_authors, key=profilic_authors.get)
  print("Category: " + category + " Author: " + str(max_author))


Category: math.AT Author: Koichi Fujii
Category: econ.EM Author: Victor Chernozhukov
Category: math.CT Author: David Ellerman
Category: math.MP Author: Hector Oviedo
Category: physics.flu-dyn Author: Baruch Meerson
Category: cs.NA Author: Lester Ingber
Category: hep-ph Author: Sheng Wang
Category: physics.acc-ph Author: A.A.Mikhailichenko
Category: physics.plasm-ph Author: S. C. Cowley
Category: math-ph Author: Hector Oviedo
Category: nlin.AO Author: Carson C. Chow
Category: math.OA Author: George A. Elliott
Category: hep-lat Author: H. Neuberger
Category: cs.NI Author: Omid Amini
Category: stat.ME Author: Victor Chernozhukov
Category: cs.CV Author: Tshilidzi Marwala
Category: math.KT Author: Philippe Elbaz-Vincent
Category: math.RA Author: Ming-chang Kang
Category: math.DG Author: L. You
Category: cs.CG Author: Fajie Li
Category: cs.NE Author: Mohd Abubakr
Category: math.ST Author: Victor Chernozhukov
Category: astro-ph.EP Author: Natalia Miller
Category: physics.class-ph Author: Phil