In [1]:
import json
import csv
import numpy as np

In [2]:
# Module-level accumulator: get_authors() fills it (author name -> per-author
# dict) and authors_to_json() serializes it.
authors = {}

In [3]:
def get_name(_person: dict) -> str:
    """Build a lowercase lookup key from a person's name parts.

    The ``firstname``, ``middlename`` and ``lastname`` entries are concatenated
    in that order, without any separator, and lower-cased. A part that is
    missing from the dict or falsy (``None``, ``''``) contributes nothing.

    Args:
        _person: mapping that may hold ``firstname``, ``middlename`` and
            ``lastname`` string entries.

    Returns:
        The concatenated, lower-cased name; ``''`` when no part is present.
    """
    # .get() rather than [] so a record lacking one of the keys does not raise KeyError.
    parts = (_person.get(key) or '' for key in ('firstname', 'middlename', 'lastname'))
    return ''.join(parts).lower()

In [4]:
def get_co_authors(current_rank: int, _persons: list, _co_authors = None) -> dict:
    """Count, per name, the people on a byline other than the current author.

    Args:
        current_rank: ``rank`` value of the author whose co-authors are being
            tallied; entries carrying this same rank are skipped.
        _persons: person dicts of one document; each needs a ``rank`` key plus
            whatever ``get_name`` reads.
        _co_authors: existing name -> count mapping to keep adding to. A
            non-empty mapping is updated in place; ``None`` or an empty mapping
            is replaced by a fresh dict.

    Returns:
        Mapping from co-author name (as produced by ``get_name``) to how often
        that name has been counted.
    """
    tally = _co_authors or {}

    # A byline with zero or one person contributes no co-authors.
    if len(_persons) <= 1:
        return tally

    for entry in _persons:
        if entry['rank'] == current_rank:
            continue
        key = get_name(entry)
        tally[key] = tally.get(key, 0) + 1

    return tally

In [5]:
def get_authors(start, end, data_dir="../data"):
    """Accumulate per-author statistics from monthly JSON files into ``authors``.

    For every year in ``start..end`` (inclusive) and every month 1-12 the file
    ``{data_dir}/{year}_{month}.json`` is loaded. Each entry of
    ``data['response']['docs']`` whose ``document_type`` is ``'article'`` and
    whose byline carries at least one person updates the module-level
    ``authors`` dict. For each person, keyed by ``get_name(person)``:

    * a counter per ``news_desk`` value is incremented (when the doc has one);
    * the ``'co_authors'`` mapping is extended via ``get_co_authors``.

    Docs whose ``byline`` is missing, is not a dict, or holds no persons are
    skipped rather than raising.

    Args:
        start: first year to read.
        end: last year to read (inclusive).
        data_dir: directory containing the ``{year}_{month}.json`` files.

    Returns:
        None. Results are written into the global ``authors`` dict. Each
        processed year is printed, followed by a space, as a progress marker.

    Raises:
        OSError: propagated from ``open`` when a monthly file cannot be read
            (e.g. ``FileNotFoundError``).
    """
    for year in range(start, end + 1):
        print(year, end=" ")

        for month in range(1, 13):
            with open(f"{data_dir}/{year}_{month}.json", "r") as file:
                data = json.load(file)

            for doc in data['response']['docs']:
                if doc.get('document_type') != 'article':
                    continue

                # Read the person list defensively: the byline may be absent or
                # not a dict, and 'person' may be None or empty.
                byline = doc.get('byline')
                persons = byline.get('person') if isinstance(byline, dict) else None
                if not persons:
                    continue

                news_desk = doc.get('news_desk')
                # section_name / subsection_name counting was disabled in the
                # original cell; it would follow the news_desk pattern below.

                for person in persons:
                    name = get_name(person)

                    # First sighting creates the author's sub-dict; later ones reuse it.
                    author = authors.setdefault(name, {})

                    if news_desk:
                        author[news_desk] = author.get(news_desk, 0) + 1

                    author['co_authors'] = get_co_authors(
                        person['rank'], persons, author.get('co_authors')
                    )

In [6]:
def authors_to_json(j_year: int = None ):
    """Dump the global ``authors`` dict as JSON under ``../data``.

    Args:
        j_year: when given, the output file is ``authors_<j_year>.json``;
            when ``None`` (the default) it is ``authors.json``.
    """
    # Only the file name depends on the argument; the write itself is shared.
    if j_year is None:
        target = '../data/authors.json'
    else:
        target = f'../data/authors_{j_year}.json'

    with open(target, 'w') as output:
        json.dump(authors, output)


In [7]:
# Build one author file per year (2016 through 2020): `authors` is rebound to
# an empty dict before each year, so the dump written by authors_to_json(i)
# holds that single year only.
for i in range(2016, 2021):
    authors = {}
    get_authors(i, i)
    authors_to_json(i)

# Disabled experiment: would write one ';'-separated CSV row per author listing
# that author's co-author names.
# with open('../data/authors_2016.csv', mode='w') as file:
#     file_writer = csv.writer(file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#
#     for author, values in authors.items():
#         author_list = []
#
#         if len(values['co_authors']) > 0:
#             for co_author in values['co_authors']:
#                 author_list.append(co_author)
#
#         file_writer.writerow(author_list)
#

2016 2017 2018 2019 2020 

In [8]:
# Rebuild `authors` across 2016-2020 in a single pass and dump it to the
# un-suffixed file (authors_to_json() called without a year).
authors = {}
get_authors(2016, 2020)
authors_to_json()


# Disabled experiment: would write a ';'-separated author-by-author matrix
# whose cells hold the co-author counts (0 where there is none).
# with open('../data/authors_matrix_2016.csv', mode='w') as file:
#     file_writer = csv.writer(file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#
#     header = [";"] + list(authors.keys())
#     file_writer.writerow(header)
#
#     for author, values in authors.items():
#         row = [author] + list(np.zeros(len(header) - 1, dtype=int))
#         row[header.index(author)] = 0
#
#         if len(values['co_authors']) > 0:
#             for co_author, weight in values['co_authors'].items():
#                 row[header.index(co_author)] = weight
#
#         file_writer.writerow(row)

2016 2017 2018 2019 2020 