In [1]:
import json
import csv
import numpy as np
from numpy import linalg
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def get_authors() -> list:
    """Load the author records from ``../data/authors.json``.

    The file must contain a JSON object whose keys are author names and whose
    values are dicts. The entry with the empty name ``""`` and every author's
    ``'co_authors'`` entry are removed when present; all remaining entries of
    each value dict are kept as they are.

    Returns:
        A list of ``[author_name, values]`` pairs, in the order the authors
        appear in the JSON object.
    """
    # JSON text is UTF-8 by specification, so do not rely on the platform's
    # default encoding when reading it.
    with open('../data/authors.json', encoding='utf-8') as authors_json_file:
        a_authors = json.load(authors_json_file)

    # A default makes the removal a no-op instead of a KeyError when the file
    # has no placeholder entry for the empty author name.
    a_authors.pop("", None)

    result = []
    for author, values in a_authors.items():
        # Same reasoning: not every record is guaranteed to carry the key.
        values.pop('co_authors', None)
        result.append([author, values])

    return result

In [3]:
def get_sections(s_authors: list) -> list:
    """Collect the distinct keys used across all authors' value dicts.

    Args:
        s_authors: Sequence of ``[author_name, values]`` pairs; only the
            second element (a mapping) of each pair is inspected.

    Returns:
        Every key that occurs in at least one ``values`` mapping, exactly
        once, in the order in which it is first encountered.
    """
    # dict.fromkeys de-duplicates like a set but keeps insertion order, so the
    # resulting list is the same on every run (a plain set of strings is
    # ordered by randomised hashes and changes between kernel restarts).
    return list(dict.fromkeys(
        key for author in s_authors for key in author[1].keys()
    ))

In [4]:
def pearson_correlation(p_authors):
    """Write pairwise Pearson correlations between authors as a CSV edge list.

    Each author's ``values`` dict is expanded into a dense vector with one
    slot per entry of the notebook-level ``sections`` list (missing keys are
    0.0). Every unordered pair of authors is visited once; a row is written
    only when the two authors share at least one key.

    Output: ``../data/pearson_edges.csv``, ';'-separated, with the header
    ``source;target;weight``. The weight is ``np.corrcoef`` of the two
    vectors, rounded to 4 decimals.

    Args:
        p_authors: Sequence of ``(author_name, values)`` pairs where
            ``values`` is a dict keyed by entries of ``sections``.

    Raises:
        KeyError: If a key of some ``values`` dict is not in ``sections``.
    """
    # Resolve each section's column once; list.index() is a linear scan and
    # was previously repeated for every value of every author pair.
    column_of = {}
    for position, section in enumerate(sections):
        column_of.setdefault(section, position)  # first occurrence, like list.index

    length = len(p_authors)
    dim = len(sections)
    author_array = np.empty(dim)
    co_author_array = np.empty(dim)

    # newline='' is what the csv module asks for; without it the writer's
    # '\r\n' terminator is translated again on Windows and yields blank rows.
    with open('../data/pearson_edges.csv', mode='w', newline='', encoding='utf-8') as pearson_file:
        file_writer = csv.writer(pearson_file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        file_writer.writerow(['source', 'target', 'weight'])

        for i in range(length):
            author, values = p_authors[i]
            author_array.fill(0.0)
            for key, value in values.items():
                author_array[column_of[key]] = value

            for j in range(i + 1, length):
                co_author, co_values = p_authors[j]

                # Decide whether the pair gets an edge before paying for the
                # dense co-author vector.
                if values.keys().isdisjoint(co_values):
                    continue

                co_author_array.fill(0.0)
                for co_key, co_value in co_values.items():
                    co_author_array[column_of[co_key]] = co_value

                # NOTE(review): if either vector is constant, corrcoef is
                # expected to produce nan, which is written unchanged --
                # confirm whether such rows are wanted downstream.
                similarity = round(np.corrcoef(author_array, co_author_array)[0, 1], 4)
                file_writer.writerow([author, co_author, similarity])

In [5]:
def minkowski_distance(minkowski_data: list, n: float) -> float:
    """Return the order-``n`` Minkowski distance of paired values, plus one.

    Args:
        minkowski_data: Sequence of pairs; items ``[0]`` and ``[1]`` of each
            element are the two coordinates being compared.
        n: Order of the distance (1 gives the sum of absolute differences,
            2 the Euclidean form).

    Returns:
        ``(sum(|a - b| ** n)) ** (1 / n) + 1``. The constant ``+ 1`` offset is
        part of the result; its purpose is not stated in this notebook.
    """
    accumulated = 0
    for pair in minkowski_data:
        accumulated += abs(pair[0] - pair[1]) ** n

    root = accumulated ** (1 / n)
    return root + 1

In [6]:
def calc_minkowski_distance(m_authors: list, n: float, mode: str):
    """Write pairwise Minkowski distances between authors as a CSV edge list.

    For every unordered pair of authors that share at least one key, a
    ``(len(sections), 2)`` table is built with the first author's values in
    column 0 and the second author's in column 1 (0 where a key is missing),
    and its rows are passed to ``minkowski_distance``.

    Output: ``../data/{mode}.csv``, ';'-separated, with the header
    ``source;target;weight``; the weight is the distance rounded to 2
    decimals.

    Args:
        m_authors: Sequence of ``[author_name, values]`` pairs where
            ``values`` is a dict keyed by entries of the notebook-level
            ``sections`` list.
        n: Order forwarded to ``minkowski_distance``.
        mode: Base name (without extension) of the CSV file to write.

    Raises:
        KeyError: If a key of some ``values`` dict is not in ``sections``.
    """
    # Resolve each section's row once instead of calling sections.index()
    # (a linear scan) for every value of every pair.
    row_of = {}
    for position, section in enumerate(sections):
        row_of.setdefault(section, position)  # first occurrence, like list.index

    length = len(m_authors)
    dim = len(sections)
    author_column = np.zeros(dim)

    # newline='' is what the csv module asks for; without it the writer's
    # '\r\n' terminator is translated again on Windows and yields blank rows.
    with open(f'../data/{mode}.csv', mode='w', newline='', encoding='utf-8') as minkowski_file:
        file_writer = csv.writer(minkowski_file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        file_writer.writerow(["source", "target", "weight"])

        # Pair authors by position (i < j), like the other similarity
        # functions in this notebook. The previous scan matched authors by
        # name, which walked the whole list for every author and mis-paired
        # records whenever a name occurred twice.
        for i in range(length):
            author = m_authors[i]
            author_column.fill(0.0)
            for key, value in author[1].items():
                author_column[row_of[key]] = value

            for j in range(i + 1, length):
                colleague = m_authors[j]

                # Only authors with at least one common key get an edge;
                # check this before building the table.
                if author[1].keys().isdisjoint(colleague[1]):
                    continue

                m_data = np.zeros((dim, 2))
                m_data[:, 0] = author_column
                for co_key, co_value in colleague[1].items():
                    m_data[row_of[co_key], 1] = co_value

                distance = minkowski_distance(list(m_data), n)
                file_writer.writerow([author[0], colleague[0], round(distance, 2)])

In [7]:
def cos_sim(c_authors: list = None):
    """Write pairwise cosine similarities between authors as a CSV edge list.

    Each author's ``values`` dict is expanded into a dense vector with one
    slot per entry of the notebook-level ``sections`` list (missing keys are
    0.0). Every unordered pair is visited once and a row is written only when
    the similarity is strictly positive.

    Output: ``../data/cosine_edges.csv``, ';'-separated, with the header
    ``source;target;weight``; the weight is rounded to 4 decimals.

    Args:
        c_authors: Sequence of ``(author_name, values)`` pairs. When omitted,
            the notebook-level ``authors`` list is used, so existing
            ``cos_sim()`` calls keep working.

    Raises:
        KeyError: If a key of some ``values`` dict is not in ``sections``.
    """
    if c_authors is None:
        c_authors = authors

    # Resolve each section's column once instead of calling sections.index()
    # (a linear scan) for every value of every pair.
    column_of = {}
    for position, section in enumerate(sections):
        column_of.setdefault(section, position)  # first occurrence, like list.index

    length = len(c_authors)
    dim = len(sections)
    author_array = np.empty(dim)
    co_author_array = np.empty(dim)

    # newline='' is what the csv module asks for; without it the writer's
    # '\r\n' terminator is translated again on Windows and yields blank rows.
    with open('../data/cosine_edges.csv', mode='w', newline='', encoding='utf-8') as cosine_file:
        file_writer = csv.writer(cosine_file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        file_writer.writerow(['source', 'target', 'weight'])

        for i in range(length):
            author, values = c_authors[i]
            author_array.fill(0.0)
            for key, value in values.items():
                author_array[column_of[key]] = value
            author_norm = linalg.norm(author_array)

            # A zero vector has no direction: the quotient below would be 0/0
            # (nan plus a RuntimeWarning) and never pass the "> 0" filter.
            if author_norm == 0.0:
                continue

            for j in range(i + 1, length):
                co_author, co_values = c_authors[j]

                # With no key in common the dot product cannot be positive,
                # so no row would be written; skip before building the vector.
                if values.keys().isdisjoint(co_values):
                    continue

                co_author_array.fill(0.0)
                for co_key, co_value in co_values.items():
                    co_author_array[column_of[co_key]] = co_value
                co_author_norm = linalg.norm(co_author_array)
                if co_author_norm == 0.0:
                    continue

                similarity = np.dot(author_array, co_author_array) / (author_norm * co_author_norm)
                if similarity > 0.0:
                    file_writer.writerow([author, co_author, round(similarity, 4)])

In [8]:
# Load the author records once, then derive the list of distinct keys
# ("sections") that the similarity functions use to lay out their vectors.
authors = get_authors()
sections = get_sections(s_authors=authors)
print('initial done')

initial done


In [12]:
# Order n = 2 (Euclidean form); the edge list goes to ../data/euclid.csv.
calc_minkowski_distance(m_authors=authors, n=2, mode='euclid')
print('euclid')

euclid


In [13]:
# Order n = 1 (sum of absolute differences); the edge list goes to
# ../data/manhattan.csv.
calc_minkowski_distance(m_authors=authors, n=1, mode='manhattan')
print('manhattan')

KeyboardInterrupt: 

In [None]:
# Fractional order n = 0.5; the edge list goes to ../data/frac.csv.
calc_minkowski_distance(m_authors=authors, n=0.5, mode='frac')
print('frac')

In [10]:
# Cosine similarity for every author pair; writes ../data/cosine_edges.csv.
# Depends on the notebook-level `authors` and `sections` being defined first.
cos_sim()
print('cos')

cos


In [11]:
# Pearson correlation for every author pair sharing at least one key;
# writes ../data/pearson_edges.csv.
pearson_correlation(p_authors=authors)
print('pearson')

pearson


In [8]:
# Load the ';'-separated edge lists (source;target;weight) for plotting.
pcc = pd.read_csv('../data/pearson_edges.csv', sep=';')
# The Euclidean run is written as '../data/{mode}.csv' with mode='euclid';
# no visible cell produces an 'euclidean_edges.csv', so read the file that
# is actually generated.
euc = pd.read_csv('../data/euclid.csv', sep=';')


FileNotFoundError: [Errno 2] No such file or directory: '../data/pearson_edges.csv'

In [None]:
# Cumulative histogram (100 bins) of the Euclidean edge weights.
euc['weight'].plot(kind='hist', bins=100, cumulative=True)

In [None]:
# Coarsen the Pearson weights to 2 decimals (stored back into `pcc`), then
# draw a cumulative histogram of their absolute values on a fresh figure.
pcc['weight'] = pcc['weight'].round(2)
plt.figure()
pcc['weight'].abs().plot(kind='hist', bins=100, cumulative=True)
