# Generate Network Data Files

This script generates two CSV files: one describing a node for every professor and one describing the edges, i.e. how many publications each pair of professors has co-authored. It also creates a TXT file that assigns every professor node to a research field group. These files are later used by MATLAB scripts to create network figures showing the cooperation between professors.

In [1]:
import csv
import matplotlib.pyplot as plt
import pandas as pd
import math
import ast
import numpy as np
from tueplots.constants.color import rgb

# Global matplotlib defaults: 300 dpi figures with Times New Roman as the font family.
plt.rcParams.update({
    "figure.dpi": 300,
    'font.family': 'Times New Roman',
})

In [2]:
# Output locations (relative to this notebook) of the three generated files.
# One CSV row per professor node.
nodes_file = './../dat/professor_nodes.csv'
# One CSV row per (professor, coauthor) pair with its cooperation count.
edges_file = './../dat/professor_edges.csv'
# Semicolon-separated text file listing the node ids of each research field group.
groups_file = './../dat/professor_groups.txt'

In [3]:
# Load the publication list and drop repeated (Professor, Title) pairs,
# keeping the first occurrence of each.
df = pd.read_csv("./../dat/Publications.csv").drop_duplicates(
    subset=['Professor', 'Title'], keep='first'
)
professors = df["Professor"].unique()

# Load the classified publications and keep only the rows whose
# 'Research Fields' cell parses (as a Python literal) to something non-empty.
classified_titles_df = pd.read_csv("./../dat/ClassifiedPublications.csv")
has_research_fields = classified_titles_df['Research Fields'].apply(
    lambda cell: len(ast.literal_eval(cell)) > 0
)
successfully_classified_titles_df = classified_titles_df[has_research_fields]

# The position of a field in this list is used as its group index further
# down, so the order matters.
research_fields = [
    'Computer Vision',
    'Robotics',
    'Bioinformatics and Computational Biology',
    'Human-Computer Interaction',
    'Design Automation of Microelectronic Systems (EDA)',
    'Visualization',
    'Web Information Retrieval',
    'Natural Language Processing',
    'Machine Learning',
]

# The same nine fields in a different order, followed by one colour per entry.
# NOTE(review): presumably field and colour are paired by position by the
# plotting code -- confirm against the consumer of these two lists.
sorted_research_fields = [
    "Machine Learning",
    "Computer Vision",
    "Robotics",
    "Bioinformatics and Computational Biology",
    "Human-Computer Interaction",
    "Visualization",
    "Design Automation of Microelectronic Systems (EDA)",
    "Web Information Retrieval",
    "Natural Language Processing",
]
sorted_colors = [
    rgb.tue_darkgreen,
    rgb.pn_orange,
    rgb.tue_darkblue,
    rgb.tue_lightblue,
    rgb.tue_brown,
    rgb.tue_blue,
    rgb.tue_green,
    rgb.tue_dark,
    rgb.tue_red,
]

In [5]:
# Determine the dominant research field of every professor: count the
# professor's successfully classified publications per field and keep the
# field with the highest count.
professors_research_fields = {}
for professor in professors:
    is_own_publication = successfully_classified_titles_df['Professor'] == professor
    professor_df = successfully_classified_titles_df[is_own_publication]

    # Start every known field at zero so that max() below always has the
    # full set of candidates to choose from.
    professor_research_fields = dict.fromkeys(research_fields, 0)
    for classified_field in professor_df['Research Field']:
        professor_research_fields[classified_field] += 1

    # max() returns the first maximal key, so ties (including a professor
    # without any classified publication, where all counts are zero) resolve
    # to the earliest entry of `research_fields`.
    professors_research_fields[professor] = max(
        professor_research_fields, key=professor_research_fields.get
    )

# Translate every professor's field into its index within `research_fields`.
professor_grouping = {
    name: research_fields.index(field)
    for name, field in professors_research_fields.items()
}


# Node id of every professor as used in all three output files: "id" followed
# by the professor's position in `professors`. Built once here so that each
# lookup is a dict access instead of a linear np.where scan over the array.
professor_ids = {prof: "id" + str(position) for position, prof in enumerate(professors)}

# Generate the node file: one row per professor with the dominant research
# field, the node id and the name (with "ö" transliterated to "oe").
with open(nodes_file, mode='w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=["Research Field", "id", "name"])
    writer.writeheader()
    for prof in professors:
        writer.writerow({
            "Research Field": professors_research_fields[prof],
            "id": professor_ids[prof],
            "name": prof.replace("ö", "oe"),
        })

print("CSV file " + nodes_file + " created successfully.")

# Count, for every professor, how often each *other professor* appears in the
# semicolon-separated coauthor column of that professor's publication rows.
# Column 0 holds the professor the row belongs to, column 4 the coauthors.
professors_coauthors = {prof: {} for prof in professors}

for prof, coauthor_field in zip(df.iloc[:, 0], df.iloc[:, 4]):
    cooperation_counts = professors_coauthors[prof]
    for coauthor in coauthor_field.split(';'):
        # Skip the professor themself and every coauthor who is not a professor.
        if coauthor != prof and coauthor in professor_ids:
            cooperation_counts[coauthor] = cooperation_counts.get(coauthor, 0) + 1

# Generate the edges file. Rows are directed: one row per (professor,
# coauthor) pair found in that professor's own publication rows.
with open(edges_file, mode='w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=["id1", "id2", "Paper Cooperations"])
    writer.writeheader()
    for prof, cooperation_counts in professors_coauthors.items():
        for coauthor, cooperation_count in cooperation_counts.items():
            writer.writerow({
                "id1": professor_ids[prof],
                "id2": professor_ids[coauthor],
                "Paper Cooperations": cooperation_count,
            })

print("CSV file " + edges_file + " created successfully.")

# Generate the group file: one line per research field listing the quoted node
# ids of its professors. The `with` block guarantees the file is flushed and
# closed before the success message is printed.
with open(groups_file, mode="wt") as groups_out:
    groups_out.write("id;nodes;padding\n")
    for group_index, field in enumerate(research_fields):
        group_list = [
            '"' + professor_ids[prof] + '"'
            for prof in professors
            if professors_research_fields[prof] == field
        ]
        # Only fields with at least two professors are written as a group.
        if len(group_list) > 1:
            groups_out.write("g" + str(group_index) + ";[" + ",".join(group_list) + "];8\n")

print("txt file " + groups_file + " created successfully.")

CSV file ./../dat/professor_nodes.csv created successfully.
CSV file ./../dat/professor_edges.csv created successfully.
txt file ./../dat/professor_groups.txt created successfully.
