# Import needed libraries

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from functools import reduce
from datetime import datetime
import datetime as dt
import ipywidgets as wg
from IPython.display import display
import networkx as nx


# Load the data files and take care of unwanted rows

In [None]:
# Committee records: one row per (student, committee member).
students_all = pd.read_excel("./ORIGIN-GRADUATE_SCHOOL/dissertation_committees_2012-2017.xlsx")
# Keep every row whose role is neither committee chair nor co-chair.
# Rows with a missing role are kept as well, exactly as with the `!=` comparisons.
chair_roles = ["Dissertation Committee Chair", "Dissertation Committee CoChair"]
students = students_all[~students_all['Advisor Role'].isin(chair_roles)]

# Faculty records, restricted to rows whose APPOINTMENT_TYPE is "P".
faculties = pd.read_excel("./ORIGIN-SCHOLARSATDUKE/ScholarsAtDuke_Faculty_October2017.xlsx")
is_type_p = faculties['APPOINTMENT_TYPE'] == "P"
faculty = faculties[is_type_p]

# Build a new data frame using students and faculty data frames that shows school name of each committee member

In [None]:
# Faculty side: the ID used as the join key plus the two affiliation columns.
df1 = faculty[['DUID','ORG_DISPLAY_NAME','SCHOOL_NAME']]
# Student side: the committee columns, including the advisor's ID used as the join key.
df2 = students[['student random ID','Degree Nbr','Degree','Confer Dt','Compl Term Descr','Advisor Role','Advisor','Acad Org','Advisor Duke UID']]
# Left join keeps every committee row; advisors with no faculty match get NaN
# in ORG_DISPLAY_NAME / SCHOOL_NAME.
df3 = df2.merge(df1, left_on='Advisor Duke UID', right_on='DUID', how='left')
# drop=True discards the old row labels instead of inserting them as a stray
# 'index' column; the result is indexed 0..n-1.
df_final = df3.drop_duplicates().reset_index(drop=True)

# Extract the names of organizations and departments

In [8]:
# Distinct advisor organizations (NaN included), printed in full; the cell value is their count.
org_names = df_final['ORG_DISPLAY_NAME'].unique()
print(org_names)
len(org_names)

['Art, Art History & Visual Studies' 'Cultural Anthropology' 'Literature'
 nan 'History' 'School of Law' 'Romance Studies' 'English'
 'Religious Studies' 'Classical Studies' 'Economics'
 'Evolutionary Anthropology' 'Biology' 'Fuqua School of Business'
 'Biochemistry' 'Medicine, Infectious Diseases' 'Cell Biology'
 'Neurobiology' 'Molecular Genetics and Microbiology' 'Chemistry'
 'Radiology' 'Medicine, Endocrinology, Metabolism, and Nutrition'
 'Statistical Science' 'Computer Science' 'Pathology'
 'Medicine, Cardiology' 'Pharmacology & Cancer Biology'
 'Biostatistics & Bioinformatics'
 'Mechanical Engineering and Materials Science'
 'Radiology, Cardiothoracic Imaging' 'Surgery, Surgical Sciences'
 'Philosophy' 'Environmental Sciences and Policy'
 'Pediatrics, Medical Genetics' 'Mathematics' 'Pediatrics, Neonatology'
 'Physics' 'Marine Science and Conservation' 'Dermatology'
 'Biomedical Engineering' 'Neurosurgery' 'Radiation Oncology'
 'Surgery, Cardiovascular and Thoracic Surgery'
 'So

126

# Add the school names to each student's row

In [None]:
# Rows where the committee member is the student's dissertation advisor.
is_advisor = df3['Advisor Role'] == "Dissertation Advisor"
df4 = df3[is_advisor]
# Relabel the advisor's org/school columns as the student's department/school.
df5 = df4[['student random ID','ORG_DISPLAY_NAME','SCHOOL_NAME']].rename(
    columns={'ORG_DISPLAY_NAME': 'STU_DEPT', 'SCHOOL_NAME': 'STU_SCHOOL'}
)

# Separate the info by semester (three per year) to create dynamic data

In [10]:
# Distinct labels, in order of first appearance, used as axes of the count matrices.
depts = pd.Series(df_final['Acad Org'].unique())            # student side
orgs = pd.Series(df_final['ORG_DISPLAY_NAME'].unique())     # advisor side
schools = pd.Series(df_final['SCHOOL_NAME'].unique())       # advisor side


def f1(timeind):
    """Show the department-by-organization heat map for one semester.

    Parameters
    ----------
    timeind : int
        Position along the third (time) axis of the module-level
        ``graph_big`` array.
    """
    # ``graph_big[:][:][timeind]`` is just ``graph_big[timeind]`` (each bare
    # ``[:]`` returns the whole array), which slices the first axis instead of
    # the time axis. Tuple indexing selects the intended time slice.
    snapshot = graph_big[:, :, timeind]
    plt.imshow(np.transpose(snapshot), cmap='hot', interpolation='nearest')
    plt.show()

def f2(timeind):
    """Show the department-by-school heat map for one semester.

    Parameters
    ----------
    timeind : int
        Position along the third (time) axis of the module-level
        ``graph_small`` array.
    """
    # ``graph_small[:][:][timeind]`` is just ``graph_small[timeind]`` (each bare
    # ``[:]`` returns the whole array), which slices the first axis instead of
    # the time axis. Tuple indexing selects the intended time slice.
    snapshot = graph_small[:, :, timeind]
    plt.imshow(np.transpose(snapshot), cmap='hot', interpolation='nearest')
    plt.show()
    

# Time axis: three conferral months per year (the keys of semdict) times six
# years; the loop below counts years from 2012.
time_length = 3*6
org_length = len(orgs)
dept_length = len(depts)
school_length = len(schools)
# Count arrays indexed as [student department, advisor org or school, semester].
graph_big = np.zeros((dept_length,org_length,time_length))
graph_small = np.zeros((dept_length,school_length,time_length))

# dN maps matrix position -> label; pN is the inverse (label -> position).
d1 = dict(depts.items())
p1 = {val:key for (key, val) in d1.items()}
d2 = dict(orgs.items())
p2 = {val:key for (key, val) in d2.items()}
d3 = dict(schools.items())
p3 = {val:key for (key, val) in d3.items()}
# Conferral month -> slot within the year. Any other month raises KeyError.
semdict={5:0,9:1,12:2}

# Walk the four columns in lockstep instead of doing a chained label lookup
# (df_final[col][i]) per cell: one pass, and no dependence on the index being 0..n-1.
# NOTE(review): a conferral year before 2012 gives a negative semester, which
# NumPy silently wraps to the end of the time axis -- confirm the data range.
for dept, org, school, confer_dt in zip(df_final['Acad Org'],
                                        df_final['ORG_DISPLAY_NAME'],
                                        df_final['SCHOOL_NAME'],
                                        df_final['Confer Dt']):
    semester = (confer_dt.year-2012)*3 + semdict[confer_dt.month]
    dept_pos = p1[dept]
    graph_big[dept_pos, p2[org], semester] += 1
    graph_small[dept_pos, p3[school], semester] += 1
        

# Plots

In [None]:
# One slider per heat map. The upper bound comes from the length of each
# array's time axis, so it cannot drift out of sync with the array shape.
slider_big = wg.IntSlider(value=0, min=0, max=graph_big.shape[2] - 1, description='Semester')
wg.interact(f1, timeind=slider_big)

slider_small = wg.IntSlider(value=0, min=0, max=graph_small.shape[2] - 1, description='Semester')
wg.interact(f2, timeind=slider_small)

# Prepare the input files for Gephi graph visualization

In [13]:
# Export the matrices at time index 10: rows are student departments,
# columns are advisor schools (small) or advisor organizations (big).
# Columns whose label is NaN are dropped before writing.
small_slice = graph_small[:, :, 10]
dfsmall = pd.DataFrame(small_slice, index=depts, columns=schools)
dfsmall = dfsmall.loc[:, dfsmall.columns.notna()]
dfsmall.to_csv('smallgraph.csv', sep=';')

big_slice = graph_big[:, :, 10]
dfbig = pd.DataFrame(big_slice, index=depts, columns=orgs)
dfbig = dfbig.loc[:, dfbig.columns.notna()]
dfbig.to_csv('biggraph.csv', sep=';')

In [81]:
#define the matrix and dictionaries, let the code change it to an edge table for Gephi
m = graph_small

# dictionaries: dN maps matrix position -> label, pN maps label -> position
d1 = dict(depts.items())
p1 = {val:key for (key, val) in d1.items()}
d2 = dict(orgs.items())
p2 = {val:key for (key, val) in d2.items()}
d3 = dict(schools.items())
p3 = {val:key for (key, val) in d3.items()}
semdict={5:0,9:1,12:2}
psem = {val:key for (key, val) in semdict.items()}

# creating the list of edges: one row per non-zero cell of m
# Source is the department label (axis 0), Target the school label (axis 1).
# Interval is the raw time index k; psem[k % 3] and k // 3 + 2012 give the
# month and year if a calendar date is needed instead.
columns = ['Source','Target','Interval','Weight']
# np.argwhere returns the non-zero positions in row-major order, the same
# (i, j, k) order as three nested loops. Building the frame once from a list
# avoids the quadratic cost of appending rows with .loc.
records = [
    (d1[int(i)], d3[int(j)], int(k), m[i, j, k])
    for i, j, k in np.argwhere(m != 0)
]
edgelist = pd.DataFrame(records, columns=columns)

edgelist = edgelist.dropna(axis=0, how='any') # dropping rows with any NaN element
edgelist.to_csv('edgelist.csv', sep=';', index=False)