# Data preprocessing (by Elton)

## Obtain all coauthor details with organization location

In [1]:
import numpy as np
import pandas as pd
from scholarly import scholarly
import json
import csv
from selenium import webdriver
import time
from tqdm import tqdm

In [2]:
# Load the source DataFrame; its 'coauthors' column is read below to collect scholar IDs.
# NOTE(review): unpickling can execute arbitrary code — only load a trusted cse_gs.pkl.
df = pd.read_pickle("./cse_gs.pkl")

In [3]:
# Gather the scholar_id of every coauthor listed in the first 51 rows of df.
# NOTE(review): 51 is hard-coded — presumably the number of rows in df; confirm.
coauthor_list = [
    coauthor['scholar_id']
    for row_idx in range(51)
    for coauthor in df.iloc[row_idx]['coauthors']
]

In [4]:
# De-duplicate the collected scholar IDs. Sorting makes the order reproducible:
# a bare set of strings iterates in a different order from one kernel session
# to the next (string hashing is randomized), which would reshuffle any re-scrape.
# Assumes every scholar_id is a string — TODO confirm.
coauthor_list = sorted(set(coauthor_list))
len(coauthor_list)

1263

In [5]:
# Start with an empty list; the next cell fills it (by scraping or from the JSON backup).
coauthor_list_details = []

In [19]:
# Select either one.
# The web scraping takes more than 3 hours because requests are throttled to avoid being blocked by Google.

# for i, author_id in enumerate(tqdm(coauthor_list)):
#     coauthor_list_details.append(scholarly.fill(scholarly.search_author_id(author_id), sections=['basics']))
#     time.sleep(7) 
#     if i % 2 == 0:
#         with open('./backup_data/all_coauthor_details.json', 'w') as f:
#             json.dump(coauthor_list_details, f, indent=4)

# Load the previously scraped coauthor details from the JSON backup.
# The context manager closes the file handle as soon as parsing is done.
with open('./backup_data/all_coauthor_details.json') as f:
    coauthor_list_details = json.load(f)

In [20]:
# Stringify every field value of every coauthor record (keys are kept as-is).
copy_coauthor_list_details = [
    {field: str(value) for field, value in record.items()}
    for record in coauthor_list_details
]

In [21]:
# One row per coauthor record; a key absent from a record is left missing in that row.
coauthor_detail = pd.DataFrame(copy_coauthor_list_details)

In [22]:
# Preview the first rows of the coauthor table.
coauthor_detail.head()

Unnamed: 0,container_type,filled,scholar_id,source,name,url_picture,affiliation,interests,email_domain,homepage,citedby,organization
0,Author,['basics'],9hwXx34AAAAJ,AUTHOR_PROFILE_PAGE,Ahmed E. Hassan,https://scholar.googleusercontent.com/citation...,"Mustafa Prize Laureate, ACM/IEEE/NSERC Steacie...","['Mining Software Repositories', 'Software Ana...",@cs.queensu.ca,http://sail.cs.queensu.ca/,32995,
1,Author,['basics'],DnnCWN0AAAAJ,AUTHOR_PROFILE_PAGE,Christopher Ré,https://scholar.googleusercontent.com/citation...,"Computer Science, Stanford University","['machine learning', 'machine learning systems...",@cs.stanford.edu,http://cs.stanford.edu/people/chrismre/,30244,8.539678734835078e+18
2,Author,['basics'],BYm7qHAAAAAJ,AUTHOR_PROFILE_PAGE,Jaechang Nam,https://scholar.googleusercontent.com/citation...,Handong Global University,"['Software Quality', 'Software Defect Predicti...",@handong.edu,http://lifove.github.io/,2922,
3,Author,['basics'],YG0DFyYAAAAJ,AUTHOR_PROFILE_PAGE,Hanwang Zhang (张含望）,https://scholar.googleusercontent.com/citation...,"Associate Professor, SCSE, Nanyang Technologic...","['Computer Vision', 'Causality', 'Machine Lear...",@ntu.edu.sg,https://mreallab.github.io/index.html,23357,3.012140508424118e+18
4,Author,['basics'],SKVnHakAAAAJ,AUTHOR_PROFILE_PAGE,H. V. Jagadish,,University of Michigan,[],@umich.edu,,45821,4.770128543809688e+18


## Find unique Org ID

In [23]:
# Collect the organization ID of every coauthor record that has one
# (duplicates are still present at this point).
unique_organizationID = []
for details in coauthor_list_details:
    if 'organization' in details:
        unique_organizationID.append(details['organization'])

In [24]:
# Drop duplicate organization IDs.
unique_organizationID = [*set(unique_organizationID)]

In [25]:
# Number of distinct organization IDs.
len(unique_organizationID)

299

In [27]:
# Pick either one

# scrap_org_id_name=[]
# driver = webdriver.Chrome()
# for i, id in enumerate(tqdm(unique_organizationID)):
#     print("Opening a page")
#     driver.get(f'https://scholar.google.com.hk/citations?view_op=view_org&hl=en&org={id}')

#     time.sleep(5) # Prevent blocking
#     org_name = driver.find_element_by_class_name('gsc_authors_header')
    
#     scrap_org_id_name.append({"id": id, "name": org_name.text[:-11]})
#     print(org_name.text[:-11])

#     if i % 2 == 0:
#         with open('./backup_data/scrap_org_map.json', 'w') as f:
#             json.dump(scrap_org_id_name, f, indent=4)
# driver.quit()

# Load the previously scraped organization id -> name records from the JSON backup.
# The context manager closes the file handle as soon as parsing is done.
with open('./backup_data/scrap_org_map.json') as f:
    scrap_org_id_name = json.load(f)

In [29]:
# Institution-to-country table (CSV taken from the 4462 lab material).
# Its 'Institution' and 'Country' columns are used below to map organizations to countries.
uni_country_df = pd.read_csv('./university_countries.csv')

In [30]:
# Build the Institution -> Country lookup once instead of rescanning the whole
# column for every organization. setdefault keeps the first row for a repeated
# institution, so the first match still wins.
country_by_institution = {}
for institution, country in zip(uni_country_df['Institution'], uni_country_df['Country']):
    country_by_institution.setdefault(institution, country)

# Tag each scraped organization in place: 'map' records whether its name matched
# an institution exactly, 'country' holds the matched country ("" when unmatched).
for org in scrap_org_id_name:
    institution_name = org['name']
    org['map'] = institution_name in country_by_institution
    org['country'] = country_by_institution.get(institution_name, "")

In [32]:
# 'map' == False means the organization name was not found in university_countries.csv;
# the country of such entries has to be filled in manually.
scrap_org_id_name[0]

{'id': 6818723143689148416,
 'name': 'Philipps-Universität Marburg',
 'map': False,
 'country': ''}

In [41]:
# Save the organization records (including the unmapped ones) to mapped_org.json.
with open('./backup_data/mapped_org.json', 'w') as fh:
    fh.write(json.dumps(scrap_org_id_name, indent=4))

In [43]:
# The countries missing from the automatic mapping were filled in by hand and
# saved as 'mapped_org_complete.json'.
# Work on a copy so that adding columns later does not silently modify coauthor_detail.
all_coauthor_df = coauthor_detail.copy()
all_org_df = pd.read_json('./backup_data/mapped_org_complete.json')

In [73]:
def _lookup_org_field(x, column):
    """Return `column` of the all_org_df row whose 'id' equals int(x), or NaN.

    NaN is returned when x is missing (pd.isna) or when no row has that id.
    If several rows share the id, the first one is used.
    """
    if pd.isna(x):
        return np.nan
    # One boolean mask replaces the separate membership test and row selection.
    matches = all_org_df.loc[all_org_df['id'] == int(x), column]
    if matches.empty:
        return np.nan
    return matches.values[0]


def get_org_name(x):
    """Map an organization ID to its 'name' in all_org_df (NaN if missing or unknown)."""
    return _lookup_org_field(x, 'name')


def get_org_country(x):
    """Map an organization ID to its 'country' in all_org_df (NaN if missing or unknown)."""
    return _lookup_org_field(x, 'country')

In [74]:
# Derive the organization name and country columns from each row's organization ID.
for column, lookup in (('org_name', get_org_name), ('org_country', get_org_country)):
    all_coauthor_df[column] = all_coauthor_df['organization'].apply(lookup)

In [77]:
# Spot-check the new org_name / org_country columns on the first three rows.
all_coauthor_df.head(3)

Unnamed: 0,container_type,filled,scholar_id,source,name,url_picture,affiliation,interests,email_domain,homepage,citedby,organization,org_name,org_country
0,Author,['basics'],9hwXx34AAAAJ,AUTHOR_PROFILE_PAGE,Ahmed E. Hassan,https://scholar.googleusercontent.com/citation...,"Mustafa Prize Laureate, ACM/IEEE/NSERC Steacie...","['Mining Software Repositories', 'Software Ana...",@cs.queensu.ca,http://sail.cs.queensu.ca/,32995,,,
1,Author,['basics'],DnnCWN0AAAAJ,AUTHOR_PROFILE_PAGE,Christopher Ré,https://scholar.googleusercontent.com/citation...,"Computer Science, Stanford University","['machine learning', 'machine learning systems...",@cs.stanford.edu,http://cs.stanford.edu/people/chrismre/,30244,8.539678734835078e+18,Stanford University,USA
2,Author,['basics'],BYm7qHAAAAAJ,AUTHOR_PROFILE_PAGE,Jaechang Nam,https://scholar.googleusercontent.com/citation...,Handong Global University,"['Software Quality', 'Software Defect Predicti...",@handong.edu,http://lifove.github.io/,2922,,,


In [78]:
# Export the enriched coauthor table to CSV in the current working directory.
# NOTE(review): to_csv writes the row index as an unnamed first column by default — confirm this is wanted.
all_coauthor_df.to_csv('all_coauthor_details.csv')

Note: the author_interest.json used for the chord data was scraped and assembled manually.