# Data preprocessing (by Kasey)

Please refer to `gs-vis-project/src/data/` for the final data output, as several group members made minor changes during preprocessing.

In this notebook, the following preprocessing steps are performed:
- Fetch information from Google Scholar for each CSE regular faculty member (manually)
- Intra-department coauthorship network data preprocessing
- Matching of CSE official website information with Google Scholar information

In [44]:
from scholarly import scholarly
import json
import pandas as pd

# Fetching information of each HKUST CSE Regular Faculty member

Because Google blocks frequent requests made through the `scholarly` API, this process is done manually.

In [18]:
# Accumulator for the raw Google Scholar author records; one record is
# appended per manual fetch.
profs: list = []

In [131]:
# Look up a single author by Google Scholar ID, then ask scholarly to fill the
# record; an empty `sections` list requests every available section.
search_query = scholarly.search_author_id('y6m820wAAAAJ')
result = scholarly.fill(search_query, sections=[])

# Pretty-print the filled record so it can be checked by eye.
print(json.dumps(result, sort_keys=True, indent=4))

{
    "affiliation": "Hong Kong University of Science and Technology",
    "citedby": 21770,
    "citedby5y": 13315,
    "cites_per_year": {
        "2004": 63,
        "2005": 79,
        "2006": 102,
        "2007": 121,
        "2008": 214,
        "2009": 276,
        "2010": 354,
        "2011": 501,
        "2012": 601,
        "2013": 809,
        "2014": 823,
        "2015": 1108,
        "2016": 1416,
        "2017": 1511,
        "2018": 1905,
        "2019": 1986,
        "2020": 2230,
        "2021": 2284,
        "2022": 2381,
        "2023": 2488
    },
    "coauthors": [
        {
            "affiliation": "Professor of Computer Science, University of Electronic Science and Technology of China",
            "container_type": "Author",
            "filled": [],
            "name": "Kai Zheng \u90d1\u51ef",
            "scholar_id": "EM-l50cAAAAJ",
            "source": "CO_AUTHORS_LIST"
        },
        {
            "affiliation": "ARC Future Fellow and Professor, Uni

In [132]:
# Store the record held in `result`. If an entry with the same scholar_id is
# already in `profs` (e.g. this cell was run twice for one author), overwrite
# it in place instead of appending a duplicate row.
_fetched_id = result.get('scholar_id')
_existing_pos = next(
    (pos for pos, prof in enumerate(profs) if prof.get('scholar_id') == _fetched_id),
    None,
)
if _existing_pos is None:
    profs.append(result)
else:
    profs[_existing_pos] = result

# Show the last two stored records as a quick sanity check.
pd.DataFrame.from_dict(profs, orient='columns').tail(2)

Unnamed: 0,container_type,filled,scholar_id,source,name,affiliation,interests,email_domain,citedby,citedby5y,...,hindex5y,i10index,i10index5y,cites_per_year,coauthors,publications,public_access,url_picture,organization,homepage
49,Author,"[basics, indices, counts, coauthors, publicati...",XTlaQMkAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Qian Zhang,"Tencent Professor of Engineering, Chair Prof. ...","[Wireless Networking, IoT, Smart Healthcare, C...",@cse.ust.hk,29921,8329,...,42,432,201,"{2002: 94, 2003: 170, 2004: 330, 2005: 495, 20...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 95, 'not_available': 50}",https://scholar.googleusercontent.com/citation...,9.568811e+18,http://www.cse.ust.hk/~qianzh
50,Author,"[basics, indices, counts, coauthors, publicati...",y6m820wAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Xiaofang Zhou,Hong Kong University of Science and Technology,"[databases, big data, data science]",@cse.ust.hk,21770,13315,...,55,302,209,"{2004: 63, 2005: 79, 2006: 102, 2007: 121, 200...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 137, 'not_available': 68}",https://scholar.googleusercontent.com/citation...,9.568811e+18,https://cse.hkust.edu.hk/~zxf


In [134]:
# Turn the list of fetched author records into a single DataFrame:
# one row per record, one column per key of the record.
gs_cse_df = pd.DataFrame(profs)

In [282]:
# Faculty roster; later cells read its `gs_id`, `name` and `rank` columns.
cse_prof_df = pd.read_csv(filepath_or_buffer="cse_prof_id.csv")

In [149]:
# Attach the roster's `rank` to each Google Scholar record. A left join keeps
# every Scholar record even when its scholar_id has no roster match.
rank_lookup = cse_prof_df[['gs_id', 'rank']]
df_merged = gs_cse_df.merge(
    rank_lookup,
    how='left',
    left_on='scholar_id',
    right_on='gs_id',
)
df_merged

Unnamed: 0,container_type,filled,scholar_id,source,name,affiliation,interests,email_domain,citedby,citedby5y,...,i10index5y,cites_per_year,coauthors,publications,public_access,url_picture,organization,homepage,gs_id,rank
0,Author,"[basics, indices, counts, coauthors, publicati...",Azu2w_MAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Sunil Arya,"Associate Professor, Computer Science and Engi...",[Computational Geometry],@cse.ust.hk,7286,1601,...,29,"{1995: 20, 1996: 23, 1997: 44, 1998: 72, 1999:...",[],"[{'container_type': 'Publication', 'source': '...","{'available': 8, 'not_available': 1}",,,,Azu2w_MAAAAJ,Associate Professor
1,Author,"[basics, indices, counts, coauthors, publicati...",aQblKVwAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Brahim Bensaou,Associate Professor of Computer Science and En...,"[Computer Networking, Wireless Networks]",@cse.ust.hk,4638,939,...,31,"{1997: 15, 1998: 16, 1999: 16, 2000: 38, 2001:...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 7, 'not_available': 6}",https://scholar.googleusercontent.com/citation...,,,aQblKVwAAAAJ,Associate Professor
2,Author,"[basics, indices, counts, coauthors, publicati...",uiCSOycAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,S.-H. Gary Chan,"Professor, Department of Computer Science and ...","[Smart sensing and IoT systems, Video/user/dat...",@cse.ust.hk,7225,2835,...,55,"{1999: 31, 2000: 15, 2001: 44, 2002: 78, 2003:...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 38, 'not_available': 0}",https://scholar.googleusercontent.com/citation...,9.568811e+18,http://home.cse.ust.hk/~gchan/,uiCSOycAAAAJ,Professor
3,Author,"[basics, indices, counts, coauthors, publicati...",Z_t5DjwAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Hao Chen,"Assistant Professor, The Hong Kong University ...","[Trustworthy AI, Medical Image Analysis, Multi...",@cse.ust.hk,22097,21297,...,98,"{2016: 125, 2017: 554, 2018: 1243, 2019: 2543,...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 68, 'not_available': 9}",https://scholar.googleusercontent.com/citation...,9.568811e+18,https://cse.hkust.edu.hk/~jhc/,Z_t5DjwAAAAJ,Assistant Professor
4,Author,"[basics, indices, counts, coauthors, publicati...",tnRV5QYAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Kai Chen,"Professor, HKUST","[Data Center Networking, Machine Learning Syst...",@cse.ust.hk,5673,4395,...,78,"{2010: 17, 2011: 43, 2012: 72, 2013: 92, 2014:...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 68, 'not_available': 1}",https://scholar.googleusercontent.com/citation...,9.568811e+18,http://www.cse.ust.hk/~kaichen/,tnRV5QYAAAAJ,Professor
5,Author,"[basics, indices, counts, coauthors, publicati...",gtglwgYAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Lei Chen,Hong Kong University of Science and Technology,"[Human Powered Machine Learning, Databases, Da...",@cse.ust.hk,24780,14477,...,276,"{2006: 112, 2007: 142, 2008: 234, 2009: 426, 2...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 209, 'not_available': 127}",https://scholar.googleusercontent.com/citation...,9.568811e+18,http://www.cse.ust.hk/~leichen,gtglwgYAAAAJ,Chair Professor
6,Author,"[basics, indices, counts, coauthors, publicati...",-gtmMpIAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Long Chen（陈隆）,Hong Kong University of Science and Technology,"[Computer Vision, Deep Learning, Multimedia, N...",@ust.hk,4328,4291,...,33,"{2017: 22, 2018: 132, 2019: 338, 2020: 503, 20...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 32, 'not_available': 0}",https://scholar.googleusercontent.com/citation...,9.568811e+18,https://zjuchenlong.github.io/,-gtmMpIAAAAJ,Assistant Professor
7,Author,"[basics, indices, counts, coauthors, publicati...",lLMX9hcAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Qifeng Chen,HKUST,"[Computational Photography, Image Synthesis, L...",@ust.hk,8328,7853,...,73,"{2013: 25, 2014: 43, 2015: 81, 2016: 99, 2017:...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 6, 'not_available': 0}",https://scholar.googleusercontent.com/citation...,9.568811e+18,https://cqf.io/,lLMX9hcAAAAJ,Assistant Professor
8,Author,"[basics, indices, counts, coauthors, publicati...",_LkC1yoAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Minhao Cheng,"Assistant Professor, The Hong Kong University ...","[Machine Learning, Deep Learning, Optimization...",@ust.hk,2024,2019,...,18,"{2018: 22, 2019: 148, 2020: 297, 2021: 447, 20...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 11, 'not_available': 1}",https://scholar.googleusercontent.com/citation...,9.568811e+18,https://cse.hkust.edu.hk/~minhaocheng/,_LkC1yoAAAAJ,Assistant Professor
9,Author,"[basics, indices, counts, coauthors, publicati...",0nquEkQAAAAJ,AuthorSource.AUTHOR_PROFILE_PAGE,Siu-Wing Cheng,"Professor, Department of Computer Science and ...","[computational geometry, algorithm, data struc...",@cse.ust.hk,3870,1014,...,21,"{1991: 11, 1992: 18, 1993: 19, 1994: 30, 1995:...","[{'container_type': 'Author', 'filled': [], 's...","[{'container_type': 'Publication', 'source': '...","{'available': 25, 'not_available': 4}",https://scholar.googleusercontent.com/citation...,9.568811e+18,http://www.cse.ust.hk/faculty/scheng,0nquEkQAAAAJ,Professor


In [159]:
# Split a rank string on ' and ' so every string rank becomes a list of titles.
# Non-string values are passed through unchanged: that covers a missing rank
# (NaN from the left merge) and ranks already converted to lists by an earlier
# run of this cell, so the cell can be re-run without raising AttributeError.
df_merged['rank'] = df_merged['rank'].apply(
    lambda x: x.split(' and ') if isinstance(x, str) else x
)

In [None]:
# Persist the merged frame as a pickle so each group member can continue
# processing the data independently.
df_merged.to_pickle(path="cse_gs.pkl")

# Intra-department coauthor network

In [174]:
# Google Scholar IDs of all UST CSE regular faculty members; members without
# an ID are left out.
has_gs_id = cse_prof_df['gs_id'].notna()
cse_prof_ids = set(cse_prof_df.loc[has_gs_id, 'gs_id'])

In [None]:
# Empty edge table for the coauthor network: one row per author pair, holding
# the ID, name and rank of both authors.
coauthor_columns = [
    'author1_id', 'author2_id',
    'author1_name', 'author2_name',
    'author1_rank', 'author2_rank',
]
coauthor_df = pd.DataFrame(columns=coauthor_columns)

In [264]:
# Build the intra-department coauthor edge list.
#
# For every professor in df_merged, keep only the coauthors whose scholar_id is
# in cse_prof_ids, and record one row per (professor, coauthor) pair. Names and
# ranks come from the roster (cse_prof_df), not from Google Scholar. Within a
# row the author whose name sorts first is stored as author1, so the same pair
# found from either side yields an identical row.
#
# Rows are collected in a list and the DataFrame is built once at the end:
# this avoids growing the frame row by row, and re-running the cell rebuilds
# coauthor_df from scratch instead of appending to the previous result.

# Roster lookups keyed by Google Scholar ID; the first roster row wins if an
# ID appears more than once.
_roster = cse_prof_df.dropna(subset=['gs_id']).drop_duplicates(subset='gs_id')
_name_by_id = dict(zip(_roster['gs_id'], _roster['name']))
_rank_by_id = dict(zip(_roster['gs_id'], _roster['rank']))

_edge_rows = []
for author1_id, coauthors in zip(df_merged['scholar_id'], df_merged['coauthors']):
    for coauthor in coauthors:
        author2_id = coauthor.get('scholar_id')
        if author2_id not in cse_prof_ids:
            continue  # coauthor is not a CSE regular faculty member

        author1_name = _name_by_id[author1_id]
        author1_rank = _rank_by_id[author1_id]
        author2_name = _name_by_id[author2_id]
        author2_rank = _rank_by_id[author2_id]

        if author1_name < author2_name:
            _edge_rows.append((author1_id, author2_id, author1_name, author2_name, author1_rank, author2_rank))
        else:
            _edge_rows.append((author2_id, author1_id, author2_name, author1_name, author2_rank, author1_rank))

coauthor_df = pd.DataFrame(
    _edge_rows,
    columns=['author1_id', 'author2_id', 'author1_name', 'author2_name', 'author1_rank', 'author2_rank'],
)

In [274]:
# Collapse repeated pairs, order the edges by author names, and renumber the
# rows from zero.
coauthor_df = (
    coauthor_df
    .drop_duplicates()
    .sort_values(by=['author1_name', 'author2_name'])
    .reset_index(drop=True)
)
coauthor_df

Unnamed: 0,author1_id,author2_id,author1_name,author2_name,author1_rank,author2_rank
0,MVf7Lq0AAAAJ,nEsOOx8AAAAJ,Albert Chi Shing CHUNG,Dit Yan YEUNG,Professor,Chair Professor
1,SLgWQLEAAAAJ,YooPxQwAAAAJ,Bo LI,Mordecai Jay GOLIN,Chair Professor,Professor
2,SLgWQLEAAAAJ,XTlaQMkAAAAJ,Bo LI,Qian ZHANG,Chair Professor,Chair Professor
3,SLgWQLEAAAAJ,FeJrzPMAAAAJ,Bo LI,Wei WANG,Chair Professor,Associate Professor
4,EWfpM74AAAAJ,lLMX9hcAAAAJ,Chi Keung TANG,Qifeng CHEN,Professor,Assistant Professor
5,EWfpM74AAAAJ,0nquEkQAAAAJ,Chi Keung TANG,Siu Wing CHENG,Professor,Professor
6,DORzgBQAAAAJ,ZMLhZJ8AAAAJ,Chiew Lan TAI,Long QUAN,Professor,Professor
7,H9zO5eYAAAAJ,YooPxQwAAAAJ,Cunsheng DING,Mordecai Jay GOLIN,Professor,Professor
8,nEsOOx8AAAAJ,-oTraZ4AAAAJ,Dit Yan YEUNG,James Tin Yau KWOK,Chair Professor,Professor
9,nEsOOx8AAAAJ,18_xlPUAAAAJ,Dit Yan YEUNG,Nevin Lianwen ZHANG,Chair Professor,Professor


In [275]:
# Write the edge list to CSV without the row index.
coauthor_df.to_csv(path_or_buf='coauthor.csv', index=False)

## Export JSON files to facilitate visualisation of network graph in D3.js

In [435]:
# Links file: each edge is written as a {"source": ..., "target": ...} record
# holding the two authors' Google Scholar IDs.
link_columns = {"author1_id": "source", "author2_id": "target"}
links_df = coauthor_df[list(link_columns)].rename(columns=link_columns)
links_df.to_json("ust_coauthor_link.json", orient="records")

In [434]:
# Nodes file: every author that appears on either side of an edge, written as
# {"id", "label", "group"} records (Scholar ID, name, rank).
node_fields = {'id': 'id', 'name': 'label', 'rank': 'group'}


def _author_nodes(side):
    """Return the id/name/rank columns of one edge side, renamed to node fields."""
    renames = {f'author{side}_{field}': out for field, out in node_fields.items()}
    return coauthor_df[list(renames)].rename(columns=renames)


coauthor1_df = _author_nodes(1)
coauthor2_df = _author_nodes(2)

# Stack both sides and keep each distinct (id, label, group) row once.
stacked_nodes = pd.concat([coauthor1_df, coauthor2_df])
unique_ust_coauthor_df = stacked_nodes.drop_duplicates().reset_index(drop=True)

unique_ust_coauthor_df.to_json("unique_ust_coauthor.json", orient="records")

# Add pub_per_year column and export as JSON

In [519]:
# Reload the roster and take an independent copy of the merged frame, so the
# column added below does not touch df_merged.
# NOTE(review): cse_prof_id_df is not referenced by the cells visible here —
# confirm whether it is still needed.
cse_prof_id_df = pd.read_csv(filepath_or_buffer="cse_prof_id.csv")
df_merged2 = df_merged.copy(deep=True)

In [520]:
# Add an empty `pub_per_year` column as a placeholder to be filled per author.
df_merged2 = df_merged2.assign(pub_per_year=None)

In [521]:
def _pub_year_counts(publications):
    """Count one author's publications per year.

    Args:
        publications: list of publication dicts; the year is read from each
            entry's ``bib['pub_year']``.

    Returns:
        dict mapping each ``pub_year`` value to the number of publications
        with that year, most frequent year first. Entries without a
        ``pub_year`` are not counted; a missing or empty publication list
        gives ``{}``.
    """
    if not isinstance(publications, list):
        return {}
    years = pd.Series(
        [pub.get('bib', {}).get('pub_year') for pub in publications],
        dtype=object,
    )
    return dict(years.value_counts())


# Each cell holds a one-element list wrapping the counts dict, which is the
# shape this column has when exported to JSON. The column is built in one pass
# and aligned on df_merged2's own index, instead of reading row i by position
# and writing it back by label (which only agrees on a default 0..n-1 index).
df_merged2['pub_per_year'] = pd.Series(
    [[_pub_year_counts(pubs)] for pubs in df_merged2['publications']],
    index=df_merged2.index,
    dtype=object,
)

In [522]:
# Enrich the roster with the Google Scholar metrics. A left join keeps every
# roster row, including members without a Scholar profile; the duplicate join
# key from the Scholar side is dropped afterwards.
gs_metric_columns = [
    'scholar_id', 'citedby', 'citedby5y', 'hindex', 'hindex5y',
    'i10index', 'i10index5y', 'cites_per_year', 'url_picture', 'pub_per_year',
]
cse_prof_id_merge_gs_df = (
    cse_prof_df
    .merge(
        df_merged2[gs_metric_columns],
        how="left",
        left_on="gs_id",
        right_on="scholar_id",
    )
    .drop(columns="scholar_id")
)
cse_prof_id_merge_gs_df.head()

Unnamed: 0,name,rank,tel,email,gs_id,orcid,linked_arxiv,citedby,citedby5y,hindex,hindex5y,i10index,i10index5y,cites_per_year,url_picture,pub_per_year
0,Sunil ARYA,Associate Professor,(852) 2358 8769,arya@ust.hk,Azu2w_MAAAAJ,0000-0003-0939-4192,1,7286,1601,29,19,48,29,"{1995: 20, 1996: 23, 1997: 44, 1998: 72, 1999:...",,"[{'2019': 5, '2000': 4, '1998': 3, '2006': 3, ..."
1,Brahim BENSAOU,Associate Professor,(852) 2358 7014,csbb@ust.hk,aQblKVwAAAAJ,0000-0002-4473-3658,0,4638,939,31,17,78,31,"{1997: 15, 1998: 16, 1999: 16, 2000: 38, 2001:...",https://scholar.googleusercontent.com/citation...,"[{'2004': 15, '2000': 13, '2007': 13, '2008': ..."
2,Gary Shueng Han CHAN,Professor,(852) 2358 6990,gchan@ust.hk,uiCSOycAAAAJ,0000-0003-4207-764X,0,7225,2835,43,22,140,55,"{1999: 31, 2000: 15, 2001: 44, 2002: 78, 2003:...",https://scholar.googleusercontent.com/citation...,"[{'2012': 23, '2007': 23, '2006': 22, '2015': ..."
3,Hao CHEN,Assistant Professor,(852) 2358 8346,jhc@ust.hk,Z_t5DjwAAAAJ,0000-0002-8400-3780,1,22097,21297,63,62,98,98,"{2016: 125, 2017: 554, 2018: 1243, 2019: 2543,...",https://scholar.googleusercontent.com/citation...,"[{'2023': 43, '2019': 25, '2022': 21, '2017': ..."
4,Kai CHEN,Professor,(852) 2358 7028,kaichen@ust.hk,tnRV5QYAAAAJ,0000-0003-2587-6028,1,5673,4395,38,33,91,78,"{2010: 17, 2011: 43, 2012: 72, 2013: 92, 2014:...",https://scholar.googleusercontent.com/citation...,"[{'2022': 32, '2023': 23, '2020': 17, '2021': ..."


In [523]:
# Export the enriched roster as a JSON array with one object per faculty member.
cse_prof_id_merge_gs_df.to_json(path_or_buf="ust_cse_prof.json", orient="records")