In [None]:
# main libraries
import pandas as pd
import numpy as np
import re
import glob

#technical tools
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

import warnings
warnings.filterwarnings('ignore')

from collections import ChainMap

# ML libraries
from nltk.metrics import edit_distance
from sklearn.metrics import jaccard_score
from fuzzywuzzy import fuzz

<font size="3">Constructing unified dataset </font>

In [None]:
# please use your path
# NOTE: this location is machine-specific; point it at the folder that holds the *.json files.
path = r'C:\Users\altz7\Downloads\physics_conference'

# Build the pattern with "/" instead of "\*.json": "\*" is not a valid escape sequence
# (recent Python versions warn about it) and "/" is understood as a path separator by
# glob on Windows as well as on POSIX systems.
# glob returns files in arbitrary order, so sort to keep the row order reproducible.
all_files = sorted(glob.glob(path + "/*.json"))

# Fail early with a readable message instead of pd.concat's "No objects to concatenate".
if not all_files:
    raise FileNotFoundError("No .json files found in " + path)

# Read every file into its own frame, then stack all frames once.
data_list = [pd.read_json(filename) for filename in all_files]

main_data = pd.concat(data_list, axis=0, ignore_index=True)
del data_list

<font size="3">Unpack data from nested dictionary values </font>

In [None]:
# Google_scholar

# Level 1: expand each row's "google_scholar" value into one column per search result.
# Per the original author's notes the columns are named with integers (0, 1, 2, ...) and
# their number follows the row with the most names, so every row has room for its data.
google_scholar = main_data["google_scholar"].progress_apply(pd.Series)

google_scholar_list = []

for col in google_scholar.columns:
    # Level 2: expand the data of this search result into its own set of columns.
    google_scholar_element = google_scholar[col].progress_apply(pd.Series)

    # Keep only the string-named columns; columns named "0" carry no information
    # (author's note) and are dropped here.
    only_str_columns = google_scholar_element.columns.intersection(
        [label for label in google_scholar_element.columns if isinstance(label, str)]
    )

    # Tag each column with the search-result number so the per-name sets stay distinguishable.
    google_scholar_list.append(google_scholar_element[only_str_columns].add_suffix('_' + str(col)))

# google_scholar dataframe with all data
google_scholar = pd.concat(google_scholar_list, axis=1)
del google_scholar_list

In [None]:
# ResearchGate

def _merge_researchgate_parts(parts):
    """Fold the mappings stored in one "researchgate" cell into a single plain dict.

    ChainMap looks keys up in the first mapping first, so on a repeated key the
    earliest mapping wins.
    """
    return dict(ChainMap(*parts))


# one merged dict per row, kept on main_data as "researchgate_dict"
main_data["researchgate_dict"] = main_data["researchgate"].apply(_merge_researchgate_parts)

# spread the dict keys into columns
researchgate = main_data["researchgate_dict"].apply(pd.Series)

In [None]:
# LinkedIn

# collects one processed frame per search-result column (filled by the loop below)
linkedin_list = []

# expand each row's "linkedin" value into one column per search result
linkedin = main_data["linkedin"].progress_apply(pd.Series)

def extract_linkedin_affiliation(row):
    """Return the 'company_name' of the first "LinkedIn_Experience" entry of ``row``.

    :param row: mapping-like row whose "LinkedIn_Experience" value is a list of dicts
    :return: company name of the first entry, or "" when the list is empty
    """
    experience = row["LinkedIn_Experience"]
    if len(experience) == 0:
        return ""
    # only the first listed entry is used
    return experience[0]['company_name']

    
def extract_linkedin_interests(row):
    """Join every 'name_of_skill' found in the "LinkedIn_Skills" list of ``row``.

    Example input value: [{'name_of_skill': 'Research'}, {'name_of_skill': 'Experimental Physics'}]

    :param row: mapping-like row whose "LinkedIn_Skills" value is a list of dicts
    :return: skill names separated by ", ", or "" when the list is empty
    """
    skills = row["LinkedIn_Skills"]
    if len(skills) == 0:
        return ""
    return ", ".join([entry['name_of_skill'] for entry in skills])

for col in linkedin.columns:
    # expand the data of this search result into its own set of columns
    linkedin_element = linkedin[col].progress_apply(pd.Series)

    # remove columns that are named "0", no information in them
    only_str_columns = [label for label in linkedin_element.columns if isinstance(label, str)]
    linkedin_element = linkedin_element[linkedin_element.columns.intersection(only_str_columns)]

    # replace NaN values with an empty list so the extract_* helpers can call len() on every row
    for list_column in ('LinkedIn_Experience', 'LinkedIn_Skills'):
        missing = linkedin_element[list_column].isnull()
        linkedin_element.loc[missing, [list_column]] = linkedin_element.loc[missing, list_column].apply(lambda _: [])

    # affiliation comes from the "LinkedIn_Experience" column
    linkedin_element["LinkedIn_affiliation"] = linkedin_element.apply(extract_linkedin_affiliation, axis=1)

    # interests come from "LinkedIn_Skills", not "LinkedIn_Interests", because the latter
    # doesn't contain relevant data (author's note)
    linkedin_element["LinkedIn_interests"] = linkedin_element.apply(extract_linkedin_interests, axis=1)

    # tag each column with the search-result number so the per-name sets stay distinguishable
    linkedin_list.append(linkedin_element.add_suffix('_' + str(col)))

# linkedin dataframe with all data
linkedin = pd.concat(linkedin_list, axis=1)
del linkedin_list

In [None]:
# Orcid

# merge the mappings of each "orcid" cell into one dict per row
# (ChainMap: on a repeated key the earliest mapping wins), then spread keys into columns
main_data["orcid_dict"] = main_data["orcid"].apply(lambda parts: dict(ChainMap(*parts)))
orcid = main_data["orcid_dict"].apply(pd.Series)

# Extract Orcid_affiliation
# rows without employment data get an empty list so len() works on every row
employment_missing = orcid['Orcid_Employment'].isnull()
orcid.loc[employment_missing, ['Orcid_Employment']] = orcid.loc[employment_missing, 'Orcid_Employment'].apply(lambda _: [])
def extract_orcid_affiliation(row):
    """Return the 'affiliation_name' of the first "Orcid_Employment" entry of ``row``.

    :param row: mapping-like row whose "Orcid_Employment" value is a list of dicts
    :return: affiliation name of the first entry, or "" when the list is empty
    """
    employment = row["Orcid_Employment"]
    if len(employment) == 0:
        return ""
    # only the first listed entry is used
    return employment[0]['affiliation_name']

# one affiliation string per row (empty string when no employment is listed)
orcid["Orcid_affiliation"] = orcid.apply(extract_orcid_affiliation, axis=1)

# Transform Orcid_interests
# rows without keywords get an empty list so len() works on every row
keywords_missing = orcid['Orcid_Keywords'].isnull()
orcid.loc[keywords_missing, ['Orcid_Keywords']] = orcid.loc[keywords_missing, 'Orcid_Keywords'].apply(lambda _: [])

def extract_orcid_interests(row):
    """Join the "Orcid_Keywords" list of ``row`` into one comma-separated string.

    :param row: mapping-like row whose "Orcid_Keywords" value is a list of strings
    :return: keywords separated by ", ", or "" when the list is empty
    """
    keywords = row["Orcid_Keywords"]
    return ", ".join(keywords) if len(keywords) > 0 else ""

# one interests string per row (empty string when no keywords are listed)
orcid["Orcid_interests"] = orcid.apply(extract_orcid_interests, axis=1)

<font size="3">Combine search results data with main data</font>

In [None]:
# put the unpacked search results side by side with the original data (same row index)
main_data = pd.concat([main_data, researchgate, google_scholar, linkedin, orcid], axis=1)

#create "full_name" column for original data
main_data.insert(0, 'full_name', main_data["first_name"] + " " + main_data["last_name"])

# DataFrame.drop returns a new frame; without assigning the result back nothing is
# dropped and 'first_name' / 'last_name' would silently stay in main_data.
main_data = main_data.drop(columns=['first_name', 'last_name'])

#fill NaN values with empty string
main_data = main_data.fillna('')

#free memory space
del researchgate, google_scholar, linkedin, orcid

<font size="3">Preprocessing the names columns</font>

In [None]:
# name columns of the multi-result sources (one column per search result, e.g. "..._0", "..._1")
google_scholar_name_columns = [label for label in main_data.columns if "GoogleScholar_Name" in label]
linkedin_name_columns = [label for label in main_data.columns if "LinkedIn_Full_Name" in label]

# every column that holds a person name: original data first, then the single-result
# sources, then all Google Scholar and LinkedIn result columns
name_columns = [
    "full_name",
    "ResearchGate_Full_Name",
    "Orcid_Full_Name",
    *google_scholar_name_columns,
    *linkedin_name_columns,
]

def remove_chinese_characters(name):
    """Remove every character in the range U+4E00..U+9FA5 from ``name``.

    The previous implementation concatenated all such characters into one string and
    removed only that exact string, so characters that were not contiguous in the
    input (e.g. split by a Latin name) were left untouched.

    :param name: value of a name cell; non-string values are returned unchanged
    :return: ``name`` without characters from U+4E00..U+9FA5 (surrounding whitespace is kept)
    """
    if not isinstance(name, str):
        # nothing to clean on non-text values; avoids an AttributeError on numbers/None
        return name
    return re.sub("[\u4E00-\u9FA5]", "", name)

def preprocess_names(name):
    """Normalise a Series of names: title-case, strip titles/punctuation, trim whitespace.

    :param name: pandas Series of strings (one name column)
    :return: Series of cleaned names
    """
    name = name.str.title()

    # Tokens are removed in this order, as literal text (regex=False), so that "(" or
    # the dots in "Dr. " / "Ph.D." are never interpreted as regular-expression syntax,
    # whatever the pandas version's default for `regex` is.
    # " Phd" (not " PhD") is used because .str.title() above has already lower-cased
    # the trailing "D"; the original " PhD" pattern could never match.
    for token in ("(", ")", "Dr. ", " Phd", "Ph.D.", "Ph. D.", ","):
        name = name.str.replace(token, "", regex=False)

    return name.str.strip()

# clean every name column: remove_chinese_characters works on single strings (cell by cell),
# preprocess_names works on a whole column (Series)
for name_col in name_columns:
    main_data[name_col] = preprocess_names(main_data[name_col].apply(remove_chinese_characters))

<font size="3">Scoring system</font>

In [None]:
def fuzz_score_for_mult_columns(required_columns: list, new_column_name: str):
    """
    Adds one fuzzy name-match score column to main_data for every column in required_columns.

    Each score compares "full_name" with the given column using fuzz.partial_ratio.
    The new column is named new_column_name plus the numeric suffix of the source
    column, e.g. "LinkedIn_Full_Name_0" -> "score_name_origin_linkedin_0".

    :param1 required_columns: list of name columns (LinkedIn or GoogleScholar) to score
    :param2 new_column_name: general column name, like "score_name_origin_linkedin"

    """
    for col in required_columns:
        # Take everything from the last underscore on ("_0", "_12", ...). The previous
        # col[-2:] only worked for single-digit suffixes: with 10+ search results it
        # produced names like "...linkedin10" that the final-score cell cannot find.
        suffix = col[col.rfind("_"):]
        main_data[new_column_name + suffix] = [
            fuzz.partial_ratio(full_name, found_name)
            for full_name, found_name in zip(main_data["full_name"], main_data[col])
        ]
        
def jaccard_score_for_mult_columns(required_columns: list, new_column_name: str, original_data_column: str):
    """
    Adds one Jaccard similarity score column to main_data for every column in required_columns.

    Each score compares original_data_column with the given column using
    improved_jaccard_similarity. The new column is named new_column_name plus the
    numeric suffix of the source column, e.g. "LinkedIn_affiliation_0" ->
    "score_affiliation_origin_linkedin_0".

    :param1 required_columns: list of columns (LinkedIn or GoogleScholar) to score
    :param2 new_column_name: general column name, like "score_affiliation_origin_linkedin"
    :param3 original_data_column: column name to compare with from original data

    """
    for col in required_columns:
        # Take everything from the last underscore on ("_0", "_12", ...). The previous
        # col[-2:] only worked for single-digit suffixes: with 10+ search results it
        # produced names like "...linkedin10" that the final-score cell cannot find.
        suffix = col[col.rfind("_"):]
        main_data[new_column_name + suffix] = [
            improved_jaccard_similarity(original_value, found_value)
            for original_value, found_value in zip(main_data[original_data_column], main_data[col])
        ]

In [None]:
#Name similarity between original data and each of 4 search results

# ResearchGate and Orcid provide a single name column each, so they are scored directly
for score_column, found_name_column in [
    ("score_name_origin_researchgate", "ResearchGate_Full_Name"),
    ("score_name_origin_orcid", "Orcid_Full_Name"),
]:
    main_data[score_column] = main_data[["full_name", found_name_column]].apply(
        lambda pair: fuzz.partial_ratio(*pair), axis=1
    )

# LinkedIn and Google Scholar have one name column per search result
fuzz_score_for_mult_columns(linkedin_name_columns, "score_name_origin_linkedin")

fuzz_score_for_mult_columns(google_scholar_name_columns, "score_name_origin_googlescholar")

In [None]:
# Affiliation similarity between original data and each of 4 search results

"""
Jaccard similarity takes into account only the set of unique words for each text document. 
This makes it the likely candidate for assessing the similarity of documents when repetition is not an issue. 
A prime example of such an application is comparing product descriptions. 
For instance, if a term like “HD” or “thermal efficiency” is used multiple times in one description and just once in another, 
the Euclidean distance and cosine similarity would drop. On the other hand, 
if the total number of unique words stays the same, the Jaccard similarity will remain unchanged. 

"""

def jaccard_similarity(x, y):
    """ returns the jaccard similarity between two lists

    The score is |unique(x) & unique(y)| / |unique(x) | unique(y)|, a float in [0, 1].
    Returns 0.0 when either list is empty.
    """
    if len(x) == 0 or len(y) == 0:
        return 0.0
    unique_x, unique_y = set(x), set(y)
    return len(unique_x & unique_y) / float(len(unique_x | unique_y))


def improved_jaccard_similarity(x, y):
    """ returns the jaccard similarity between the lower-cased words of two strings

    Words are obtained with str.split() (no argument), which splits on any run of
    whitespace and ignores leading/trailing whitespace. The previous split(" ")
    produced empty-string "words" for double or trailing spaces, which lowered the
    score of otherwise identical texts (e.g. "MIT " vs "MIT" gave 0.5) and made two
    whitespace-only strings count as a perfect match.

    Returns 0.0 when either string is empty or contains only whitespace.
    """
    if len(x) == 0 or len(y) == 0:
        return 0.0
    words_x = x.lower().split()
    words_y = y.lower().split()
    # jaccard_similarity already returns 0.0 when a word list ends up empty
    return jaccard_similarity(words_x, words_y)

# get list of affiliation columns for LinkedIn and GScholar (one column per search result)
google_scholar_affiliation_columns = [label for label in main_data.columns if "GoogleScholar_Organization" in label]
linkedin_affiliation_columns = [label for label in main_data.columns if "LinkedIn_affiliation" in label]

# create score columns for affiliation: original 'affiliation' vs. every search result
jaccard_score_for_mult_columns(linkedin_affiliation_columns, "score_affiliation_origin_linkedin", 'affiliation')

jaccard_score_for_mult_columns(google_scholar_affiliation_columns, 'score_affiliation_origin_googlescholar', 'affiliation')

# ResearchGate and Orcid provide a single affiliation column each
for score_column, found_column in [
    ('score_affiliation_origin_researchgate', 'ResearchGate_Affiliation'),
    ('score_affiliation_origin_orcid', 'Orcid_affiliation'),
]:
    main_data[score_column] = main_data.apply(
        lambda record, found_column=found_column: improved_jaccard_similarity(record['affiliation'], record[found_column]),
        axis=1,
    )

In [None]:
# Affiliation similarity between 4 search results

# For LinkedIn and GScholar only the first search result ("..._0") is compared

# (score column, left affiliation column, right affiliation column)
pairwise_affiliation_scores = [
    ('score_affiliation_linkedin_googlescholar', 'LinkedIn_affiliation_0', 'GoogleScholar_Organization_0'),
    ('score_affiliation_linkedin_researchgate', 'LinkedIn_affiliation_0', 'ResearchGate_Affiliation'),
    ('score_affiliation_linkedin_orcid', 'LinkedIn_affiliation_0', 'Orcid_affiliation'),
    ('score_affiliation_researchgate_orcid', 'ResearchGate_Affiliation', 'Orcid_affiliation'),
    ('score_affiliation_researchgate_googlescholar', 'ResearchGate_Affiliation', 'GoogleScholar_Organization_0'),
    ('score_affiliation_orcid_googlescholar', 'Orcid_affiliation', 'GoogleScholar_Organization_0'),
]

for score_column, left_column, right_column in pairwise_affiliation_scores:
    main_data[score_column] = main_data.apply(
        lambda record, left_column=left_column, right_column=right_column:
            improved_jaccard_similarity(record[left_column], record[right_column]),
        axis=1,
    )


In [None]:
# Interests similarity between original data and each of 4 search results

# get list of interests columns for LinkedIn and GScholar (one column per search result)
google_scholar_interest_columns = [label for label in main_data.columns if "GoogleScholar_Interests" in label]
linkedin_interests_columns = [label for label in main_data.columns if "LinkedIn_interests" in label]

# create score columns for interests: original 'research_topics' vs. every search result
jaccard_score_for_mult_columns(linkedin_interests_columns, 'score_interests_origin_linkedin', 'research_topics')

jaccard_score_for_mult_columns(google_scholar_interest_columns, 'score_interests_origin_googlescholar', 'research_topics')

# ResearchGate and Orcid provide a single interests column each
for score_column, found_column in [
    ('score_interests_origin_researchgate', 'ResearchGate_Skills_and_Expertise'),
    ('score_interests_origin_orcid', 'Orcid_interests'),
]:
    main_data[score_column] = main_data.apply(
        lambda record, found_column=found_column: improved_jaccard_similarity(record['research_topics'], record[found_column]),
        axis=1,
    )

In [None]:
#Final score for each of 4 search results

# WEIGHTING MATCHES: .70 FOR NAMES, .20 FOR AFFILIATION AND .10 FOR INTERESTS.
NAME_WEIGHT = 0.7
AFFILIATION_WEIGHT = 0.2
INTERESTS_WEIGHT = 0.1

# The name score comes from fuzz.partial_ratio, which reports 0-100, while the
# affiliation/interests scores are Jaccard similarities in 0-1. Mixing the two scales
# directly made the name part worth up to 70 points and the other two at most 0.3
# together, i.e. the 70/20/10 weighting above was not what was actually computed.
# Jaccard scores are therefore rescaled to 0-100 before weighting; the final score is
# on a 0-100 scale and the name contribution is unchanged.
JACCARD_TO_PERCENT = 100


def _final_score(source_suffix):
    """Weighted 0-100 score for one search result, e.g. source_suffix="linkedin_0" or "orcid"."""
    return (main_data["score_name_origin_" + source_suffix] * NAME_WEIGHT) \
        + (main_data['score_affiliation_origin_' + source_suffix] * JACCARD_TO_PERCENT * AFFILIATION_WEIGHT) \
        + (main_data['score_interests_origin_' + source_suffix] * JACCARD_TO_PERCENT * INTERESTS_WEIGHT)


# LinkedIn and Google Scholar: one final score per search result (suffix _0, _1, ...)
for i in range(len(linkedin_name_columns)):
    main_data['score_final_linkedin_{}'.format(i)] = _final_score('linkedin_{}'.format(i))

for i in range(len(google_scholar_name_columns)):
    main_data['score_final_googlescholar_{}'.format(i)] = _final_score('googlescholar_{}'.format(i))

# ResearchGate and Orcid: a single search result each
main_data['score_final_researchgate'] = _final_score('researchgate')

main_data['score_final_orcid'] = _final_score('orcid')

In [None]:
# select best final score from all LinkedIn and Google_Scholar final scores

# final-score columns per source (one per search result for LinkedIn / Google Scholar)
all_score_linkedin = [label for label in main_data.columns if "score_final_linkedin" in label]
all_score_googlescholar = [label for label in main_data.columns if "score_final_googlescholar" in label]
all_score = [*all_score_linkedin, *all_score_googlescholar, 'score_final_researchgate', 'score_final_orcid']

# idxmax(axis=1) stores, for every row, the NAME of the column holding the highest score
for best_column, candidate_columns in (
    ('best_linkedin', all_score_linkedin),
    ('best_googlescholar', all_score_googlescholar),
    ('best_score_of_all', all_score),
):
    main_data[best_column] = main_data[candidate_columns].idxmax(axis=1)