In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from glob import glob
from skimage import io, transform
from google.colab import drive

# Render matplotlib figures inside the notebook output
%matplotlib inline

# Mount Google Drive at /content/drive; later cells read from and write to paths under drive/MyDrive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset: a pipe-delimited file without a header row
df = pd.read_csv(
  'drive/MyDrive/Capstone/author-capstone-20K-limit.csv',
  sep='|',
  header=None,
)
df.columns = ["pii", "Section_title", "Section_content", "Authors"]

# The section title column is dropped; only the content and author columns are kept
df = df.drop("Section_title", axis=1)

# Store both remaining text columns as strings; backslashes are removed from
# the author records before the cast
df = df.assign(
  Authors=df['Authors'].str.replace(r'\\', "", regex=True).astype(str),
  Section_content=df['Section_content'].astype(str),
)


In [None]:
# Display the loaded frame (pii, Section_content, Authors)
df

Unnamed: 0,pii,Section_content,Authors
0,B9780128227060000123,COVID-19 has made a huge impact on everyone’s ...,
1,B9780323856799000143,"Valentí Gómez, Ramazan Gundogdu, and Alexander...",
2,S0001457520316912,"Qinghong Chen: Conceptualization, Data - proce...","[{Authorseq"":1,""auid"":57221096756,""given_name_..."
3,S000145752031770X,The authors confirm contribution to the paper ...,"[{Authorseq"":1,""auid"":57221288000,""given_name_..."
4,S0001457521002001,The authors confirm contribution to the paper ...,"[{Authorseq"":1,""auid"":57208572807,""given_name_..."
...,...,...,...
19995,S0024320519309415,"Y.Z. conceived the work, designed and performe...","[{Authorseq"":1,""auid"":57196239544,""given_name_..."
19996,S0024320520305774,"Emily Durham: Conceptualization, Methodology, ...","[{Authorseq"":1,""auid"":36875917900,""given_name_..."
19997,S0024320520305853,"Junling Lin: Major performer, LeaderXiaokai Fe...","[{Authorseq"":1,""auid"":57209316649,""given_name_..."
19998,S0024320520309747,"C.Z. performed the experiments; C.Z., Y.Q., an...","[{Authorseq"":1,""auid"":57218442730,""given_name_..."


In [None]:
def _clean_author_field(raw):
  """Strip the closing "}]" and surrounding double quotes from a raw field value.

  Returns '' when nothing but whitespace is left, so callers can treat the
  field as missing with a simple truthiness test.
  """
  value = raw.replace("}]", "").strip('"')
  return value if value.strip() else ''


def _name_variations(given_name, surname, initial):
  """List the spellings under which one author may appear in the text.

  Components that are missing (empty) are skipped together with every
  variation that would need them, so no whitespace-only or truncated key
  is ever produced.
  """
  var = []

  # variations of full name format
  if given_name and surname:
    var.append(given_name + ' ' + surname)                        # Harry James Potter

  if initial and surname:
    s0 = surname[0]
    i0 = initial[0]
    bare = initial.replace(".", '')

    # variations of initial + surname
    var.append(initial + ' ' + surname)                           # H.J. Potter
    var.append(initial + surname)                                 # H.J.Potter
    var.append(bare + ' ' + surname)                              # HJ Potter
    var.append(bare + '- ' + surname)                             # HJ- Potter
    var.append(i0 + '.' + surname)                                # H.Potter
    var.append(i0 + '. ' + surname)                               # H. Potter

    # variations of initials
    var.append(initial + s0 + '.')                                # H.J.P.
    var.append(initial + ' ' + s0 + '.')                          # H.J. P.
    var.append(initial + s0)                                      # H.J.P
    var.append(initial + ' ' + s0)                                # H.J. P
    var.append(i0 + '.' + s0)                                     # H.P
    var.append(i0 + '.' + s0 + '.')                               # H.P.
    var.append(i0 + '. ' + s0)                                    # H. P
    var.append(i0 + '. ' + s0 + '.')                              # H. P.
    var.append(initial.replace(".", ". ") + s0 + '.')             # H. J. P.
    var.append(initial.replace(".", ". ") + s0)                   # H. J. P
    var.append(initial.replace(".", ".-", 1) + ' ' + s0)          # H.-J. P
    var.append(initial.replace(".", ".-", 1) + s0)                # H.-J.P
    var.append(initial.replace(".", "-", 1) + s0)                 # H-J.P
    var.append(initial.replace(".", "-", 1) + ' ' + s0)           # H-J. P

    # variations of abbreviations
    var.append(i0 + s0)                                           # HP
    var.append(bare + s0)                                         # HJP

    # variations of reversed initials
    var.append(s0 + '.' + initial)                                # P.H.
    var.append(s0 + '.' + i0)                                     # P.H
    var.append(s0 + '. ' + initial)                               # P. H.
    var.append(s0 + '. ' + i0)                                    # P. H

    # variations of reversed abbreviations
    var.append(s0 + i0)                                           # PH

  # variations of only surname or only given name
  if surname:
    var.append(surname)
  if given_name:
    var.append(given_name)

  return var


def author_conversion(authors, contents):
  """Replace author names found in `contents` with ' #<author id> ' markers.

  Parameters
  ----------
  authors : str
    Serialized author records separated by "},{", each a comma-separated
    list of key:value entries. The keys read are auid, given_name_pn,
    surname_pn and initials_pn. Empty, non-string or unparsable input
    simply yields no replacements.
  contents : str
    Text in which the author names are searched.

  Returns
  -------
  str or None
    The converted text, or None when `contents` is empty.

  Notes
  -----
  Matching is plain substring replacement (names glued to a neighbouring
  word are still found), so very short variations such as "HP" can also hit
  unrelated text. Variations are applied longest first so that a short key
  (e.g. a surname shared by two authors) cannot break up a longer name
  before it has been matched. When two authors share a variation, the
  author listed first keeps it.
  """
  if not contents:
    return None

  # maps every name variation to its author id; stays empty when there is
  # nothing to parse, so the replacement loop below is always safe to run
  d = {}

  if isinstance(authors, str) and authors:
    # split the author string into info of authors
    for author in authors.split("},{"):
      auid = ''
      given_name = ''
      surname = ''
      initial = ''

      for entry in author.split(","):
        # each entry holds one key:value pair; fragments without a value
        # (e.g. pieces of a name that itself contained a comma) are skipped
        parts = entry.split(":")
        if len(parts) < 2:
          continue
        value = parts[1]

        if "auid" in entry:
          auid = value
        elif "given_name_pn" in entry:
          given_name = _clean_author_field(value)
        elif "surname_pn" in entry:
          surname = _clean_author_field(value)
        elif "initials_pn" in entry:
          initial = _clean_author_field(value)

      # without an id there is nothing to substitute for this author
      if not auid.strip():
        continue

      # project the variations to the author id (first author wins on clashes)
      for variation in _name_variations(given_name, surname, initial):
        if variation.strip():
          d.setdefault(variation, auid)

  c = contents
  # sorted() is stable, so equally long variations keep their insertion order
  for variation in sorted(d, key=len, reverse=True):
    # add a '#' at the beginning of the author id to facilitate further parsing
    c = c.replace(variation, ' #' + d[variation] + ' ')

  return c


In [None]:
# Create a new feature to store the converted text: each row's author record
# is paired with its section content and run through author_conversion
df["Converted_Section_Content"] = [
  author_conversion(row_authors, row_content)
  for row_authors, row_content in zip(df["Authors"], df["Section_content"])
]

In [None]:
# Write the frame, including the converted column, to Drive and then display it.
# NOTE(review): to_csv is called with its defaults, so the row index is written as
# an unnamed first column — confirm that downstream readers expect it.
df.to_csv('drive/MyDrive/Capstone/data/converted.csv')
df

Unnamed: 0,pii,Section_content,Authors,Converted_Section_Content
0,B9780128227060000123,COVID-19 has made a huge impact on everyone’s ...,,COVID-19 has made a huge impact on everyone’s ...
1,B9780323856799000143,"Valentí Gómez, Ramazan Gundogdu, and Alexander...",,"Valentí Gómez, Ramazan Gundogdu, and Alexander..."
2,S0001457520316912,"Qinghong Chen: Conceptualization, Data - proce...","[{Authorseq"":1,""auid"":57221096756,""given_name_...","#57221096756 : Conceptualization, Data - proc..."
3,S000145752031770X,The authors confirm contribution to the paper ...,"[{Authorseq"":1,""auid"":57221288000,""given_name_...",The authors confirm contribution to the paper ...
4,S0001457521002001,The authors confirm contribution to the paper ...,"[{Authorseq"":1,""auid"":57208572807,""given_name_...",The authors confirm contribution to the paper ...
...,...,...,...,...
19995,S0024320519309415,"Y.Z. conceived the work, designed and performe...","[{Authorseq"":1,""auid"":57196239544,""given_name_...","#57196239544 conceived the work, designed an..."
19996,S0024320520305774,"Emily Durham: Conceptualization, Methodology, ...","[{Authorseq"":1,""auid"":36875917900,""given_name_...","#36875917900 : Conceptualization, Methodology..."
19997,S0024320520305853,"Junling Lin: Major performer, LeaderXiaokai Fe...","[{Authorseq"":1,""auid"":57209316649,""given_name_...","#57209316649 : Major performer, Leader #57209..."
19998,S0024320520309747,"C.Z. performed the experiments; C.Z., Y.Q., an...","[{Authorseq"":1,""auid"":57218442730,""given_name_...",#57218442730 performed the experiments; #57...


In [None]:
def converted_status(content, converted_content, authors):
  """Classify the outcome of the author-name conversion for one row.

  Returns "Converted" when the converted text differs from the original
  content. Otherwise returns "Missing" when `authors` is the string 'nan',
  and "Edge Cases" for everything else.
  """
  if converted_content != content:
    status = "Converted"
  else:
    # text unchanged: either there was no author info at all, or none of
    # the name variations occurred in the content
    status = "Missing" if authors == 'nan' else "Edge Cases"
  return status

In [None]:
# Label every row by comparing its original and converted content
df["Conversion_Status"] = [
  converted_status(original, converted, row_authors)
  for original, converted, row_authors in zip(df["Section_content"],
                                              df["Converted_Section_Content"],
                                              df["Authors"])
]

In [None]:
# Legend for the status labels; written as a bare string literal, so it is
# evaluated and discarded rather than displayed
'''
The new column Conversion_Status will include three types of output:
Converted: the author names in contents are successfully converted into author ids
Missing: that observation does not include any author info (is nan)
Edge Cases: no author name is shown in the content
'''

# Count how many rows received each status label
df['Conversion_Status'].value_counts()

Converted     16210
Missing        3051
Edge Cases      739
Name: Conversion_Status, dtype: int64