# Search the VIS paper xmls for keywords

The xml files can be found in all subfolders here and include full text extraction results from the grobid tool.

Authors: Petra Isenberg, Tingying He
Date: October 2021

Edit cell 5 Anne-Flore Cabouat - Mar 2023

In [None]:
#First we load a couple of required libraries

from lxml import etree
import xml.etree.ElementTree as ET
import os
import csv
from shutil import copyfile
import pandas as pd

In [None]:
teins = {'tei':'http://www.tei-c.org/ns/1.0'} #info on the xml structure

In [None]:
#path of the xml files
path = "../papers-xml-extractions/"

#csv output path (results)
results_path = "../1-keywords-literature-search-results/"

#if you're doing multiple rounds (different xml sources dir for same term), if not keep ''
round_nb = ""

In [None]:
#variables we will retrieve from XML files to build a dataframe
titleColumn = []
yearColumn = []
pathColumn = []
filenameColumn = []
doiColumn = []
searchtermColumn = []
conferenceColumn = []

In [None]:
#our XML files are stored in directories with naming convention: conference-year

#Function that will extract the conference name out of the folder name
def findConferenceName(dirpath):
    if 'InfoVis' in dirpath:
        return 'InfoVis'
    elif 'Vast' in dirpath:
        return 'Vast'
    elif 'SciVis' in dirpath:
        return 'SciVis'
    elif 'Vis-' in dirpath:
        return 'Vis'
    else:
        return 'not found'
    
#function that will extract the year out of the folder name
def findYear(dirpath):
    return dirpath[-4:]

#function that will find the year in DOI
years = [*range(1990, 2023, 1)]
def findYearFromDOI(doi, years):
    for year in years:
        if doi.find(f'.{year}.') != -1:
            YearFound = year
            return YearFound

In [None]:
#function called to drop duplicates files found
def dup_drop(df, subset_str):
    len_before = len(df)
    df.drop_duplicates(subset=[subset_str], keep='last', inplace=True, ignore_index=True) #droping duplicates
    len_after = len(df)
    print(f'Dropped {len_before} duplicates by {subset_str}')

In [None]:
#let's walk through all files
def search_term(searchterm):
    #let's walk through all files
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in [f for f in filenames if f.endswith(".xml")]:
            firstTime = 1
            searchtermText = ""

            conference = findConferenceName(dirpath)
            year = findYear(dirpath)

            filepath = dirpath + "/" + filename
            
            dirname = dirpath.replace(path, '')

            with open(filepath,'r', encoding='utf8') as file:
                try:
                    #first we check that we have information to identify the paper
                    tree = etree.parse(file)
                    root = etree.XML(etree.tostring(tree))
                    teiheader = root.find(".//tei:teiHeader",teins)
                    textNode = root.find(".//tei:text",teins)

                    #do we have a readable title?
                    title = teiheader.find(".//tei:title",teins)
                    if title is None or title =='':
                        title = f"Unknown title {dirname}/{filename}" #"unique" str to avoid duplicate check errors
                    if "Ã—Ã˜ ÃœÃ˜Ã" in title:
                        print("Probably error with file: " + filepath)
                    elif "@@@@" in title:
                        print("Probably error with file: " + filepath)

                    #do we have a DOI?
                    idno = teiheader.findall(".//tei:idno",teins)
                    doi = f"Unknown DOI {dirname}/{filename}" #"unique" str to avoid duplicate check errors
                    for i in idno:
                        if i.get("type") == "DOI":
                            doi = i.text
                            break
                    #attempt to retrieve year from DOI if it wasn't in dir name (old DOIs)
                    if not year.isnumeric():
                        year = findYearFromDOI(doi, years)
                        
                    print("Working on file: " + filename)

                    for elem in textNode.iter():
                        if elem.text: 
                            if searchterm.lower() in elem.text.lower():
                                searchtermText = searchtermText + "<" + elem.tag.replace("{http://www.tei-c.org/ns/1.0}","") + ">" + elem.text + "+"
                                #if the search term is found, do this

                                if firstTime == 1: #When the searchterm is found in a paper for the first time, do this
                                    print(doi)
                                    yearColumn.append(year)
                                    titleColumn.append(title.text)
                                    doiColumn.append(doi)
                                    pathColumn.append(filepath)
                                    filenameColumn.append(filename.replace(".tei.xml",""))
                                    firstTime = 0
                        if elem.tail: #check whether searchterm is in text after a closing tag
                            if searchterm.lower() in elem.tail.lower():
                                searchtermText = searchtermText + "<" + elem.tag.replace("{http://www.tei-c.org/ns/1.0}","") + ">" + elem.tail + "+"
                                #if the search term is found, do this

                                if firstTime == 1: #When the searchterm is found in a paper for the first time, do this
                                    yearColumn.append(year)
                                    titleColumn.append(title.text)
                                    doiColumn.append(doi)
                                    pathColumn.append(filepath)
                                    filenameColumn.append(filename.replace(".tei.xml",""))
                                    firstTime = 0

                    if searchtermText: # If the searchterm is found in this paper, do this
                        searchtermColumn.append(searchtermText)



                except Exception as e: # work on python 3.x
                    print(str(e))
                    #rows.append(["ERROR",file,"ERROR"])
                    #copyErrorFiles(filepath)

    data = {'path':pathColumn,
            'filename':filenameColumn,
            'title':titleColumn,
            'doi link':doiColumn,
            'year':yearColumn,
            'foundText':searchtermColumn,
            'conference': conferenceColumn
            }

    print(str(len(filenameColumn))+" and "+str(len(searchtermColumn)))

    df = pd.DataFrame(data) 
    df[f"term"] = searchterm+"; " #modified to facilitate merging dfs, keeping track of terms (AFC)
    
    #droping duplicates 
    dup_drop(df, 'foundText')
    dup_drop(df, 'path')
    dup_drop(df, 'title')
    
    #if there are duplicates left, store them for manual review
    dup_df = df[df.duplicated(subset=['filename'],keep=False)]
    if len(dup_df) > 0:
        print(f"\nRemaining duplicates by filename: {len(dup_df)}")
        dup_df.to_csv(f"{results_path}/paperFoundWith-"+searchterm+round_nb+"DUPLICATES.csv",index=False)
    else:
        print("No duplicates remaining based on filename")
    
    df.to_csv(f"{results_path}/paperFoundWith-"+searchterm+round_nb+".csv",index=False)

In [None]:
#Run one by one. Restart kernel or clear memory before running a new line

#search_term('readab')
#search_term('legib')
#search_term('likert')
search_term('deciph')

## Cleaning remaining duplicates
Different papers may have the same filename across folders without a distinctive DOI - in which case those files have been stored in dedicated csv files for each term searched.

Please manually go through -DUPLICATES.csv files in the results forlder to review them 
    > and clean the corresponding paperFoundwith*searchterm*.csv file

Then you can move on to the next part if needed (if you have mutliple terms) = merging multiple output csv files

# Merging the results

Author: Anne-Flore Cabouat
Date: March 2023

In [1]:
#required libraries

import pandas as pd
import numpy as np

In [2]:
##please change your local settings here

## input folder (default = previous step's out folder)
files_path = results_path

#output (defaut = same as input)
out_path = files_path
out_name = 'paperFound-MERGED'

#list the files you want to merge
#!! they all need to have the exact same column structure
files_dict = {
    #!! please double check that all keys are unique and represent a range of numbers starting from 1
    '1' : "paperFoundWith-readab.csv",
    '2' : "paperFoundWith-legib.csv",
    '3' : "paperFoundWith-likert.csv",
    '4' : "paperFoundWith-deciph.csv",
}
#warning for duplicate csv files in the dict
if len(files_dict) != len(set(files_dict.values())):
    raise ValueError("The dictionary contains repetition in filenames, please check again")

In [3]:
# let's read the files
def read_csvs(source_dict):
    #listing files
    files_list = []
    files_count = len(list(files_dict.keys()))
    print(f"{files_count} files listed")
    global files_range
    files_range = [*range(1,files_count+1,1)] #!! please double check that all keys are unique and represent a range of numbers starting from 1
    print(f"Range of files: {files_range}")
    #list of full file paths
    for k, f in files_dict.items():
        files_list.append(f"{files_path}/{f}")
    #reading csv and storing dfs in the global dict
    global df_dict
    try:
        df_dict = {i: pd.read_csv(files_list[i-1]) for i in files_range}
    except Exception as e:
        print(f"The function passed with an error: {str(e)}")
        pass


In [None]:
read_csvs(files_dict)

In [5]:
#column structure in a list
df_columns_list_full = df_dict[1].columns.values.tolist()
print(df_columns_list_full)

column_merge = 'path' #this columns will be used for identifying duplicates during the merge
df_columns_saved = ['foundText', 'term'] #these columns will be concatenated
df_columns_list = [item for item in df_columns_list_full if item not in df_columns_saved] #these columns will be combined_first
df_columns_list.remove(column_merge)
dup_drop = 'foundText' #how duplicates will be identified and droped (best on foundText)
dup_check = 'doi link' #to check for possible remaining duplicates after the drop

#intializing first df
df_merged = pd.DataFrame(columns=df_columns_list_full)
df_dup = df_merged
df_merged.info()

['path', 'filename', 'title', 'doi link', 'year', 'foundText', 'term']
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   path       0 non-null      object
 1   filename   0 non-null      object
 2   title      0 non-null      object
 3   doi link   0 non-null      object
 4   year       0 non-null      object
 5   foundText  0 non-null      object
 6   term       0 non-null      object
dtypes: object(7)
memory usage: 0.0+ bytes


In [6]:
#clean columns after merging
def combine_columns(df):
    #combine_first = if left has valuem keep that - else take value from right
    for n in df_columns_list : #leaving out first col (path, used to merge) and last col (term, we need to keep both)
        try:
            df[f'{n}_x'] = df[f'{n}_x'].combine_first(df[f'{n}_y']) #combining _x (left from merging) and _y (right from merging) columns into _x
            df.rename(columns = {f'{n}_x':f'{n}'}, inplace=True) #rename column _x
            del df[f'{n}_y'] #dropping the _y column
        except Exception as e:
            print(f"!! Couldn't combine columns {n}, error {e}")
            
    # concatenate right value to left one
    for m in df_columns_saved:
        ## replacing NaN with empty strings first
        df[f'{m}_x'] = df[f'{m}_x'].apply(lambda x: '' if x is np.nan else x)
        df[f'{m}_y'] = df[f'{m}_y'].apply(lambda x: '' if x is np.nan else x)
        ## concatenating terms & cleaning columns
        df[f'{m}_x'] = df[f'{m}_x'] + df[f'{m}_y']
        df.rename(columns = {f'{m}_x':f'{m}'}, inplace=True)
        del df[f'{m}_y']
    
    #droping duplicates & identifying possible remaning ones
    len_before = len(df)
    df.drop_duplicates(subset=[dup_drop], keep='last', inplace=True, ignore_index=True)
    len_after = len(df)
    print(f'Dropped {len_before} duplicates by {dup_drop}')

    print("\n************\nAFTER DUPLICATES CLEANED\n")
    df.info()
    
    global df_dup
    df_dup = pd.concat([df_dup, df[df.duplicated(subset=[dup_check],keep=False)]], ignore_index=True)
    if len(df_dup) > 0:
        print(f"\n!!! duplicates remaining based on {dup_check}: {len(df_dup)}.\n The full df_dup dataframe should be saved as csv.")
    else:
        print(f"\n No duplicates remaining based on {dup_check}\n")

    global df_merged
    df_merged = df
    return df

merge_count = 0

#merging 2 dataframess at a time
def merge_dfs(df_x, df_y):
    global merge_count
    merge_count += 1
    print(f"merging based on '{column_merge}' column")
    global df_merged
    df_merged = pd.merge(
        df_x, #left source
        df_y, #right source
        how="outer",
        on=column_merge,
        left_on=None,
        right_on=None,
        left_index=False,
        right_index=False,
        sort=True,
        suffixes=("_x", "_y"), #used to identify left and right sources in final df
        copy=True,
        indicator=False,
        validate=None,
    )
    print("\n************\nAFTER MERGE\n")
    df_merged.info()
    df_merged.to_csv(f"{out_path}/paperFound-MERGED+DOI_temp{merge_count}.csv",index=False)
    print("\n\n\n\n\n\n")
    combine_columns(df_merged)
    return df_merged


In [7]:
for i in files_range:
    try:
        merge_dfs(df_merged, df_dict[i])
    except Exception as e:
        print(f"\n *** An error occurred: {str(e)} *** \n \n")

merging based on 'path' column

************
AFTER MERGE

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84 entries, 0 to 83
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   filename_x   0 non-null      object 
 1   title_x      0 non-null      object 
 2   doi link_x   0 non-null      object 
 3   year_x       0 non-null      object 
 4   foundText_x  0 non-null      object 
 5   term_x       0 non-null      object 
 6   path         84 non-null     object 
 7   filename_y   84 non-null     object 
 8   title_y      84 non-null     object 
 9   doi link_y   84 non-null     object 
 10  year_y       45 non-null     float64
 11  foundText_y  84 non-null     object 
 12  term_y       84 non-null     object 
dtypes: float64(1), object(12)
memory usage: 9.2+ KB







Dropped 84 duplicates by foundText

************
AFTER DUPLICATES CLEANED

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data

In [8]:
#output files
df_merged.to_csv(f"{out_path}/{out_name}.csv",index=False)
if len(df_dup) > 0:
    df_dup.to_csv(f"{out_path}/{out_name}-remaining-duplicates.csv",index=False)

In [9]:
#Options for printing if you want intermediary checks in the functions
#pd.set_option('display.min_rows', 30)
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_colwidth', 25)

#or use print with:
#with pd.option_context('display.min_rows', 30, 'display.max_columns', None, 'display.max_colwidth', 25):
#    print(df) """