In [2]:
# This script creates the final combined CSV, in which all core publications
# and their references are listed with distinct, retrievable attributes in each column.
# The manually extracted data is integrated at the end.

from csv import reader
import csv
from csv import DictReader
import re
import pandas as pd
from fuzzywuzzy import fuzz

# Core publications to process. Each entry is used as a sub-directory name
# under ./papers/ and as the file-name prefix of that paper's
# "<name>_joined.csv" output and "<name>_manually_added" input.
papers = ['Baghizadeh2020', 'D_Arcy2011', 'Günther2017','Moeini2019', 
          'Oehlhorn2020', 'Peireira2020', 'Piccoli2005', 'Schneider2014',
          'Siponen2004', 'Jiang2021', 'Teubner2020', 'Tsai2017',
          'Wiener2020', 'Xiao2013']


# First, the plain reference list is coupled with the
# relational file of core publications citing the references, so that all
# publications indexed by Scopus are integrated into the final CSV.

# For titles shorter than 5 words, additionally match the author and the year.
def fuzzy_classification_author_year (author, year, reference_compare):
    """Check whether a reference string also mentions the given author (or year).

    Used as an extra safeguard for short titles, where a plain substring
    match of the title alone is too likely to be a coincidence.

    Parameters:
        author: author field of the candidate publication; only the part
            before the first comma (the first author) is compared.
        year: publication year of the candidate as a string. Only consulted
            when no author name is available.
        reference_compare: the reference string, already reduced to
            lower-case alphanumeric characters.

    Returns:
        True if the first author occurs in the reference (exactly, or with a
        fuzzy partial ratio of at least 70), or - for publications without an
        author name - if the year occurs in the reference. False otherwise.
    """
    first_author = alphanumeric_filter_lower(author.split(",")[0])

    # Cheap exact containment first; the fuzzy comparison is only needed
    # when the name is not found verbatim.
    if first_author in reference_compare:
        return True
    if fuzz.partial_ratio(first_author, reference_compare) >= 70:
        return True

    # Without an author name the year is the only remaining evidence.
    # The function's own ``year`` argument is used here rather than a
    # module-level variable of the calling loop.
    if author == "[No author name available]":
        return year in reference_compare
    return False

def alphanumeric_filter_lower(string):
    """Return *string* reduced to its alphanumeric characters, in lower case.

    Whitespace, punctuation and every other non-alphanumeric character is
    dropped, so differently formatted strings can be compared by simple
    containment.
    """
    kept_characters = [character for character in string if character.isalnum()]
    return "".join(kept_characters).lower()
    

# Markup of the "<paper>_manually_added" files, as consumed below:
#   <a>...</a>       author(s) of the publication
#   <t>...</t>       title, tags created with sublime
#   <t>...<t>        title, tags created with the scraping algorithm
#   <PARENTARTICLE>  the line describes a core publication, not a reference
author_pattern_sublime = '(?<=<a>)(.*?)(?=</a>)'
pattern_sublime = '(?<=<t>)(.*?)(?=</t>)'
pattern_scraping = '(?<=<t>)(.*?)(?=<t>)'


def find_year(line):
    """Return all non-overlapping runs of four digits found in *line*."""
    return re.findall(r"\d{4}", line)


def same_dates(dates):
    """Return True if *dates* contains exactly one distinct value."""
    return len(set(dates)) == 1


def classified(line):
    """Return True if *line* carries a title tag (``<t>``)."""
    return '<t>' in line


def year_filter(years, min_year=1900, max_year=2021):
    """Keep only the entries of *years* that lie within a plausible range.

    Parameters:
        years: iterable of digit strings (as returned by ``find_year``).
        min_year, max_year: inclusive bounds of the accepted range.

    Returns:
        List of the accepted year strings, in their original order.
    """
    return [year for year in years if min_year <= int(year) <= max_year]


def get_author(line):
    """Return the text between ``<a>`` and ``</a>`` in *line*.

    If the closing tag is missing, a warning is printed and the placeholder
    "[No author name available]" is returned instead of raising.
    """
    author_match = re.search(author_pattern_sublime, line)
    if author_match is None:
        print("no closing author tag: " + line)
        return "[No author name available]"
    return author_match.group(0)


def get_title(line):
    """Return the tagged title contained in *line*.

    Lines containing ``</t>`` are parsed as ``<t>title</t>``, all others as
    ``<t>title<t>``. If no complete tag pair is found, a warning is printed
    and an empty string is returned, which callers treat as "no title".
    """
    if '</t>' in line:
        # tags created with sublime
        title_match = re.search(pattern_sublime, line)
    else:
        # tags created with the scraping algorithm
        title_match = re.search(pattern_scraping, line)
    if title_match is None:
        print("no closing tag: " + line)
        return ""
    return title_match.group(0)


for file in papers:
    paper_dir = "./papers/" + file + "/"

    # creating combined csv
    # (newline="" is what the csv module expects for files it reads or writes)
    with open(paper_dir + file + "_joined" + '.csv', 'w', encoding="utf-8", newline="") as joined_csv:
        writer = csv.writer(joined_csv)
        writer.writerow(["Author", "Title", "Year", "Source Title", "Document Type", "FAMILY_TYPE"])

        # Read the reference file (children) into memory once. Re-reading it
        # from disk for every single reference is very slow, and rewinding a
        # DictReader makes it hand out the header line as if it were data.
        # "utf-8-sig" strips the byte order mark in front of the first
        # column name, so the column is addressed as plain "Authors".
        with open(paper_dir + "scopus_children.csv", "r", encoding="utf-8-sig", newline="") as read_obj_children:
            children = []
            for row_child in DictReader(read_obj_children):
                title_child = row_child["Title"]
                if title_child == "[No title available]":
                    continue
                # (normalised title, title has fewer than 5 words, full row)
                children.append((alphanumeric_filter_lower(title_child),
                                 len(title_child.split()) < 5,
                                 row_child))

        # iterating through parent documents (relational csv)
        with open(paper_dir + "scopus_parents.csv", "r", encoding="utf-8-sig", newline="") as read_obj_parents:

            # for each core publication in the relational file
            for row_parent in DictReader(read_obj_parents):
                author_parent = row_parent['Authors']
                title_parent = row_parent['Title']
                year_parent = row_parent['Year']
                source_title_parent = row_parent['Source title']
                document_type_parent = row_parent['Document Type']
                references_parent = row_parent['References']

                # if reference list is empty -> show warning -> these articles will be manually integrated
                if references_parent == "":
                    print("ERROR CORE PUBLICATION REFERENCE: " + file)
                    print(author_parent + ", " + title_parent + " - has no references")
                    continue

                writer.writerow([author_parent, title_parent, year_parent, source_title_parent, document_type_parent, "PARENT"])

                # normalised titles already written for this parent
                already_appended_titles = set()
                reference_counter = 0

                # go through all references in the reference list and add the ones
                # that match to the according references in the relational file
                for reference in references_parent.split(";"):
                    reference_compare = alphanumeric_filter_lower(reference)
                    for title_compare, is_short_title, row_child in children:
                        if title_compare not in reference_compare:
                            continue

                        title_child = row_child["Title"]
                        author_child = row_child['Authors']
                        year_child = row_child['Year']

                        # short titles match too easily by accident, so the
                        # author (or year) has to agree as well
                        if is_short_title and not fuzzy_classification_author_year (author_child, year_child, reference_compare):
                            continue
                        if title_compare in already_appended_titles:
                            continue

                        reference_counter += 1
                        already_appended_titles.add(title_compare)
                        source_title_child = row_child['Source title']
                        document_type_child = row_child['Document Type']
                        writer.writerow([author_child, title_child, year_child, source_title_child, document_type_child, "CHILD"])
                        # at most one child per reference
                        break

                writer.writerow(["Reference Count: " + str(reference_counter), "", "", "", "", ""])
                writer.writerow(["", "", "", "", "", ""])

        # here the manually extracted data is integrated in the combined CSV file
        reference_counter = 0

        # go through each manually added line and add publications to the combined CSV file
        with open(paper_dir + file + "_manually_added", "r", encoding="utf-8") as read_obj_default:
            for line in read_obj_default:
                is_parent_line = '<PARENTARTICLE>' in line

                # a new core publication starts: close the previous group
                if is_parent_line and reference_counter > 0:
                    writer.writerow(["Reference Count: " + str(reference_counter), "", "", "", "", ""])
                    writer.writerow(["", "", "", "", "", ""])
                    # the group is closed, so it must not be counted again
                    # even if this parent line turns out to have no title
                    reference_counter = 0

                if '<t>' not in line:
                    continue
                title = get_title(line)
                if title == "":
                    continue

                # look for the year outside of the title, since titles may
                # contain four-digit numbers themselves
                cutted_line = line.replace(title, "")
                years_filtered = year_filter(find_year(cutted_line))
                if years_filtered:
                    year = years_filtered[0]
                else:
                    print('Error - no year: ' + line)
                    year = ""

                author = "[No author name available]"
                if '<a>' in line:
                    author = get_author(line)

                if is_parent_line:
                    reference_counter = 0
                    writer.writerow([author, title, year, line, "", "PARENT"])
                else:
                    reference_counter += 1
                    writer.writerow([author, title, year, line, "", "CHILD"])

        if reference_counter > 0:
            writer.writerow(["Reference Count: " + str(reference_counter), "", "", "", "", ""])
            writer.writerow(["", "", "", "", "", ""])

        
        

ERROR CORE PUBLICATION REFERENCE: Baghizadeh2020
Mähring M., Keil M., Information technology project escalation: A process model - has no references
ERROR CORE PUBLICATION REFERENCE: Baghizadeh2020
Bartis E., Mitev N., A multiple narrative approach to information systems failure: A successful system that failed - has no references
ERROR CORE PUBLICATION REFERENCE: Baghizadeh2020
Kasi V., Keil M., Mathiassen L., Pedersen K., The post mortem paradox: A Delphi study of IT specialist perceptions - has no references
ERROR CORE PUBLICATION REFERENCE: Baghizadeh2020
Keil Mark, Montealegre Ramiro, Cutting your losses: Extricating your organization when a big project goes awry - has no references
ERROR CORE PUBLICATION REFERENCE: Baghizadeh2020
Keil Mark, Mann Joan, Understanding the nature and extent of IS project escalation: Results from a survey of IS audit and control professionals - has no references


KeyboardInterrupt: 