In [1]:
# imports
import glob
import os
import re
import numpy as np
import pandas as pd

In [2]:
# configure dataframe appearance
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

In [3]:
# save current working dir
cwd = os.getcwd()

# move one directory up to process global input files
os.chdir('..')

# read files from folder to dataframe
directory = './input/corpus/txtfiles_from_ocr/'
file_type = 'txt'
files = glob.glob("%s*.%s" % (directory, file_type))
df_files = pd.DataFrame(files, columns=['Filepath'])

In [4]:
# create checksum for each file to identify duplicates
# https://stackoverflow.com/questions/16874598/how-do-i-calculate-the-md5-checksum-of-a-file-in-python#16876405
import hashlib
def get_checksum(filepath: str) -> str:
    # Open,close, read file and calculate MD5 on its contents 
    with open(filepath, 'rb') as file_to_check:
        # read contents of the file
        data = file_to_check.read()    
        # pipe contents of the file through
        return hashlib.md5(data).hexdigest()
df_files['Checksum'] = df_files['Filepath'].apply(get_checksum)
df_files

Unnamed: 0,Filepath,Checksum
0,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,614144a0ffce9742be6d03e5d1f4327e
1,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,c70d4b5962b0baf267ad6574647815b1
2,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,94c1dcf77c98a1836066064856559340
3,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,73df94431b67b6f923b42db2113ec885
4,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,1ffa62ca53c5993d420bfdb2b230ab35
...,...,...
1041,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,1fc4032881cf480619bf7f24e48bb6b9
1042,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,f8179511c0b0643af87ce227ac553b98
1043,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,2c8067cf5b2aae6d59bb5175a75dde4e
1044,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,267aef8aa2b0e343c8f78002953cd2f3


In [5]:
# define functions to gather basic informations from the article files
def get_text_from_file(path: str) -> str:
    file = open(path,mode='r')
    text = file.read()
    file.close()
    return text

def get_year_from_file(path: str) -> str:    
    year = re.findall("\d+", path[-22:-17])[0]
    return year

def get_markers_from_file(path: str, id: int) -> str:
    info = os.path.basename(path).split("_")
    return info[id]

def get_authors_from_file(path: str) -> str:
    info = os.path.basename(path).split("_")
    return info[3][:-9]

In [6]:
# read basic informations from article files into dataframe
article_collection = []
for article in df_files.Filepath:
    article_details = {
        'Marker_1' : get_markers_from_file(article, 1),
        'Marker_2' : get_markers_from_file(article, 2),
        'Authors' : get_authors_from_file(article),
        'Year' : get_year_from_file(article),
        'Fulltext' : get_text_from_file(article),
    }
    article_collection.append(article_details)
df = pd.DataFrame(article_collection).join(df_files)

# reset working dir after input files have been processed
os.chdir(cwd)

df

Unnamed: 0,Marker_1,Marker_2,Authors,Year,Fulltext,Filepath,Checksum
0,Allergic Diseases,Angioedema,Arik yilmaz,2017,The persistence of chronic spontaneous urticar...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,614144a0ffce9742be6d03e5d1f4327e
1,Allergic Diseases,Angioedema,Bruno,2001,the Science of the\nTotal Environment\n\nAn In...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,c70d4b5962b0baf267ad6574647815b1
2,Allergic Diseases,Angioedema,Cousin,2016,Received Date : 11-Jan-2016\nRevised Date : 27...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,94c1dcf77c98a1836066064856559340
3,Allergic Diseases,Angioedema,Faisant,2016,\n\nExperimental Imm\n\nology\n\nORIGINAL ART...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,73df94431b67b6f923b42db2113ec885
4,Allergic Diseases,Angioedema,Kahveci,2020,Allergol Immunopathol (Madr). 2020;48(4):368-3...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,1ffa62ca53c5993d420bfdb2b230ab35
...,...,...,...,...,...,...,...
1041,TPO,Thyroglobulin,Levy,2003,ORIGINAL ARTICLE\n\n517\n\nChronic urticaria: ...,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,1fc4032881cf480619bf7f24e48bb6b9
1042,TPO,Thyroglobulin,Mozena,2010,ORIGINAL ARTICLE\n\n \n\nLack of a Role for Cr...,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,f8179511c0b0643af87ce227ac553b98
1043,TPO,Thyroglobulin,Sanchez,2020,Journal Pre-proof\n\nIdentification of antigen...,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,2c8067cf5b2aae6d59bb5175a75dde4e
1044,TPO,Thyroglobulin,Silvares,2017,Report\n\nInternational Journal of\n\nDermatol...,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,267aef8aa2b0e343c8f78002953cd2f3


In [7]:
# collect all markers for every unique document and append it to the dataframe as given markers
checksums = df['Checksum'].unique()
unique_markers_collection = []
for checksum in checksums:
    markers = df.loc[df['Checksum'] == checksum, ['Marker_1', 'Marker_2']]
    unique_markers = pd.unique(markers.values.ravel())  
    df.loc[df['Checksum'] == checksum, 'Given_Markers'] = ', '.join(map(str, unique_markers))
df

Unnamed: 0,Marker_1,Marker_2,Authors,Year,Fulltext,Filepath,Checksum,Given_Markers
0,Allergic Diseases,Angioedema,Arik yilmaz,2017,The persistence of chronic spontaneous urticar...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,614144a0ffce9742be6d03e5d1f4327e,"Allergic Diseases, Angioedema, Duration, Severity"
1,Allergic Diseases,Angioedema,Bruno,2001,the Science of the\nTotal Environment\n\nAn In...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,c70d4b5962b0baf267ad6574647815b1,"Allergic Diseases, Angioedema"
2,Allergic Diseases,Angioedema,Cousin,2016,Received Date : 11-Jan-2016\nRevised Date : 27...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,94c1dcf77c98a1836066064856559340,"Allergic Diseases, Angioedema"
3,Allergic Diseases,Angioedema,Faisant,2016,\n\nExperimental Imm\n\nology\n\nORIGINAL ART...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,73df94431b67b6f923b42db2113ec885,"Allergic Diseases, Angioedema, Duration, Antih..."
4,Allergic Diseases,Angioedema,Kahveci,2020,Allergol Immunopathol (Madr). 2020;48(4):368-3...,./input/corpus/txtfiles_from_ocr\CAPTUM_Allerg...,1ffa62ca53c5993d420bfdb2b230ab35,"Allergic Diseases, Angioedema, Duration, Omali..."
...,...,...,...,...,...,...,...,...
1041,TPO,Thyroglobulin,Levy,2003,ORIGINAL ARTICLE\n\n517\n\nChronic urticaria: ...,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,1fc4032881cf480619bf7f24e48bb6b9,"Thyroglobulin, ANA, TPO"
1042,TPO,Thyroglobulin,Mozena,2010,ORIGINAL ARTICLE\n\n \n\nLack of a Role for Cr...,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,f8179511c0b0643af87ce227ac553b98,"ASST, thyroglobulin, TPO, Thyroglobulin"
1043,TPO,Thyroglobulin,Sanchez,2020,Journal Pre-proof\n\nIdentification of antigen...,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,2c8067cf5b2aae6d59bb5175a75dde4e,"TPO, Thyroglobulin"
1044,TPO,Thyroglobulin,Silvares,2017,Report\n\nInternational Journal of\n\nDermatol...,./input/corpus/txtfiles_from_ocr\CAPTUM_TPO_Th...,267aef8aa2b0e343c8f78002953cd2f3,"ASST, thyroglobulin, TPO, Thyroglobulin"


In [8]:
# create dataframe for further analytics w/o the duplicate articles
df_analyze = df.drop_duplicates(subset=['Checksum'], keep='first')
df_analyze.reset_index(inplace=True, drop=True)
df_analyze = df_analyze.loc[:,['Authors', 'Year', 'Given_Markers', 'Fulltext']]
df_analyze

Unnamed: 0,Authors,Year,Given_Markers,Fulltext
0,Arik yilmaz,2017,"Allergic Diseases, Angioedema, Duration, Severity",The persistence of chronic spontaneous urticar...
1,Bruno,2001,"Allergic Diseases, Angioedema",the Science of the\nTotal Environment\n\nAn In...
2,Cousin,2016,"Allergic Diseases, Angioedema",Received Date : 11-Jan-2016\nRevised Date : 27...
3,Faisant,2016,"Allergic Diseases, Angioedema, Duration, Antih...",\n\nExperimental Imm\n\nology\n\nORIGINAL ART...
4,Kahveci,2020,"Allergic Diseases, Angioedema, Duration, Omali...",Allergol Immunopathol (Madr). 2020;48(4):368-3...
...,...,...,...,...
449,Sánchez,2019,"TPO, Severity",Hindawi\n\nJournal of Immunology Research\n\nV...
450,Czarnecka-Operacz,2017,"TPO, Thyroglobulin",Original paper\n\nThyroid function and thyroid...
451,Kasumagic-Halilovic,2017,"TPO, Thyroglobulin",Published online:05/02/2017\n\nORIGINAL PAPER\...
452,Sanchez,2020,"TPO, Thyroglobulin",Journal Pre-proof\n\nIdentification of antigen...
