### Document Preprocessing: Converting PDFs to a TSV File
The papers are stored as PDF files. This notebook extracts their text, combines it, and converts it to a TSV file, a format that is easier for Snorkel to process.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Apache Tika package to extract text from PDFs
import tika
from tika import parser

In [3]:
from pathlib import Path

# Collect every PDF below the folder (recursively).
# sorted() pins the processing order: pathlib does not guarantee the order in which
# glob() yields files, so without it the doc_<i> ids and the row order of paths.csv
# could differ between runs or machines.
pathlist = sorted(Path(r'more_pdfs_epicenter').glob('**/*.pdf'))    # pdfs stored in this folder

dict_list = []     # one Tika result dict per PDF
paths_list = []    # Path of each PDF, in the same order as dict_list

# parse each document's contents with Tika
for i, path in enumerate(pathlist):
    path_in_str = str(path)
    paths_list.append(path)
    parsed = parser.from_file(path_in_str)
    parsed["doc_name"] = "doc_" + str(i)    # sequential id: doc_0, doc_1, ...
    dict_list.append(parsed)

In [4]:
# save the PDF file locations to a csv: one path per row, no header, no index column
paths_df = pd.DataFrame(paths_list)
paths_df.to_csv(
    'paths.csv',
    index=False,
    header=False,
)

In [5]:
# number of parsed documents (one entry in dict_list per PDF that was parsed)
print(len(dict_list))

88


In [6]:
# gather each document's file name (Tika metadata field 'resourceName')
# and its extracted text into a two-column dataframe
file_names = [parsed_doc['metadata']['resourceName'] for parsed_doc in dict_list]
contents = [parsed_doc['content'] for parsed_doc in dict_list]

df = pd.DataFrame({'file_name': file_names, 'content': contents})

In [7]:
# Normalise whitespace in every text column so that each document fits on one TSV row.
# `\s+` replaces every run of whitespace with a single space. Unlike separate
# replacements for `\n`, `\s\s+` and `\t`, this also covers a lone carriage return,
# form feed, vertical tab or Unicode line separator, any of which can still be read
# as a row break by tools opening the exported TSV.
df = df.replace(r'\s+', ' ', regex=True)
# strip the leading whitespace left at the start of each document
df['content'] = df['content'].str.lstrip()

In [8]:
# preview: cleaned text of the first document
print(df['content'].iloc[0])

ISOLATION OF VIRUSES FROM WILD MAMMALS IN WEST AFRICA, 1966-1970 Journal of Wildlife Diseases Vol. 10, July, 1974 27’? ISOLATION OF VIRUSES FROM WILD MAMMALS IN WEST AFRICA, 1966-1970 GRAHAM E. KEMP*, OTTIS R. CAUSEY*, HENRY W. SETZERt, and DOROTHY 1. MOORE* Abstract: During the S-year period 1966-1970, a total of 7497 wild mammals of at least 101 different species were collected from 36 locations in Nigeria, Dahomey, and Togo and sampled for virus. The collections were made in five ecologically distinct vegetative zones: high forest, Guinea, Sudan, and Sahel woodland, and the J05 Plateau. Sixteen different virus types, represented by 83 isolates, were recovered, as follows: Arumowot (6 isolates), Bhanja (2), bluetongue type 7 (1), Chandipura (1), Congo (2), Dakar bat (3), Dugbe (1), IbAn 17143 (1), IbAn 33709 (1), Lebombo (1), Mokola (4), poxvirus IbAn 34325 (1), Semliki Forest (1), SudAn 754/6 1 (53), Uganda 5 (3), and West Nile (2). Viruses were isolated from Nigeria, the principal 

In [9]:
# write file names and contents to a tab-separated file (no header row, no index column)
df.to_csv(
    'pdfs_big.tsv',
    sep='\t',
    index=False,
    header=False,
    encoding='utf-8',
)

## NOTE: the exported file may not be perfectly formatted. Extra rows are removed by hand
## in Excel and the tsv is saved again before it is read into Snorkel notebook #1.

In [25]:
# load the bibliographic metadata (tab-separated text file, Windows-1252 encoded)
metadata = pd.read_csv(
    r'Flaviviruses_v2.txt',
    sep='\t',
    encoding="cp1252",
)

In [26]:
# clean up file column: keep only the text from character 26 onwards
# (presumably this removes a fixed-length folder prefix so that only the PDF file name
# remains -- TODO confirm against Flaviviruses_v2.txt)
# NOTE: not idempotent -- run once per fresh load of `metadata`
metadata['file'] = metadata['file'].str.slice(start=26)

In [27]:
# use the same column name as the PDF dataframe so the two tables can be joined on it
metadata = metadata.rename({'file': 'file_name'}, axis='columns')

In [28]:
# make sure the join key is a plain string on both sides before merging
df['file_name'] = df['file_name'].astype(str)
metadata['file_name'] = metadata['file_name'].astype(str)

In [29]:
# left join: keep every PDF row and attach its metadata where the file name matches
df2 = pd.merge(df, metadata, on='file_name', how='left')

In [30]:
# final dataframe: file name, extracted file content, and the metadata columns
# joined on file_name, all in one table
df2

Unnamed: 0,file_name,content,Title,Journal,Year,Authors,Volum,Issue,Date,Month
0,0090-3558-10.3.279.pdf,ISOLATION OF VIRUSES FROM WILD MAMMALS IN WEST...,Isolation of viruses from wild mammals in West...,J Wildl Dis,1974.0,"G. E. Kemp, O. R. Causey, H. W. Setzer and D. ...",10.0,3,279-93,Jul
1,059.041.0109.pdf,BioOne Complete (complete.BioOne.org) is a ful...,New records of ectoparasites Echinolaelaps ech...,Southwestern Entomologist,2016.0,"C. M. Baak-Baak, N. Cigarroa-Toledo, O. M. Tor...",41.0,1,75-86,
2,09-642913p460.pdf,SoutheaSt aSian J trop Med public health 460 V...,Zika Virus Infection in Australia Following a ...,Southeast Asian J Trop Med Public Health,2015.0,"G. H. Leung, R. W. Baird, J. Druce and N. M. A...",46.0,3,460-4,May
3,1-s2.0-0035920362900743-main.pdf,PII: 0035-9203(62)90074-3 504 TRANSACTIONS OF ...,Isolation of Ilheus virus from human beings in...,Trans R Soc Trop Med Hyg,1962.0,"L. Spence, C. R. Anderson and W. G. Downs",56.0,6,504-9,Nov
4,1-s2.0-S0147957197000246-main.pdf,PII: S0147957197000246 Epidemiology of tick-bo...,Epidemiology of tick-borne encephalitis in Japan,Comp Immunol Microbiol Infect Dis,1998.0,I. Takashima,21.0,2,81-90,Apr
5,1-s2.0-S1090023313005315-main.pdf,Detection of Usutu virus in a bullfinch (Pyrrh...,Detection of Usutu virus in a bullfinch (Pyrrh...,Vet J,2014.0,"M. M. Garigliany, D. Marlier, K. Tenner-Racz, ...",199.0,1,191-3,Jan
6,1-s2.0-S1877959X13000587-main.pdf,Detection and genetic characterization of tick...,Detection and genetic characterization of tick...,Ticks Tick Borne Dis,2014.0,"L. Jemersic, D. Dezdek, D. Brnic, J. Prpic, Z....",5.0,1,13-Jul,Feb
7,1-s2.0-S1877959X13001179-main.pdf,Surveillance of tick-borne encephalitis virus ...,Surveillance of tick-borne encephalitis virus ...,Ticks Tick Borne Dis,2014.0,"T. P. Mikryukova, N. S. Moskvitina, Y. V. Kono...",5.0,2,145-51,Mar
8,1040638711433325.pdf,Confirmed case of encephalitis caused by Murra...,Confirmed case of encephalitis caused by Murra...,J Vet Diagn Invest,2012.0,"A. N. Gordon, C. R. Marbach, J. Oakey, G. Edmu...",24.0,2,431-6,Mar
9,1040638712452723.pdf,Real-time fluorogenic reverse transcription po...,Real-time fluorogenic reverse transcription po...,J Vet Diagn Invest,2012.0,"D. Buitrago, A. Rocha, C. Tena-Tomas, M. Vigo,...",24.0,5,959-63,Sep


In [31]:
# write the merged table (content + metadata) to a tab-separated file, keeping the header row
df2.to_csv(
    'metadata_big.tsv',
    sep='\t',
    index=False,
    header=True,
    encoding='utf-8',
)

## Extracting more info from the text (e.g., virus abbreviations)

In [32]:
import re
#example = "hello 2131242 ffgg (TBEV) f fdr (ABCB) fsdr"
#match = re.findall("\([a-zA-Z]{2,6}?\)", example)
#print(match)

#example2 = "(USA) USA ERF Fegrd"
#match2 = re.findall("\(?USA\)?", example2)
#print(match2)

In [33]:
# For every document, find all parenthesised runs of 3-4 capital letters, e.g. "(TBEV)".
# The result is one list of matches per document. The raw string avoids the
# invalid-escape warning that "\(" triggers in a normal string literal.
matches = df2['content'].str.findall(r"\([A-Z]{3,4}?\)").tolist()
virus_abbrev = []
def removeNestings(l):
    """Flatten the (possibly nested) list `l` into the module-level `virus_abbrev`.

    Every element that is itself a list is descended into recursively; every
    other element is appended to `virus_abbrev` as-is. Returns None.
    """
    for i in l:
        if isinstance(i, list):
            removeNestings(i)
        else:
            virus_abbrev.append(i)

removeNestings(matches)
# De-duplicate and keep strings only: a document without extracted text yields NaN
# instead of a list of matches, and that NaN would otherwise end up among the acronyms.
# sorted() makes the order (and the exported csv) reproducible; set order is not stable
# between kernel sessions.
virus_abbrev = sorted({abbrev for abbrev in virus_abbrev if isinstance(abbrev, str)})

In [34]:
# Explicit object dtype: with no acronyms found, older pandas versions would build an
# empty float64 Series, on which the `.str` accessor used in the following cells fails.
virus_abbrev_series = pd.Series(virus_abbrev, dtype=object)

In [35]:
# remove parentheses around the virus acronym, e.g. "(TBEV)" => "TBEV"
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, in which case nothing would be replaced
no_paren = virus_abbrev_series.str.replace(r"\(|\)", "", regex=True)

In [36]:
# for acronyms ending with the letter V, build a fuller form by replacing the
# last character with " virus", e.g. JEV => JE virus
# na=False keeps the mask strictly boolean if a missing value is present;
# regex=True is required because pandas >= 2.0 defaults to a literal replacement
ends_with_v = no_paren.str.endswith('V', na=False)
just_v = no_paren[ends_with_v].str.replace(r".$", " virus", regex=True)

In [37]:
# combine all acronym variants into one Series:
# "(JEV)" forms, then "JEV" forms, then "JE virus" forms
# pd.concat replaces Series.append, which was removed in pandas 2.0
# NOTE: re-running this cell appends the variants again (both names are reassigned)
no_paren = pd.concat([no_paren, just_v])
virus_abbrev_series = pd.concat([virus_abbrev_series, no_paren])

In [38]:
# Remove false acronyms that are not virus names
# An entry is removed when it contains any of these strings (case-sensitive substring match).
FALSE_ACRONYMS = ['MAY', 'USA', 'PRNT', 'UVRI', 'con', 'PCR', 'CAL', 'EHD', 'PBS']
# NOTE(review): because this is a substring match, an acronym that merely contains one of
# the strings (e.g. a hypothetical "MAYV") is removed too -- confirm that this is intended.
false_acronym_pattern = '|'.join(FALSE_ACRONYMS)
# na=False: a missing value is never treated as a false acronym and keeps the mask boolean
is_false_acronym = virus_abbrev_series.str.contains(false_acronym_pattern, regex=True, na=False)
virus_abbrev_series = virus_abbrev_series[~is_false_acronym]


In [39]:
# write the acronym list to a csv file: one acronym per row, no header, no index column
virus_abbrev_series.to_csv(
    'virus_abbrev.csv',
    sep=",",
    header=False,
    index=False,
)