In [1]:
import PyPDF2 
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Absolute path of the source PDF read by the extraction cells below.
# NOTE(review): machine-specific location — adjust when running elsewhere.
filename = '/home/paperspace/fastai/courses/dl1/data/Niyogi-JACS2008.pdf'

## Example of extracting text from a PDF using textract
### The entire document is extracted in one call; no need to loop through page numbers

In [3]:
# Extract the whole document with a single textract call.
text = textract.process(filename)
# Display the result; the output below shows it is a bytes object.
text

b'Published on Web 01/25/2007\n\nSelective Aggregation of Single-Walled Carbon Nanotubes via Salt Addition\nSandip Niyogi,\xe2\x80\xa0 Sofiane Boukhalfa,\xe2\x80\xa0 Satishkumar B. Chikkannanavar,\xe2\x80\xa0 Timothy J. McDonald,\xe2\x80\xa1\nMichael J. Heben,\xe2\x80\xa1 and Stephen K. Doorn*,\xe2\x80\xa0\nChemistry DiVision, Los Alamos National Laboratory, C-CSE, Los Alamos, New Mexico 87545, and Center for\nBasic Sciences, National Renewable Energy Laboratory, Golden, Colorado 80401\nReceived November 28, 2006; E-mail: skdoorn@lanl.gov\n\nSingle-walled carbon nanotubes (SWNTs) can be dispersed as\nindividuals in H2O using sodium dodecylsulfate (SDS), typically\nat 1% (35 mM) concentration. At and above the critical micellar\nconcentration (cmc 8 mM at 25 \xc2\xb0C), the degree of ionization of\nSDS is 0.27.1 Once intertube van der Waals (VdW) attraction is\novercome by intense sonication, free SDS adsorbs to SWNT\nsurfaces and creates a net surface density of negative charge, which\

In [4]:
### Save the extracted text as a *.txt file

# `text` holds the bytes returned by textract, hence binary mode. The context
# manager closes the handle even if the write raises.
# NOTE(review): the path is relative, so the file lands in the notebook's
# working directory — confirm that matches the directory it is read from later.
with open('Niyogi-JACS2008.txt', 'wb') as f:
    f.write(text)

## Example of extracting text from a PDF using PyPDF2
### Results in the parts of sentences between punctuation marks being interpreted as single words

In [None]:
# Open the PDF in binary read mode so PyPDF2 can parse it.
# NOTE(review): this handle is never closed in the visible cells.
pdfFileObj = open(filename, 'rb')

In [None]:
# Wrap the open file in a PyPDF2 reader; the cells below use it for page access.
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

In [None]:
# Quick check: extract and display the text of the first page (index 0).
pageObj = pdfReader.getPage(0)
pageObj.extractText()

In [None]:
# Knowing the number of pages lets us walk through every page of the document.
num_pages = pdfReader.numPages

# Collect the text of each page in order, then join once at the end instead of
# growing a string with repeated `+=`.
page_texts = []
for page_number in range(num_pages):
    pageObj = pdfReader.getPage(page_number)
    page_texts.append(pageObj.extractText())
text = "".join(page_texts)

In [None]:
# An empty result means PyPDF2 extracted no words (the original author notes it
# cannot read scanned files). Only in that case fall back to textract's
# tesseract OCR method to convert a scanned/image-based PDF into text.
# NOTE(review): textract.process returned bytes in the earlier cell, whereas the
# PyPDF2 path leaves `text` as str — confirm downstream cells handle both.
if text == "":
    text = textract.process(filename, method='tesseract', language='eng')

### Save the extracted text as a *.txt file

In [None]:
# `text` is str when it came from PyPDF2 and bytes when it came from the
# textract fallback. A file opened in binary mode only accepts bytes, so a
# str is encoded first (writing it directly raises TypeError).
text_bytes = text.encode('utf-8') if isinstance(text, str) else text

# The context manager closes the handle even if the write raises.
with open('Niyogi-JACS2008.txt', 'wb') as f:
    f.write(text_bytes)

In [None]:
# Print the extracted text in full for visual inspection.
print(text)

# Method for creating tokens from the .txt file using the fastai library

In [5]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

In [6]:
import sys
# Add the local fastai checkout to the module search path.
# NOTE(review): the `from fastai... import *` lines in the previous cell run
# before this append, so it cannot help them on a fresh kernel — confirm the
# intended cell order.
sys.path.append('/home/paperspace/fastai/')

In [7]:
# Data directory; the trailing slash matters because the file name is appended
# to it by plain string formatting in the next cell.
path='/home/paperspace/fastai/courses/dl1/data/'

In [8]:
# Path of the extracted-text file inside the data directory.
# NOTE(review): the save cells write 'Niyogi-JACS2008.txt' relative to the
# working directory — confirm that this is the same file.
file=f'{path}Niyogi-JACS2008.txt'

In [9]:
# Read the saved text back in as str. The bytes shown in the textract output
# above are UTF-8 (e.g. b'\xe2\x80\xa0' for the dagger), so decode explicitly
# rather than relying on the platform default encoding. The context manager
# closes the file handle.
with open(file, encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 11135


In [10]:
# Peek at the first 400 characters of the corpus.
text[:400]

'Published on Web 01/25/2007\n\nSelective Aggregation of Single-Walled Carbon Nanotubes via Salt Addition\nSandip Niyogi,† Sofiane Boukhalfa,† Satishkumar B. Chikkannanavar,† Timothy J. McDonald,‡\nMichael J. Heben,‡ and Stephen K. Doorn*,†\nChemistry DiVision, Los Alamos National Laboratory, C-CSE, Los Alamos, New Mexico 87545, and Center for\nBasic Sciences, National Renewable Energy Laboratory, Golden'

In [11]:
# Sorted list of the distinct characters that occur in the corpus.
chars = sorted(set(text))
# One more than the number of distinct characters — presumably to account for
# the "\0" entry inserted in the next cell.
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

total chars: 86


In [12]:
# Put the NUL character at index 0 of the vocabulary. The guard keeps this
# cell idempotent: re-running it no longer inserts a second "\0" and shifts
# every other index.
# NOTE(review): if the corpus itself ever contained "\0" the guard would skip
# the insert — not expected for text extracted from a PDF, but confirm.
if chars[:1] != ["\0"]:
    chars.insert(0, "\0")

# Preview the vocabulary, skipping the NUL entry and the last six characters.
''.join(chars[1:-6])

'\n\x0c $%()*+,-./0123456789:;@ABCDEFGHIJKLMNOPRSTUVWXYZabcdefghijklmnopqrstuvwxyz©°'

In [13]:
# Lookup tables between vocabulary positions and characters:
#   indices_char: position -> character
#   char_indices: character -> position
indices_char = dict(enumerate(chars))
char_indices = {char: position for position, char in indices_char.items()}

In [14]:
# Encode the whole corpus as a list of vocabulary indices.
idx = list(map(char_indices.__getitem__, text))

# Show the first ten encoded characters.
idx[:10]

[42, 72, 53, 63, 60, 70, 59, 56, 55, 3]

In [15]:
# Map indices back to characters (at most the first 100000) to check that the
# encoding round-trips to the original text.
''.join(map(indices_char.__getitem__, idx[:100000]))

'Published on Web 01/25/2007\n\nSelective Aggregation of Single-Walled Carbon Nanotubes via Salt Addition\nSandip Niyogi,† Sofiane Boukhalfa,† Satishkumar B. Chikkannanavar,† Timothy J. McDonald,‡\nMichael J. Heben,‡ and Stephen K. Doorn*,†\nChemistry DiVision, Los Alamos National Laboratory, C-CSE, Los Alamos, New Mexico 87545, and Center for\nBasic Sciences, National Renewable Energy Laboratory, Golden, Colorado 80401\nReceived November 28, 2006; E-mail: skdoorn@lanl.gov\n\nSingle-walled carbon nanotubes (SWNTs) can be dispersed as\nindividuals in H2O using sodium dodecylsulfate (SDS), typically\nat 1% (35 mM) concentration. At and above the critical micellar\nconcentration (cmc 8 mM at 25 °C), the degree of ionization of\nSDS is 0.27.1 Once intertube van der Waals (VdW) attraction is\novercome by intense sonication, free SDS adsorbs to SWNT\nsurfaces and creates a net surface density of negative charge, which\nprevents SWNT reaggregation. In such solutions, the excitonic\nabsorption

## Not needed if using the fastai library
### Convert Text into Keywords

In [None]:
import nltk
# Download NLTK's 'punkt' resource — presumably required by word_tokenize in
# the next cell.
nltk.download('punkt')

In [None]:
# The word_tokenize() function breaks the text into individual words.
# NOTE(review): preserve_line=True presumably skips the sentence-splitting
# step — confirm against the NLTK documentation.
tokens = word_tokenize(text, language = 'english', preserve_line = True)

In [None]:
# Punctuation tokens (plus a bare space) that should be cleaned out of the
# token list.
punctuations = list("();:[],") + [' ']

In [None]:
# Download NLTK's 'stopwords' resource, read via `stopwords.words` in the
# next cell.
nltk.download('stopwords')

In [None]:
# Initialise stop_words with NLTK's English stop-word list: words like "The",
# "I", "and", etc. that don't hold much value as keywords.
stop_words = stopwords.words('english')

In [None]:
# Keep only the tokens that are NOT stop words and NOT punctuation.
# - Punctuation is tested by membership in the list itself. The previous
#   `word in str(punctuations)` was a substring test against the list's repr
#   ("['(', ')', ...]"), which also discarded tokens such as "'" or ", ".
# - Stop words are compared case-insensitively so capitalised tokens such as
#   "The" are removed too (assumes the NLTK list is lower-case — TODO confirm).
# Sets give constant-time lookups for both checks.
stop_word_set = set(stop_words)
punctuation_set = set(punctuations)
keywords = [
    word for word in tokens
    if word.lower() not in stop_word_set and word not in punctuation_set
]

In [None]:
# Print the filtered keyword list.
print(keywords)