In [1]:
# Libraries for reading pdf
# PyPDF2 is the library that is used for handling .pdf files
import PyPDF2 as pdf2
# textract is a general text-extraction library; here it is used through its Tesseract OCR (Optical Character Recognition) backend
# to read text out of scanned (picture-only) PDF files. It can also be used with other image formats
import textract

In [2]:
# for data preprocessing
# Regular Expressions is used for finding patterns within the collected data
import re
# Dateutil library is used to parse text data to find date formats, datetime is simply being used for formatting date
from dateutil.parser import parse
from datetime import date
# The following libraries are used for any calculations that are needed, Biopython for DNA/Primer sequence data.
import math
from Bio.SeqUtils import molecular_weight as mw
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq

In [3]:
# For generating the JSON file array and dumping into the output file
import json

In [4]:
# Open the input PDF and pull the text of its first page with PyPDF2.
# `filename` and `text` are notebook-level variables that later cells read.
filename = "TestSpec.pdf"

# The context manager guarantees the handle is released even if PyPDF2 raises
# while parsing the file. Calling pdfFileObj.close() again afterwards is a
# harmless no-op on an already-closed file object.
with open(filename, 'rb') as pdfFileObj:
    pdfReader = pdf2.PdfFileReader(pdfFileObj)

    # Only the first page (index 0) is read.
    pageObj = pdfReader.getPage(0)

    # Extraction must happen while the file is still open, because the reader
    # pulls page content from the underlying stream on demand.
    text = pageObj.extractText()


In [5]:
# A PDF can carry a real text layer or just a scanned picture of the page.
# An empty result from the direct extraction above is taken to mean "scanned",
# in which case the file is run through textract's Tesseract OCR method instead.
if text == "":
    text = textract.process(filename, method="tesseract", language="eng")

# Data collection is finished, so release the PDF file handle.
pdfFileObj.close()


In [6]:
# Normalise the extracted text into one single-spaced string (`clean2`) that
# the following cells slice with fixed offsets.

# The OCR path hands back bytes, the PyPDF2 path a str. Calling str() on bytes
# produces the "b'...'" repr with every escape spelled out (\\n, \\x0c, \\xe2...),
# so bytes are decoded to real text instead of being stringified.
if isinstance(text, bytes):
    stringedText = text.decode("utf-8", errors="replace")
else:
    stringedText = str(text)

# Turn any literal backslash-n sequences into actual newlines.
formattedText = stringedText.replace('\\n', '\n')

# Keep only the useful region: from 10 characters past the start of "SHEET"
# up to 2 characters before "Secondary". str.index raises ValueError if either
# marker is missing.
sliceStart = formattedText.index("SHEET") + 10
sliceEnd = formattedText.index("Secondary") - 2
slice1 = formattedText[sliceStart:sliceEnd]

# Flatten newlines to spaces, then collapse runs of spaces into one.
clean = slice1.replace('\n', " ")

clean2 = re.sub(' +', ' ', clean)


In [8]:
# The sequence name doubles as the name of the output JSON file. It is taken
# as the 8 characters that start 11 characters after the first "Sequence".
seqLabelPos = clean2.index("Sequence")
name = clean2[seqLabelPos + 11:seqLabelPos + 19]


In [9]:
# Collect the shipment details: who ordered / received (asked interactively),
# the order date(s) found in the text, today's date, and the two reference numbers.
ordered = input("Who placed this order? ")

received = input("Who is receiving this order? ")

# Matches dates such as 05/11/2019, 05-Nov-19 or 05 November 2019.
# The month alternation is a NON-capturing group: with a second capturing group
# re.findall returns (date, month) tuples rather than the date strings, which
# then end up in the JSON as nested pairs. dateOrder is a list of date strings.
dateOrder = re.findall(
    r'(\d{2}[\/\- ](?:\d{2}|January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|July|Jul|August|Aug|September|Sep|October|Oct|November|Nov|December|Dec)[\/\- ]\d{2,4})',
    clean2)

dateReceived = date.today()

# Both slice bounds are measured from the same label position, so the result is
# always the 9 characters that follow "ref.No. " (str.index raises ValueError
# if the label is absent).
refPos = clean2.index("ref.No. ")
refNo = clean2[refPos + 8:refPos + 17]

# The order number is the 8 characters starting 10 past the "Order No." label.
orderPos = clean2.index("Order No.")
orderNo = clean2[orderPos + 10:orderPos + 18]


Who placed this order?  Brett Plemons
Who is receiving this order?  Brett Plemons


In [10]:
# Here we collect the oligo data needed for the calculations below:
# GC%, TM, molecular weight, dilution weight and dilution volume.

# Read the base count as the full run of digits that precedes the first
# "bases" (optionally separated by one non-digit character). A fixed
# two-character slice would misread any count of 100 or more.
basesPos = clean2.index("bases")
countMatch = re.search(r'(\d+)\D?$', clean2[:basesPos])
if countMatch is None:
    raise ValueError("no base count found in front of 'bases'")
bases = int(countMatch.group(1))

# Iterating a string yields single characters, so this keeps every A/G/C/T
# character of the cleaned text, in order.
# NOTE(review): that includes such capitals from any field that comes before the
# sequence itself - confirm the sheet layout guarantees there are none.
seqList = [char for char in clean2 if char in "AGCT"]

# The sequence is the first `bases` of those characters.
sequence = "".join(seqList[:bases])

# GC% is calculated by hand here; the author found BioPython's own helper
# less accurate for sequences this short.
def gcContent(sequence):
    """Return the GC percentage of ``sequence``, rounded to one decimal place.

    The percentage is the number of 'G' and 'C' characters divided by the
    length of ``sequence`` itself, so the result depends only on the argument
    and not on any notebook-level variable.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence; only upper-case 'G' and 'C' are counted.

    Returns
    -------
    float
        GC content in percent (0.0 - 100.0). An empty sequence gives 0.0
        instead of dividing by zero.
    """
    if not sequence:
        return 0.0
    gcCount = sum(1 for base in sequence if base in ('G', 'C'))
    return round((gcCount / len(sequence)) * 100, 1)

# Derived primer properties, all computed from the extracted sequence.
gc = gcContent(sequence)

# Melting temperature from Biopython's GC-based estimate, called with Na=50.
tm = mt.Tm_GC(sequence, Na=50)

# Molecular weight of the sequence treated as DNA, rounded to two decimals.
dnaSeq = Seq(sequence, generic_dna)
moleWeight = round(mw(dnaSeq), 2)

# The dilution weight is the 4 characters starting 10 past the "ug/OD260:"
# label, read as a float.
odLabelPos = clean2.index("ug/OD260:")
dilWeight = float(clean2[odLabelPos + 10:odLabelPos + 14])

# Dilution volume is ten times the dilution weight.
dilution = dilWeight * 10

In [13]:
# Finally, gather everything into a nested dictionary - the most convenient
# shape to serialise as JSON. The two sections are built separately and then
# combined; key order is the order in which they will appear in the file.
primerData = {
    "Sequence": sequence,
    "Bases": bases,
    "TM (50mM NaCl)": tm,
    "% GC content": gc,
    "Molecular weight": moleWeight,
    # NOTE(review): this key is spelled with a zero ("0D") although the label
    # searched for in the text uses the letter O; kept unchanged here.
    "ug/0D260": dilWeight,
    "Dilution volume (uL)": dilution,
}

shipmentInfo = {
    "Ref. No.": refNo,
    "Order No.": orderNo,
    "Ordered by": ordered,
    "Date of Order": dateOrder,
    "Received By": received,
    # Formatted as day-abbreviated month-year, e.g. 05-Nov-2019.
    "Date Received": dateReceived.strftime("%d-%b-%Y"),
}

primerDict = {
    "Primer Data": primerData,
    "Shipment Info": shipmentInfo,
}


In [14]:
# Serialise the dictionary to JSON and write it to a file named after the
# sequence. Serialising first means a failure there does not leave an empty
# file behind.
primerJSON = json.dumps(primerDict, ensure_ascii=False)

# ensure_ascii=False emits non-ASCII characters verbatim (e.g. accented names
# typed at the prompts), so the file is opened explicitly as UTF-8 instead of
# relying on the platform's default encoding, which may be unable to encode them.
with open("".join(name) + ".json", 'w', encoding="utf-8") as file:
    file.write(primerJSON)
    