In [2]:
# Perform standard imports
import spacy
# Load the 'en_core_web_lg' pipeline once; the classes, helpers and demo cells
# further down all call this shared module-level `nlp` object.
nlp = spacy.load('en_core_web_lg')

In [3]:
import os
import pprint
import re
import subprocess

import datefinder
import pdftotext
import PyPDF2
from spacy import displacy
from spacy.tokens import Span

In [4]:
class SyllabusParser:
    """Extract per-page text and exam dates from a syllabus.

    The source is either a PDF on disk (read with ``pdftotext``) or, when
    ``html`` is true, a list of sentences that was extracted elsewhere.

    Attributes:
        syllabus: Path to the PDF, or the sentence list in HTML mode.
        pages: Mapping ``{page index: [sentences]}`` filled by ``parse`` or
            ``addHTML``.
        numPages: Number of pages known so far (0 until a PDF is parsed,
            1 after ``addHTML`` on a fresh instance).
    """

    def __init__(self, syllabus, html = False):
        """
        Parameters:
            syllabus: Path to a PDF file, or a list of sentences if ``html``.
            html (bool): Treat ``syllabus`` as already-extracted sentences.
        """
        self.syllabus = syllabus # Path to syllabus
        self.pages = {} #{Page: [Sentences]}
        # The length of a path string is not a page count, so start at 0 and
        # let parse()/addHTML() record the real number.
        self.numPages = 0
        if html:
            self.addHTML(syllabus)

    def addHTML(self, sentences):
        """Store ``sentences`` as page 0 and refresh the page count."""
        self.pages[0] = sentences
        self.numPages = len(self.pages)

    def _loadPdf(self):
        """Read the PDF at ``self.syllabus`` and close the file handle afterwards."""
        with open(self.syllabus, 'rb') as handle:
            return pdftotext.PDF(handle)

    def parse(self, end = None):
        """
        Method for extracting text from pdf files
        Parameters:
            end (0-n): Parse up to (not including) this page index. ``None``
                parses every page; out-of-range values are clamped.
        Returns:
            allText (list of str): extracted lines of the parsed pages, in
                order, with runs of whitespace collapsed to single spaces.
        Side effects:
            Fills ``self.pages[i]`` for every parsed page and sets
            ``self.numPages`` to the number of pages in the PDF.
        """
        pdf = self._loadPdf()
        self.numPages = len(pdf)
        if end is None:
            end = self.numPages
        else:
            end = max(0, min(end, self.numPages))

        allText = []
        for i in range(end):
            # split()/join() strips the layout padding pdftotext leaves in each line.
            sentences = [" ".join(line.split()) for line in pdf[i].splitlines()]
            self.pages[i] = sentences
            allText.extend(sentences)

        return allText

    def getInstructorInfo(self):
        """Placeholder for instructor extraction; currently always returns None.

        It only loads the first page of the PDF so far.
        """
        pdf = self._loadPdf()
        # Instructor
        pg1 = pdf[0]
        return None

    def _findExamDates(self, num):
        """Return ``{DATE entity: ('Exam', sentence)}`` for page ``num``.

        Only sentences containing the literal text 'Exam' are considered, so
        the NLP pipeline is run on those sentences alone.
        """
        found = {} #"{Event: Time}"
        for sentence in self.pages[num]:
            if 'Exam' not in sentence:
                continue
            for ent in nlp(sentence).ents:
                if ent.label_ == 'DATE':
                    found[ent] = ('Exam', sentence)
        return found

    def addHTMLEvents(self, num):
        """
        Collect exam dates from page ``num`` of HTML-sourced sentences.
        TODO: Add support for quizzes, hw, etc.
        Parameters:
            num: Key into ``self.pages``.
        Returns:
            dict mapping each DATE entity to ``('Exam', sentence)``.
        """
        return self._findExamDates(num)

    def addEventsPg(self, num):
        """
        Collect exam dates from page ``num`` of a parsed PDF.
        TODO: Add support for quizzes, hw, etc.
        Parameters:
            num: Key into ``self.pages`` (the page must already be parsed).
        Returns:
            dict mapping each DATE entity to ``('Exam', sentence)``.
        """
        return self._findExamDates(num)

In [5]:
class extractInfo:
    """Stateless helpers that pull contact details and exam mentions out of text."""

    @classmethod
    def professorName(cls, text):
        """Return the first PERSON entity that is introduced by a salutation.

        An entity qualifies when its own first word is a salutation, or when
        the token just before it (optionally separated by one punctuation
        token, e.g. a colon) is a salutation.

        Returns:
            The entity text as found by the NER model, or None if nothing
            qualifies.
        """
        salutations = ["Prof.", "Prof", "Dr.", "Dr", "Instructor"]
        doc = nlp(u"{}".format(text))
        for ent in doc.ents:
            if ent.label_ != "PERSON":
                continue
            name = ent.text.split()
            # `name` is empty for a whitespace-only entity; guard the index.
            if name and name[0] in salutations:
                return ent.text
            # The model often leaves the title outside the entity, so also
            # look at what precedes it.
            prev = ent.start - 1
            if prev >= 0 and doc[prev].is_punct:
                prev -= 1
            if prev >= 0 and doc[prev].text in salutations:
                return ent.text
        return None

    @classmethod
    def phoneNumbers(cls, text):
        """
        Finds phone numbers of the form:
            111-111-1111, 111 111 1111, (111)111-1111, (111) 111-1111, (111) 111 1111

        Returns:
            list of str, each normalised to "AAA BBB CCCC".
        """
        # Regex pattern to extract phone numbers from text
        pattern = r'\({0,1}(\d{3})-*\s*\){0,1}\s*(\d{3})-*\s*(\d{4})\n{0,1}'
        return [" ".join(groups) for groups in re.findall(pattern, text)]

    @classmethod
    def emailAddresses(cls, text):
        """
        Finds email addresses in ``text``.

        Returns:
            list of str, in order of appearance.
        """
        # The hyphen sits last in each class so it is a literal; written as
        # "9-_" it forms a range that also admits '@', '<', '>', ':' etc.
        # Domain labels are matched dot-by-dot so a sentence-ending '.' is
        # not swallowed.
        pattern = r'[A-Za-z0-9_.+-]+@[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)*'
        return re.findall(pattern, str(text))

    @classmethod
    def exams(cls, text):
        """
        Finds an exam label such as "Exam II".

        The pattern is anchored with ``$`` and no MULTILINE flag, so it only
        matches a label at the very end of ``text``.

        Returns:
            list of str with the matches (empty if none).
        """
        pattern = r"Exam\s*[A-Za-z0-9_-]*$"
        # One scan over the text is enough; repeating it per word only
        # duplicated the same match.
        return re.findall(pattern, str(text))

In [6]:
def print_fine_pos(token):
    """Return the value of the token's ``tag_`` attribute."""
    fine_tag = token.tag_
    return fine_tag

def pos_tags(sentence):
    """Tag every token of ``sentence``.

    Parameters:
        sentence: Text to analyse; it is formatted into a string before
            being passed to the ``nlp`` pipeline.
    Returns:
        list of ``(token, tag)`` tuples in token order, where ``tag`` is the
        value returned by ``print_fine_pos``.
    """
    # Format the argument itself, not the notebook-global `s`.
    tokens = nlp(u'{}'.format(sentence))
    return [(tok, print_fine_pos(tok)) for tok in tokens]

In [7]:
def expand_person_entities(doc):
    """Widen PERSON entities to include a directly preceding title token.

    For each PERSON entity that does not start the document, the token just
    before it is compared against a fixed list of titles; on a match the
    entity is replaced by a span that starts one token earlier and keeps the
    same label. Every other entity is kept unchanged.

    Parameters:
        doc: A spaCy ``Doc`` whose ``ents`` have already been set.
    Returns:
        The same ``doc`` with ``doc.ents`` reassigned.
    """
    titles = ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms.", "Mrs", "Mrs.","Prof", "Prof.", "Instructor:")
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text in titles:
                # doc.ents only accepts spans, so keep the Span, not its text.
                ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
        new_ents.append(ent)
    doc.ents = new_ents
    return doc

# Add the component after the named entity recognizer
# nlp.add_pipe(expand_person_entities, after='ner')

In [8]:
def cleanMe(html):
    """Return the visible text of an HTML document, one phrase per line.

    ``<script>`` and ``<style>`` elements are removed, each line is stripped,
    lines are further split on double spaces, and empty pieces are dropped.

    NOTE(review): ``BeautifulSoup`` is not imported in the cells visible
    here; confirm it is imported before this function is called.
    """
    soup = BeautifulSoup(html)
    # Remove all javascript and stylesheet code.
    for tag in soup(["script", "style"]):
        tag.extract()

    pieces = []
    for raw_line in soup.get_text().splitlines():
        # A double space separates multiple headlines that share one line.
        for phrase in raw_line.strip().split("  "):
            phrase = phrase.strip()
            if phrase:
                pieces.append(phrase)
    return '\n'.join(pieces)

# f = open(nlph, "r")
# x = cleanMe(f)

# table_words = ['topics', 'sections', 'reading', 'lecture', 'assignment', 'date', 'week', 'sections', 'schedule']


In [9]:
bio = "./Syllabuses/BIOL_1107_Fall_2016_Syllabus.pdf"
mys = SyllabusParser(bio)
# parse() reads the PDF again on every call, so call it once and reuse the result.
text = " ".join(mys.parse())

print(extractInfo.emailAddresses(text))
print(extractInfo.phoneNumbers(text))
print(extractInfo.professorName(text))

['thomas.abbott@uconn.edu']
['860 486 2939']
None


In [10]:
# Show the named entities found in the full syllabus text.
displacy.render(
    nlp(text),
    style='ent',
    jupyter=True,
)

In [11]:
# Look for nominal subjects ("nsubj") in a typical exam schedule line.
sent = "Exam I:  Monday September 19th, 2016, (Testing Center) "
doc = nlp(sent)

sub_toks = [tok for tok in doc if tok.dep_ == "nsubj"]

print(sub_toks)

[]


In [12]:
exams = [
    "Exam I: Monday September 19th, 2016, (Testing Center)",
    "Exam II: Friday October 7th, 2016, (Testing Center)",
    "Exam III: Monday October 24th, 2016, (Testing Center)",
    "Exam IV: Tuesday November 15th, 2016, (Testing Center)",
    "Exam V: During Final Exam Week, taken in class/lecture hall as hard copy.",
]

# Render one dependency parse per exam line.
for s in exams:
    print("Sentence:", s)
    displacy.render(nlp(s), style='dep', jupyter=True)

Sentence: Exam I: Monday September 19th, 2016, (Testing Center)


Sentence: Exam II: Friday October 7th, 2016, (Testing Center)


Sentence: Exam III: Monday October 24th, 2016, (Testing Center)


Sentence: Exam IV: Tuesday November 15th, 2016, (Testing Center)


Sentence: Exam V: During Final Exam Week, taken in class/lecture hall as hard copy.
