In [5]:
import JobInterviewTreeBank as corpus

# Corpus functions 

The job interview corpus was collected from interview guide books and contains over 500 questions and their answers, covering general job interview topics. Our code mimics the structure of the corpus readers in the Python NLTK library and provides the following functions:

    - raw_text(): str corpus texts
    - raw_question_answer(): pandas dataframe of questions and answers
    - raw(): str unprocessed corpus contents
    - words(): list of str
    - sents(): list of str (one string per sentence)
    - tagged_words(): list of (str,str) tuple
    - tagged_sents(): list of (list of (str,str))
    - parsed_sents(): list of (Tree with str leaves)
    - dependency_parsed_sents(): list of str, one dependency parse per sentence (one tab-separated line per token)
    - visualize_parsed_tree(): visualize tree structure of a parsed_tree of a sentence

In [7]:
# Fetch the corpus as one plain-text string and preview its first 80 characters.
raw_text = corpus.raw_text()
print("Datatype : ", type(raw_text))
print("--- Sample Data ---", raw_text[:80], sep="\n")

Datatype :  <class 'str'>
--- Sample Data ---
 What are your key skills?
 After spending the past six years as a senior system


In [8]:
# Fetch the unprocessed corpus contents (bracketed parse text in the recorded
# output) and preview the first 110 characters.
raw = corpus.raw()
print("Datatype : ", type(raw))
print("--- Sample Data ---", raw[:110], sep="\n")

Datatype :  <class 'str'>
--- Sample Data ---

(  (SBARQ
    (WHNP (WP What))
    (SQ (VBP are)
      (NP (PRP$ your) (JJ key) (NNS skills)))
    (. ?))


In [9]:
# Load the question/answer pairs as a table and show the first rows.
# A dedicated name is used here so that `raw` (the corpus string loaded by
# corpus.raw() in the previous cell) is not silently rebound to a DataFrame.
question_answer_df = corpus.raw_question_answer()
print("Datatype : ", type(question_answer_df))
print("--- Sample Data ---")
# Last expression of the cell: rendered by the notebook's rich display.
question_answer_df.head()

Datatype :  <class 'pandas.core.frame.DataFrame'>
--- Sample Data ---


Unnamed: 0,question,answer
0,What are your key skills?,After spending the past six years as a senior...
1,What skills do you think are most critical to...,"As technology is ever changing, I think that ..."
2,"If you were to stay in your current job, what...","If I were to stay at my current job, I'd like..."
3,What skills would you like to develop in this...,I'd like to develop my negotiating skills. I'...
4,How well do you write?,I would say that my writing skills are above ...


In [10]:
# Tokenised corpus: show the container type and the first 50 tokens.
words = corpus.words()
print("Datatype : ", type(words))
print("--- Sample Data ---", words[:50], sep="\n")

Datatype :  <class 'list'>
--- Sample Data ---
['What', 'are', 'your', 'key', 'skills', '?', 'After', 'spending', 'the', 'past', 'six', 'years', 'as', 'a', 'senior', 'systems', 'analyst', ',', 'I', "'ve", 'developed', 'a', 'number', 'of', 'important', 'skills', ',', 'including', 'business', 'modeling', ',', 'process', 're-engineering', ',', 'software-package', 'evaluation', ',', 'and', 'advanced', 'programming', 'capabilities', 'in', 'UNIX', 'and', 'C', 'environments', '.', 'I', 'was', 'very']


In [11]:
# Sentence-level view of the corpus: show the container type and the first 10 sentences.
sents = corpus.sents()
print("Datatype : ", type(sents))
print("--- Sample Data ---", sents[:10], sep="\n")

Datatype :  <class 'list'>
--- Sample Data ---
[' What are your key skills?', "After spending the past six years as a senior systems analyst, I've developed a number of important skills, including business modeling, process re-engineering, software-package evaluation, and advanced programming capabilities in UNIX and C environments.", 'I was very pleased to discover that these are the skills you are seeking.', 'Would you like to hear about specific examples of my work?', 'What skills do you think are most critical to this job?', 'As technology is ever changing, I think that it is important to keep up with the latest marketing trends.', 'Knowing what kinds of new technologies exist and how to go about incorporating them into my own marketing plans is what will keep me ahead of the competition.', 'Creativity is also of major importance to the marketing industry; new ideas can quickly become stale and stagnant.', 'A successful marketing associate will always be looking ahead to the next b

In [12]:
# POS-tagged tokens: show the container type and the first 10 (word, tag) pairs.
tagged_words = corpus.tagged_words()
print("Datatype : ", type(tagged_words))
print("--- Sample Data ---", tagged_words[:10], sep="\n")

Datatype :  <class 'list'>
--- Sample Data ---
[('What', 'WP'), ('are', 'VBP'), ('your', 'PRP$'), ('key', 'JJ'), ('skills', 'NNS'), ('?', '.'), ('After', 'IN'), ('spending', 'VBG'), ('the', 'DT'), ('past', 'JJ')]


In [13]:
# POS-tagged sentences: show the container type and the second sentence (index 1)
# as a list of (word, tag) pairs.
tagged_sents = corpus.tagged_sents()
print("Datatype : ", type(tagged_sents))
print("--- Sample Data ---", tagged_sents[1], sep="\n")

Datatype :  <class 'list'>
--- Sample Data ---
[('After', 'IN'), ('spending', 'VBG'), ('the', 'DT'), ('past', 'JJ'), ('six', 'CD'), ('years', 'NNS'), ('as', 'IN'), ('a', 'DT'), ('senior', 'JJ'), ('systems', 'NNS'), ('analyst', 'NN'), (',', ','), ('I', 'PRP'), ("'ve", 'VBP'), ('developed', 'VBN'), ('a', 'DT'), ('number', 'NN'), ('of', 'IN'), ('important', 'JJ'), ('skills', 'NNS'), (',', ','), ('including', 'VBG'), ('business', 'NN'), ('modeling', 'NN'), (',', ','), ('process', 'NN'), ('re-engineering', 'NN'), (',', ','), ('software', 'NN'), ('-', 'HYPH'), ('package', 'NN'), ('evaluation', 'NN'), (',', ','), ('and', 'CC'), ('advanced', 'JJ'), ('programming', 'NN'), ('capabilities', 'NNS'), ('in', 'IN'), ('UNIX', 'NNP'), ('and', 'CC'), ('C', 'NNP'), ('environments', 'NNS'), ('.', '.')]


In [15]:
# Constituency parses: show the container type and the tree of the first sentence.
parsed_sents = corpus.parsed_sents()
print("Datatype : ", type(parsed_sents))
print("--- Sample Data ---", parsed_sents[0], sep="\n")

Datatype :  <class 'list'>
--- Sample Data ---
(ROOT
  (SBARQ
    (WHNP (WP What))
    (SQ (VBP are) (NP (PRP$ your) (JJ key) (NNS skills)))
    (. ?)))


In [16]:
# Draw the parse tree of the first sentence; relies on `parsed_sents` from the
# previous cell. The recorded output is a text (ASCII) rendering of the tree.
corpus.visualize_parsed_tree(parsed_sents[0])

          ROOT                   
           |                      
         SBARQ                   
  _________|___________________   
 |              SQ             | 
 |     _________|___           |  
WHNP  |             NP         | 
 |    |     ________|____      |  
 WP  VBP  PRP$      JJ  NNS    . 
 |    |    |        |    |     |  
What are  your     key skills  ? 



In [17]:
# Dependency parses: show the container type and the parse of the first sentence
# (one tab-separated line per token in the recorded output).
dependency_parsed_sents = corpus.dependency_parsed_sents()
print("Datatype : ", type(dependency_parsed_sents))
print("--- Sample Data ---", dependency_parsed_sents[0], sep="\n")

Datatype :  <class 'list'>
--- Sample Data ---
What	WP	0	root
are	VBP	1	cop
your	PRP$	5	nmod:poss
key	JJ	5	amod
skills	NNS	1	nsubj

