# Text chunking, also referred to as shallow parsing, is a task that follows Part-Of-Speech Tagging and that adds more structure to the sentence. The result is a grouping of the words in “chunks”. Chunking up or down allows the speaker to use certain language patterns, to utilize the natural internal process through language, to reach for higher meanings or search for more specific bits/portions of missing information.

In [5]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer


In [6]:
train_text = state_union.raw("C:/Users/Aayush/Downloads/1660_131107_bundle_archive/Ford_1975.txt")
sample_text = state_union.raw("C:/Users/Aayush/Downloads/1660_131107_bundle_archive/Ford_1976.txt")

In [7]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [None]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            result = chunkParser.parse(nltk.pos_tag(train_text))
            print(result)
            chunked.draw()
    except Exception as e:
        print(str(e))

process_content()

(S
  (Chunk M/NNP r/NN)
  ./.
   /CC
  (Chunk S/NNP)
  p/VBP
  e/VBZ
  a/DT
  k/NN
  e/NN
  r/NN
  ,/,
  (Chunk  /NNP M/NNP r/NN)
  ./.
   /JJ
  (Chunk V/NNP i/NN)
  c/VBP
  e/NN
  (Chunk  /NNP P/NNP r/NN)
  e/NN
  s/NN
  i/NN
  d/VBP
  e/NN
  n/JJ
  t/NN
  ,/,
  (Chunk  /NNP M/NNP e/NN)
  m/NN
  b/NN
  e/NN
  r/NN
  s/NN
  (Chunk  /NNP)
  o/VBZ
  f/JJ
  (Chunk  /NNP t/NN)
  h/NN
  e/NN
   /VBD
  9/CD
  4/CD
  t/NN
  h/NN
  (Chunk  /NNP C/NNP)
  o/MD
  n/VB
  g/JJ
  r/NN
  e/NN
  s/NN
  s/NN
  ,/,
   /VB
  a/DT
  n/JJ
  d/NN
  (Chunk 
/NNP d/NN)
  i/NN
  s/VBP
  t/NN
  i/NN
  n/VBP
  g/NN
  u/NN
  i/NN
  s/VBP
  h/NN
  e/NN
  d/NN
  (Chunk  /NNP g/NN)
  u/JJ
  e/NN
  s/NN
  t/NN
  s/NN
  :/:
  
/JJ
  (Chunk 
/NNP T/NNP)
  w/VBD
  e/JJ
  n/JJ
  t/NN
  y/SYM
  -/:
  s/NN
  i/NN
  x/VBP
   /JJ
  y/NN
  e/VBZ
  a/DT
  r/NN
  s/NN
   /VBZ
  a/DT
  g/NN
  o/NN
  ,/,
   /VB
  a/DT
   /NN
  f/NN
  r/NN
  e/NN
  s/NN
  h/NN
  m/VBZ
  a/DT
  n/JJ
  (Chunk  /NNP C/NNP)
  o/MD
  n/VB
  g/JJ
  r/NN

(S
  (Chunk M/NNP r/NN)
  ./.
   /CC
  (Chunk S/NNP)
  p/VBP
  e/VBZ
  a/DT
  k/NN
  e/NN
  r/NN
  ,/,
  (Chunk  /NNP M/NNP r/NN)
  ./.
   /JJ
  (Chunk V/NNP i/NN)
  c/VBP
  e/NN
  (Chunk  /NNP P/NNP r/NN)
  e/NN
  s/NN
  i/NN
  d/VBP
  e/NN
  n/JJ
  t/NN
  ,/,
  (Chunk  /NNP M/NNP e/NN)
  m/NN
  b/NN
  e/NN
  r/NN
  s/NN
  (Chunk  /NNP)
  o/VBZ
  f/JJ
  (Chunk  /NNP t/NN)
  h/NN
  e/NN
   /VBD
  9/CD
  4/CD
  t/NN
  h/NN
  (Chunk  /NNP C/NNP)
  o/MD
  n/VB
  g/JJ
  r/NN
  e/NN
  s/NN
  s/NN
  ,/,
   /VB
  a/DT
  n/JJ
  d/NN
  (Chunk 
/NNP d/NN)
  i/NN
  s/VBP
  t/NN
  i/NN
  n/VBP
  g/NN
  u/NN
  i/NN
  s/VBP
  h/NN
  e/NN
  d/NN
  (Chunk  /NNP g/NN)
  u/JJ
  e/NN
  s/NN
  t/NN
  s/NN
  :/:
  
/JJ
  (Chunk 
/NNP T/NNP)
  w/VBD
  e/JJ
  n/JJ
  t/NN
  y/SYM
  -/:
  s/NN
  i/NN
  x/VBP
   /JJ
  y/NN
  e/VBZ
  a/DT
  r/NN
  s/NN
   /VBZ
  a/DT
  g/NN
  o/NN
  ,/,
   /VB
  a/DT
   /NN
  f/NN
  r/NN
  e/NN
  s/NN
  h/NN
  m/VBZ
  a/DT
  n/JJ
  (Chunk  /NNP C/NNP)
  o/MD
  n/VB
  g/JJ
  r/NN

(S
  (Chunk M/NNP r/NN)
  ./.
   /CC
  (Chunk S/NNP)
  p/VBP
  e/VBZ
  a/DT
  k/NN
  e/NN
  r/NN
  ,/,
  (Chunk  /NNP M/NNP r/NN)
  ./.
   /JJ
  (Chunk V/NNP i/NN)
  c/VBP
  e/NN
  (Chunk  /NNP P/NNP r/NN)
  e/NN
  s/NN
  i/NN
  d/VBP
  e/NN
  n/JJ
  t/NN
  ,/,
  (Chunk  /NNP M/NNP e/NN)
  m/NN
  b/NN
  e/NN
  r/NN
  s/NN
  (Chunk  /NNP)
  o/VBZ
  f/JJ
  (Chunk  /NNP t/NN)
  h/NN
  e/NN
   /VBD
  9/CD
  4/CD
  t/NN
  h/NN
  (Chunk  /NNP C/NNP)
  o/MD
  n/VB
  g/JJ
  r/NN
  e/NN
  s/NN
  s/NN
  ,/,
   /VB
  a/DT
  n/JJ
  d/NN
  (Chunk 
/NNP d/NN)
  i/NN
  s/VBP
  t/NN
  i/NN
  n/VBP
  g/NN
  u/NN
  i/NN
  s/VBP
  h/NN
  e/NN
  d/NN
  (Chunk  /NNP g/NN)
  u/JJ
  e/NN
  s/NN
  t/NN
  s/NN
  :/:
  
/JJ
  (Chunk 
/NNP T/NNP)
  w/VBD
  e/JJ
  n/JJ
  t/NN
  y/SYM
  -/:
  s/NN
  i/NN
  x/VBP
   /JJ
  y/NN
  e/VBZ
  a/DT
  r/NN
  s/NN
   /VBZ
  a/DT
  g/NN
  o/NN
  ,/,
   /VB
  a/DT
   /NN
  f/NN
  r/NN
  e/NN
  s/NN
  h/NN
  m/VBZ
  a/DT
  n/JJ
  (Chunk  /NNP C/NNP)
  o/MD
  n/VB
  g/JJ
  r/NN