In [1]:
text="""London is the capital and most populous city of England and the United Kingdom. Standing on 
the River Thames in the south east of the island of Great Britain, 
London has been a major settlement for two millennia. It was founded by the Romans, who named it Londinium."""

In [2]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

In [3]:
from nltk import pos_tag
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import PorterStemmer
from nltk.parse.corenlp import CoreNLPDependencyParser

# Sentence Tokenizer

In [4]:
sent=list(sent_tokenize(text))
print(sent)

['London is the capital and most populous city of England and the United Kingdom.', 'Standing on \nthe River Thames in the south east of the island of Great Britain, \nLondon has been a major settlement for two millennia.', 'It was founded by the Romans, who named it Londinium.']


# Words Tokenizer

In [5]:
def words(l):
    i=0
    words=[]
    for j in l:
        words.append(word_tokenize(j))
        i=i+1
    return words

In [30]:
word=words(sent)
print(word)
token=word

[['London', 'is', 'the', 'capital', 'and', 'most', 'populous', 'city', 'of', 'England', 'and', 'the', 'United', 'Kingdom', '.'], ['Standing', 'on', 'the', 'River', 'Thames', 'in', 'the', 'south', 'east', 'of', 'the', 'island', 'of', 'Great', 'Britain', ',', 'London', 'has', 'been', 'a', 'major', 'settlement', 'for', 'two', 'millennia', '.'], ['It', 'was', 'founded', 'by', 'the', 'Romans', ',', 'who', 'named', 'it', 'Londinium', '.']]


# Stop-Words

In [20]:
def stop_words(l):
    stop=[]
    sr = stopwords.words('english')
    for i in range(len(l)):
        sent=[]
        for j in range(len(l[i])):
            if l[i][j] not in sr:
                sent.append(l[i][j])
        stop.append(sent)
    return stop 

In [21]:
stop=stop_words(token)
print(stop)

[['London', 'capital', 'populous', 'city', 'England', 'United', 'Kingdom', '.'], ['Standing', 'River', 'Thames', 'south', 'east', 'island', 'Great', 'Britain', ',', 'London', 'major', 'settlement', 'two', 'millennia', '.'], ['It', 'founded', 'Romans', ',', 'named', 'Londinium', '.']]


# Part of Speech

In [31]:
def part_speech(word):
    wr=word
    for i in range(len(wr)):
        wr[i]=pos_tag(wr[i])
    return wr
print(part_speech(word))

[[('London', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('capital', 'NN'), ('and', 'CC'), ('most', 'RBS'), ('populous', 'JJ'), ('city', 'NN'), ('of', 'IN'), ('England', 'NNP'), ('and', 'CC'), ('the', 'DT'), ('United', 'NNP'), ('Kingdom', 'NNP'), ('.', '.')], [('Standing', 'VBG'), ('on', 'IN'), ('the', 'DT'), ('River', 'NNP'), ('Thames', 'NNP'), ('in', 'IN'), ('the', 'DT'), ('south', 'JJ'), ('east', 'NN'), ('of', 'IN'), ('the', 'DT'), ('island', 'NN'), ('of', 'IN'), ('Great', 'NNP'), ('Britain', 'NNP'), (',', ','), ('London', 'NNP'), ('has', 'VBZ'), ('been', 'VBN'), ('a', 'DT'), ('major', 'JJ'), ('settlement', 'NN'), ('for', 'IN'), ('two', 'CD'), ('millennia', 'NN'), ('.', '.')], [('It', 'PRP'), ('was', 'VBD'), ('founded', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('Romans', 'NNPS'), (',', ','), ('who', 'WP'), ('named', 'VBD'), ('it', 'PRP'), ('Londinium', 'NNP'), ('.', '.')]]


# Stemminization

In [23]:
stemmer = LancasterStemmer()

In [24]:
def word_stem(w):
    return lancaster_stemmer.stem(w)

In [25]:
"""words = ["program", "programs", "programer", "programing", "programers"] 
ps=PorterStemmer()
for w in words: 
    print(w, " : ", ps.stem(w))"""

'words = ["program", "programs", "programer", "programing", "programers"] \nps=PorterStemmer()\nfor w in words: \n    print(w, " : ", ps.stem(w))'

In [26]:
ps=PorterStemmer()
def stem(l):
    for i in range(len(l)):
        for j in range(len(l[i])):
            l[i][j]=stemmer.stem(l[i][j])
    return l

In [27]:
stemr=stem(stop)

In [28]:
print(stemr)

[['london', 'capit', 'pop', 'city', 'england', 'unit', 'kingdom', '.'], ['stand', 'riv', 'tham', 'sou', 'east', 'island', 'gre', 'britain', ',', 'london', 'maj', 'settl', 'two', 'millenn', '.'], ['it', 'found', 'rom', ',', 'nam', 'londin', '.']]


# Dependency Parsing

In [16]:
par=part_speech(stemr)
print(stemr)

[[('london', 'NN'), ('capit', 'JJ'), ('pop', 'NN'), ('city', 'NN'), ('england', 'VBP'), ('unit', 'NN'), ('kingdom', 'NN'), ('.', '.')], [('stand', 'NN'), ('riv', 'NN'), ('tham', 'NN'), ('sou', 'VBD'), ('east', 'JJ'), ('island', 'NN'), ('gre', 'NN'), ('britain', 'NN'), (',', ','), ('london', 'JJ'), ('maj', 'NN'), ('settl', 'VBD'), ('two', 'CD'), ('millenn', 'NN'), ('.', '.')], [('it', 'PRP'), ('found', 'VBD'), ('rom', 'NN'), (',', ','), ('nam', 'JJ'), ('londin', 'NN'), ('.', '.')]]


In [37]:
parser = CoreNLPDependencyParser()
#for i in stemr:
#    parse = next(parser.raw_parse(i))
#parse = next(parser.raw_parse(sent[0]))