# Learning Objectives
To implement POS Tagging from scratch using Decision Tree Classifier

In [3]:
#Imports
import nltk
from nltk.corpus import treebank as tb

In [4]:
#Packages
nltk.download('treebank')
nltk.download('punkt')
nltk.download('tagsets')


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Dil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Dil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.


True

In [14]:
import pprint
tagged_sentences = tb.tagged_sents()

print("Tagged sentences: ",len(tagged_sentences))

print("Tagged Words: ",len(tb.tagged_words())) 

pprint.pprint(tagged_sentences[0])


Tagged sentences:  3914
Tagged Words:  100676
[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]
[('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')]


# Feature Extractions

In [15]:
def features(sentence,index):
    features_dict = {
        '_prefix':sentence[index][0],
        '__prefix':sentence[index][:2],
        '___prefix':sentence[index][:3],
        '_suffix':sentence[index][-1],
        '__suffix':sentence[index][-2:],
        '___suffix':sentence[index][-3:],
        '_prev_word': '' if index == 0 else sentence[index-1],
        '_next_word': '' if index == len(sentence) - 1 else sentence[index+1],
        '_is_numeric': sentence[index].isdigit()
    }
    return features_dict

## Untagging the sentence

In [17]:
def untag_sentence(tagged_sentence):
    return [word for word,tag in tagged_sentence]

## Generate the features and target values

In [37]:
def generate_dataset(tagged_sentences):
    X,y = [],[]
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(dict(zip(untag_sentence(tagged),index)))
            y.append(tagged[index][1])    
    return X,y

## Splitting in training and testing datasets

In [19]:
cutoff = int(.75 * len(tagged_sentences))
train = tagged_sentences[:cutoff]
test = tagged_sentences[cutoff:]

In [38]:
X,y = generate_dataset(train)

TypeError: 'int' object is not iterable

In [26]:
print(len(X),len(y))

75784 75784


# Modelling

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

In [32]:
classifier = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

In [35]:
classifier.fit(X[:20000],y[:20000])

AttributeError: 'tuple' object has no attribute 'items'

In [29]:
X_text,y_test = generate_dataset(test)

# Model Prediction