# PoS Implementation
Mohammad Amin Ghasvari Jahromi - 97521432

# Imports

In [2]:
import nltk
from nltk import word_tokenize
from nltk.corpus import treebank
import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

from random import shuffle
from string import punctuation

# Datasets download

In [3]:
nltk.download('treebank')
nltk.download('punkt')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Tagged sentences

I select one of these tagged sentences to see the structure.

In [4]:
tagged_sentences = list(treebank.tagged_sents())

# For exapmle
print(tagged_sentences[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


# Create Test and Train datasets
I shuffled the tagged sentences and consider the first %80 of this list for training and the rest of that for testing.

In [5]:
# Shuffling
shuffle(tagged_sentences)
length_of_tagged_sentences = len(tagged_sentences)

# Split the train and test datasets
train_data, test_data = tagged_sentences[:int(length_of_tagged_sentences * 0.8)], tagged_sentences[int(length_of_tagged_sentences * 0.8):]

# Show some statistics
print(f"All the data: {length_of_tagged_sentences}")
print(f"Train data: {len(train_data)}")
print(f"Test data: {len(test_data)}")

All the data: 3914
Train data: 3131
Test data: 783


I defined a function to create the X and Y by having tagged

In [6]:
def create_dataset(tagged_sentences):
  X, Y = [], []      
  for tagged in tagged_sentences:         
    untag_sen = [w for w, t in tagged]  
    for index in range(len(tagged)):
      X.append(features(untag_sen, index))
      Y.append(tagged[index][1])
  
  return X, Y  

# Features
It's time to define some features for our input dataset.

In [7]:
def features(sentence, index):
  return {
    'word': sentence[index], 
    'len': len(sentence[index]),

    # Position
    'is_first': index == 0, 
    'is_last': index == len(sentence) - 1,  
    
    # Other words
    'prev_word': '' if index == 0 else sentence[index - 1],
    'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],   
    
    # Suffix & Prefix  
    'prefix-1': sentence[index][0],       
    'prefix-2': sentence[index][:2],      
    'prefix-3': sentence[index][:3],       
    'suffix-1': sentence[index][-1],     
    'suffix-2': sentence[index][-2:],      
    'suffix-3': sentence[index][-3:],     

    # Type of characters
    'is_numeric': sentence[index].isdigit(),
    'is_punc': any([sentence[index] == p for p in punctuation]),

    # Captalization
    'is_capitalized': sentence[index][0].upper() == sentence[index][0],
    'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
    'is_all_caps': sentence[index].upper() == sentence[index],
    'is_all_lower': sentence[index].lower() == sentence[index]
  }  


# Training
First of all, let's create our dataset.

In [8]:
x_train, y_train = create_dataset(train_data)
# Print a sample
print(x_train[30])
print(y_train[30])

{'word': "'s", 'len': 2, 'is_first': False, 'is_last': False, 'prev_word': 'Japan', 'next_word': '.', 'prefix-1': "'", 'prefix-2': "'s", 'prefix-3': "'s", 'suffix-1': 's', 'suffix-2': "'s", 'suffix-3': "'s", 'is_numeric': False, 'is_punc': False, 'is_capitalized': True, 'capitals_inside': False, 'is_all_caps': False, 'is_all_lower': True}
POS


I used the Sklearn package to create a pipeline. I used the DictVectorizer which can vectorize the features for each sentence. DecisionTreeClassifier is the classifier that is going to train our data.

I used the first 20000 x's of my train dataset since I didn't have enough amount of memory to train all of the train datasets.

In [9]:
classifier = Pipeline([('vectorizer', DictVectorizer(sparse=False)),('classifier', DecisionTreeClassifier(criterion='entropy'))])
classifier.fit(x_train[:20000], y_train[:20000])

Pipeline(steps=[('vectorizer', DictVectorizer(sparse=False)),
                ('classifier', DecisionTreeClassifier(criterion='entropy'))])

# Test the model
I created the train dataset to test the provided model.

In [10]:
x_test, y_test = create_dataset(test_data)

print(f"Accuracy: %{classifier.score(x_test, y_test) * 100}")

Accuracy: %92.39497827136648


# Classify
Here is an example of the classification.

In [11]:
def pos_tag(sentence):
  tags = classifier.predict([features(sentence, index) for index in range(len(sentence))])
  return list(zip(sentence, tags)) 

print(pos_tag(word_tokenize('Hello everybody, My name is Amin.')))

[('Hello', 'NNP'), ('everybody', 'NN'), (',', ','), ('My', 'NNP'), ('name', 'NN'), ('is', 'VBZ'), ('Amin', 'NNP'), ('.', '.')]
