## POS Tagging with a Hidden Markov Model

In [None]:
import nltk
# Load the CoNLL-2000 training split as a list of tagged sentences.
# Every non-blank line is split on single spaces; column 0 is the token and
# column 1 its POS tag (any further columns are ignored).
with open("Data/Conll2000/train.txt") as f:
    lines = f.read().splitlines()

words = []        # flat list of every (token, tag) pair, in file order
train_data = []   # one list of (token, tag) pairs per sentence
sentence = []     # pairs of the sentence currently being read
for line in lines:
    keylabel = line.split(' ')
    if len(keylabel[0]) > 0:
        pair = (keylabel[0], keylabel[1])
        words.append(pair)
        sentence.append(pair)
    elif sentence:
        # A blank line marks the end of a sentence in the CoNLL format.
        # Splitting here (rather than on the "." token) keeps sentences that
        # end in "?" or without a final period from running into the next one.
        train_data.append(sentence)
        sentence = []

# Keep the last sentence when the file does not end with a blank line.
if sentence:
    train_data.append(sentence)
    sentence = []

In [None]:
# Preview only the first few sentences; echoing the whole corpus floods the
# notebook output and bloats the saved file.
train_data[:3]

In [None]:
# Load the CoNLL-2000 test split as a list of tagged sentences.
# Every non-blank line is split on single spaces; column 0 is the token and
# column 1 its POS tag (any further columns are ignored).
with open("Data/Conll2000/test.txt") as f:
    lines = f.read().splitlines()

words = []       # flat list of every (token, tag) pair, in file order
test_data = []   # one list of (token, tag) pairs per sentence
sentence = []    # pairs of the sentence currently being read
for line in lines:
    keylabel = line.split(' ')
    if len(keylabel[0]) > 0:
        pair = (keylabel[0], keylabel[1])
        words.append(pair)
        sentence.append(pair)
    elif sentence:
        # A blank line marks the end of a sentence in the CoNLL format.
        # Splitting here (rather than on the "." token) keeps sentences that
        # end in "?" or without a final period from running into the next one.
        test_data.append(sentence)
        sentence = []

# Keep the last sentence when the file does not end with a blank line.
if sentence:
    test_data.append(sentence)
    sentence = []

In [None]:
# Import HMM module
from nltk.tag import hmm
# Create a trainer with its default (None) arguments, then fit it on the
# tagged training sentences. `tagger` is the trained model that the
# following cells call.
# NOTE(review): no estimator is passed to train_supervised, so NLTK's default
# is used — presumably unsmoothed maximum likelihood, which would cope badly
# with words never seen in training; confirm and consider a smoothed estimator.
hmm_pos = hmm.HiddenMarkovModelTrainer()
tagger = hmm_pos.train_supervised(train_data)

In [None]:
# Smoke test: tag a hand-written sentence, tokenised by splitting on whitespace.
print (tagger.tag("Today is a good day .".split()))

In [None]:
# Second hand-written example, this one built around proper names.
# NOTE(review): presumably these names do not occur in the training data, which
# would make this a check of unseen-word behaviour — confirm.
print (tagger.tag("Joe met Joanne in Delhi .".split()))

In [None]:
# Tag the first test sentence: discard the stored tags and pass only the tokens to the tagger.
tagger.tag([token for token, _gold_tag in test_data[0]])

In [None]:
# Reference (token, tag) pairs of the first test sentence, exactly as read from the test file.
test_data[0]

## NLP Architect and Neural-Network-Based POS Tagging

Reference http://nlp_architect.nervanasys.com

Installation Reference http://nlp_architect.nervanasys.com/installation.html

## Installation
```git clone https://github.com/NervanaSystems/nlp-architect.git```

```cd nlp-architect```

```pip install .```

## CoNLL-2000 Data Set
https://github.com/teropa/nlp/tree/master/resources/corpora/conll2000


## Neural Network Based Sequence Tagging 

http://nlp_architect.nervanasys.com/tagging/sequence_tagging.html

### Training
```python train.py --data_dir <path to CONLL2000 files>```

### Tagging
```python inference.py --model_name <model_name> --input <input_file>.txt```

e.g.

```python examples/chunker/inference.py --model_name 'chunker_model' --input_file ../Data/Conll2000/outsample.txt``` 

## POS Tagging with a Pre-trained Model

In [None]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords 
import nltk 

In [None]:
# Read the raw book text. The context manager closes the file even if
# reading raises, which the explicit open()/close() pair did not guarantee.
with open("Data/J. K. Rowling - Harry Potter 1 - Sorcerer's Stone", 'r') as file:
    raw_data_1 = file.read()

stop_words = set(stopwords.words('english'))

# Tokenise, lowercase each token once, and keep only tokens that are not
# English stop words and consist purely of letters (this drops punctuation
# and numbers). The stop-word test runs on the lowercased form.
word_tokens = [
    token
    for token in (raw.lower() for raw in wordpunct_tokenize(raw_data_1))
    if token not in stop_words and token.isalpha()
]

In [None]:
# Preview the first tokens only; echoing the whole book's token list floods the output.
word_tokens[:20]

In [None]:
# One-time setup, kept commented out on purpose: uncomment and run once if the
# tagger model (presumably the one behind nltk.pos_tag) is not installed yet.
#nltk.download('averaged_perceptron_tagger')

In [12]:
# Tag every remaining token with NLTK's pre-trained tagger; the result is a
# list of (word, tag) pairs.
# NOTE(review): word_tokens has already been lowercased and stripped of stop
# words and punctuation, so the tagger sees no capitalisation or sentence
# context — this likely explains odd tags in the output; consider tagging the
# raw tokens first and filtering afterwards.
harry_token_pos=nltk.pos_tag(word_tokens)

In [13]:
# Inspect the tagged tokens.
# NOTE(review): this echoes the full list; a slice such as harry_token_pos[:50]
# would keep the saved output short.
harry_token_pos

[('harry', 'NN'),
 ('potter', 'NN'),
 ('sorcerer', 'NN'),
 ('stone', 'NN'),
 ('chapter', 'NN'),
 ('one', 'CD'),
 ('boy', 'NN'),
 ('lived', 'VBD'),
 ('mr', 'JJ'),
 ('mrs', 'NN'),
 ('dursley', 'NN'),
 ('number', 'NN'),
 ('four', 'CD'),
 ('privet', 'NN'),
 ('drive', 'NN'),
 ('proud', 'NNS'),
 ('say', 'VBP'),
 ('perfectly', 'RB'),
 ('normal', 'JJ'),
 ('thank', 'NN'),
 ('much', 'JJ'),
 ('last', 'JJ'),
 ('people', 'NNS'),
 ('expect', 'VBP'),
 ('involved', 'VBN'),
 ('anything', 'NN'),
 ('strange', 'JJ'),
 ('mysterious', 'JJ'),
 ('hold', 'NN'),
 ('nonsense', 'NN'),
 ('mr', 'NN'),
 ('dursley', 'NN'),
 ('director', 'NN'),
 ('firm', 'NN'),
 ('called', 'VBD'),
 ('grunnings', 'NNS'),
 ('made', 'VBN'),
 ('drills', 'NNS'),
 ('big', 'JJ'),
 ('beefy', 'NN'),
 ('man', 'NN'),
 ('hardly', 'RB'),
 ('neck', 'CC'),
 ('although', 'IN'),
 ('large', 'JJ'),
 ('mustache', 'NN'),
 ('mrs', 'NN'),
 ('dursley', 'NN'),
 ('thin', 'JJ'),
 ('blonde', 'NN'),
 ('nearly', 'RB'),
 ('twice', 'RB'),
 ('usual', 'JJ'),
 ('amount

## Find all nouns and count their occurrences

In [14]:
import pandas as pd
import numpy as np

In [18]:
# Frequency table over every token: give each occurrence a weight of 1.0,
# sum the weights per distinct word, and show the words most frequent first.
AllHarryPotter = list(word_tokens)

WordTable = pd.DataFrame(
    {
        "Key": AllHarryPotter,
        "Occurence": np.ones(len(AllHarryPotter)),
    }
)
Wordcount = (
    WordTable
    .groupby(['Key'], sort=True)
    .Occurence
    .sum()
    .reset_index()
)
Wordcount.sort_values(by='Occurence', ascending=False)

Unnamed: 0,Key,Occurence
2217,harry,1327.0
3968,said,794.0
3901,ron,429.0
2167,hagrid,370.0
2277,hermione,270.0
224,back,261.0
3240,one,257.0
2611,know,212.0
2063,got,199.0
1007,could,198.0


In [None]:
# Print the number of tagged pairs and the number of input tokens, one per line.
print(len(harry_token_pos), len(word_tokens), sep="\n")

In [15]:
# Collect, in text order, every word tagged with one of the four noun tags
# (NN, NNP, NNS, NNPS). Repeated words are kept so they can be counted later.
NounsinHarryPotter = [
    token
    for token, tag in harry_token_pos
    if tag in ('NN', 'NNP', 'NNS', 'NNPS')
]

In [16]:
# Inspect the extracted nouns.
# NOTE(review): this echoes the full list; a slice such as
# NounsinHarryPotter[:50] would keep the saved output short.
NounsinHarryPotter

['harry',
 'potter',
 'sorcerer',
 'stone',
 'chapter',
 'boy',
 'mrs',
 'dursley',
 'number',
 'privet',
 'drive',
 'proud',
 'thank',
 'people',
 'anything',
 'hold',
 'nonsense',
 'mr',
 'dursley',
 'director',
 'firm',
 'grunnings',
 'drills',
 'beefy',
 'man',
 'mustache',
 'mrs',
 'dursley',
 'blonde',
 'amount',
 'neck',
 'spent',
 'time',
 'garden',
 'fences',
 'neighbors',
 'son',
 'opinion',
 'finer',
 'boy',
 'everything',
 'fear',
 'somebody',
 'anyone',
 'potters',
 'potter',
 'mrs',
 'dursley',
 'sister',
 'years',
 'fact',
 'dursley',
 'sister',
 'nothing',
 'husband',
 'dursleys',
 'neighbors',
 'potters',
 'street',
 'dursleys',
 'potters',
 'son',
 'boy',
 'reason',
 'potters',
 'child',
 'mr',
 'mrs',
 'dursley',
 'gray',
 'story',
 'nothing',
 'cloudy',
 'sky',
 'things',
 'country',
 'dursley',
 'tie',
 'work',
 'mrs',
 'dursley',
 'chair',
 'none',
 'tawny',
 'owl',
 'flutter',
 'window',
 'half',
 'mr',
 'dursley',
 'briefcase',
 'dursley',
 'cheek',
 'dudley',
 

In [17]:
# Frequency table over the nouns only: give each occurrence a weight of 1.0,
# sum the weights per distinct noun, and show the nouns most frequent first.
NounsTable = pd.DataFrame(
    {
        "Key": NounsinHarryPotter,
        "Occurence": np.ones(len(NounsinHarryPotter)),
    }
)

Nouncount = (
    NounsTable
    .groupby(['Key'], sort=True)
    .Occurence
    .sum()
    .reset_index()
)
Nouncount.sort_values(by='Occurence', ascending=False)

Unnamed: 0,Key,Occurence
1154,harry,943.0
2102,ron,367.0
1188,hermione,226.0
2322,snape,154.0
1949,professor,137.0
2346,something,132.0
1128,hagrid,120.0
2643,time,120.0
2781,vernon,106.0
842,eyes,101.0
