In [139]:
import pandas as pd

try:
    # joblib is a standalone package; the copy vendored under
    # sklearn.externals was deprecated in scikit-learn 0.21 and removed in 0.23.
    import joblib
except ImportError:
    # Fallback for very old environments that only ship the vendored copy.
    from sklearn.externals import joblib

# CSV holding the tokens to tag; the path is relative to the working directory.
PATH = 'ner_test_data.csv'
# The file is read as ISO-8859-1 (Latin-1) rather than the UTF-8 default.
df = pd.read_csv(PATH, encoding="ISO-8859-1")

In [140]:
# Forward-fill missing cells with the last valid value seen above them.
# NOTE(review): presumably the CSV leaves 'Sentence #' blank on continuation
# rows of a sentence -- confirm against the raw file.
# `DataFrame.ffill()` is the supported spelling; `fillna(method='ffill')`
# is deprecated in pandas 2.1+ and emits a FutureWarning.
df = df.ffill()
df

Unnamed: 0,Sentence #,Word,POS
0,Sentence: 1,Wildfires,NNS
1,Sentence: 1,devastated,VBD
2,Sentence: 1,southeastern,NN
3,Sentence: 1,Australia,NNP
4,Sentence: 1,in,IN
5,Sentence: 1,the,DT
6,Sentence: 1,final,NN
7,Sentence: 1,months,NNS
8,Sentence: 1,of,IN
9,Sentence: 1,2019,LS


In [141]:
class SentenceGetter(object):
    """Group a one-token-per-row DataFrame into sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Sentence #', 'Word' and 'POS'.

    After construction:

    * ``grouped`` is indexed by the 'Sentence #' value and holds, for each
      sentence, a list of ``(word, pos)`` tuples.
    * ``sentences`` is the plain list of those per-sentence lists, in the
      order the sentences first appear in ``data``.
    """

    def __init__(self, data):
        self.n_sent = 1  # 1-based number of the sentence get_next() returns next
        self.data = data
        self.empty = False  # never updated in this class; kept for compatibility

        def agg_func(s):
            # Build the list of (word, POS) tuples for one sentence group.
            return list(zip(s['Word'].values.tolist(),
                            s['POS'].values.tolist()))

        # Group the rows of each sentence into its list of (word, POS) tuples.
        # sort=False keeps sentences in order of first appearance; the default
        # sorts the keys as strings ("Sentence: 10" before "Sentence: 2").
        # Selecting the two value columns keeps the grouping column out of
        # what agg_func receives.
        self.grouped = (
            self.data
            .groupby('Sentence #', sort=False)[['Word', 'POS']]
            .apply(agg_func)
        )
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence numbered ``n_sent`` and advance the counter.

        Returns None (without advancing) when no group is keyed
        'Sentence: <n_sent>'.
        """
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
        except KeyError:
            # Only a missing sentence key means "no more sentences"; any
            # other error is a real problem and should surface.
            return None
        self.n_sent += 1
        return s
# Group the token rows of `df` into sentences of (word, POS) tuples.
getter = SentenceGetter(df)
# Plain list of sentences, each a list of (word, POS) tuples.
sentences = getter.sentences
# Bare last expression so the notebook displays the result.
sentences

[[('Wildfires', 'NNS'),
  ('devastated', 'VBD'),
  ('southeastern', 'NN'),
  ('Australia', 'NNP'),
  ('in', 'IN'),
  ('the', 'DT'),
  ('final', 'NN'),
  ('months', 'NNS'),
  ('of', 'IN'),
  ('2019', 'LS'),
  ('and', 'CC'),
  ('in', 'IN'),
  ('January', 'NNP'),
  ('2020', 'LS'),
  ('.', '.')]]

In [142]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Every element of ``sent`` is indexed with ``[0]`` for the word and ``[1]``
    for its POS tag. The result describes the token itself, then its left
    neighbour (or ``'BOS': True`` when there is none), then its right
    neighbour (or ``'EOS': True`` when there is none).
    """
    current = sent[i]
    token = current[0]
    tag = current[1]

    # Features of the token itself.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context: flag the sentence start, otherwise describe the previous token.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        prev_tag = sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context: flag the sentence end, otherwise describe the next token.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        next_tag = sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats
def sent2features(sent):
    """Return the list of per-token feature dicts for one sentence."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features
def sent2labels(sent):
    """Return the label of every token in ``sent``.

    Each element of ``sent`` must unpack into exactly three items
    (token, POS tag, label); only the third is kept.
    """
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels
def sent2tokens(sent):
    """Return the token (first item) of every element in ``sent``.

    Accepts both ``(token, postag, label)`` triples and the unlabeled
    ``(token, postag)`` pairs built from a DataFrame that has no label column;
    anything after the first item is ignored.
    """
    # Star-unpacking tolerates any number of trailing fields, whereas a fixed
    # three-name unpack raises ValueError on (token, postag) pairs.
    return [token for token, *_rest in sent]

In [143]:
# One list of per-token feature dicts for every sentence.
X = list(map(sent2features, sentences))
X

[[{'bias': 1.0,
   'word.lower()': 'wildfires',
   'word[-3:]': 'res',
   'word[-2:]': 'es',
   'word.isupper()': False,
   'word.istitle()': True,
   'word.isdigit()': False,
   'postag': 'NNS',
   'postag[:2]': 'NN',
   'BOS': True,
   '+1:word.lower()': 'devastated',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'VBD',
   '+1:postag[:2]': 'VB'},
  {'bias': 1.0,
   'word.lower()': 'devastated',
   'word[-3:]': 'ted',
   'word[-2:]': 'ed',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'VBD',
   'postag[:2]': 'VB',
   '-1:word.lower()': 'wildfires',
   '-1:word.istitle()': True,
   '-1:word.isupper()': False,
   '-1:postag': 'NNS',
   '-1:postag[:2]': 'NN',
   '+1:word.lower()': 'southeastern',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'NN',
   '+1:postag[:2]': 'NN'},
  {'bias': 1.0,
   'word.lower()': 'southeastern',
   'word[-3:]': 'ern',
   'word[-2:]': 'rn',
   'wo

In [144]:
# Load the previously saved model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
crf = joblib.load("./models/crf_model.pkl")
# Predict labels for the feature lists in X.
y_pred = crf.predict(X)
# Bare last expression so the notebook displays the predictions.
y_pred

[['O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-tim',
  'I-tim',
  'O']]