In [1]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 5.7MB/s 
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [2]:
# Mount Google Drive at /content/drive (this asks for authorization) so the
# "/content/drive/My Drive/..." data paths used below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import pickle
import json

# Prepare data and features

In [4]:
#prepare training and test data
def prepare_data(path):
  """Load a token-per-row CSV and group its rows into tagged records.

  The CSV must provide the columns ``number`` (record id), ``pos`` (token
  text) and ``tag`` (token label). Consecutive rows that share the same
  ``number`` form one record; a change of ``number`` starts a new record.

  Args:
    path: Location of the CSV file.

  Returns:
    A list of records, each a list of ``(token, tag)`` tuples in file order.
    A CSV without data rows yields an empty list.
  """
  df = pd.read_csv(path, engine = "python")
  sents = []
  record = []
  cur = None
  # Walk the three columns in lockstep; this avoids building a Series per row
  # as DataFrame.iterrows() does.
  for number, token, tag in zip(df['number'], df['pos'], df['tag']):
    # `record` is empty only before the first row, so the first row never
    # closes a (non-existent) previous record.
    if record and number != cur:
      sents.append(record)
      record = []
    record.append((token, tag))
    cur = number
  # Flush the last record; skipped when the file had no rows at all.
  if record:
    sents.append(record)

  return sents

In [5]:
# Load the labelled training and validation records from Google Drive.
# NOTE(review): the training file is "880_train.csv" while the validation file is
# "1880_validation.csv" — confirm the missing leading "1" is not a typo.
train_sents = prepare_data("/content/drive/My Drive/cd_processing/Data/1.working/input/1880/880_train.csv")
validation_sents = prepare_data("/content/drive/My Drive/cd_processing/Data/1.working/input/1880/1880_validation.csv")

An example of training data

In [None]:
# Display the first training record as a list of (token, tag) pairs.
train_sents[0]

[('START', 'START'),
 ('Otersen', 'NC'),
 ('Casten', 'NC'),
 (',', 'D'),
 ('produce', 'OC'),
 (',', 'D'),
 ('h', 'PA'),
 ('149', 'AC'),
 ('Franklin', 'AC'),
 ('END', 'END')]

# Define CRF features

Feature Explanation

is_junior_token: does it equal "jr"?

is_widow_token: does it equal "wid" or "widow" (case-insensitive)?

contains_digit: does it contain any digit?

is_delimiter: does it contain a delimiter character ("." or ",")?

is_start: start of record?

is_end: end of record?

is_lower: all lowercase letters?

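is_upper: all uppercase letters?

is_title: is it title-cased (first letter uppercase, the rest lowercase)?

substr[-2:], substr[-1:]: the last two characters and the last character of the token.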


In [6]:
def is_junior_token(input):
    """Return True when the token is "jr", compared case-insensitively."""
    return input.lower() == "jr"

def is_widow_token(input):
    """Return True when the token is "wid" or "widow", compared case-insensitively."""
    return input.lower() in ("wid", "widow")

def contains_digit(input):
    """Return True when at least one character of the token is a digit."""
    return any(ch.isdigit() for ch in input)

def is_delimiter(input):
    """Return True when the token contains a "." or a "," anywhere."""
    return any(ch in ('.', ',') for ch in input)

def is_start(input):
    """Return True when the token is the record-start sentinel "START"."""
    return input == "START"

def is_end(input):
    """Return True when the token is the record-end sentinel "END"."""
    return input == "END"

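Note that features are generated not only from the current word but also from the previous and next words. The first and last tokens of a record receive only a BOS or EOS marker instead of these context features.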

In [7]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of ``(token, label)`` pairs; only the token text is
    read. Every token gets the features of its own text. The first token is
    additionally marked ``BOS`` and the last one ``EOS``; every other token
    gets shape features of its left and right neighbours.
    """
    token = sent[i][0]

    features = {
        'bias': 1.0,
        'is_junior_token': is_junior_token(token),
        'is_widow_token': is_widow_token(token),
        'contains_digit': contains_digit(token),
        'is_delimiter': is_delimiter(token),
        'is_start': is_start(token),
        'is_end': is_end(token),
        'is_lower': token.islower(),
        'is_title': token.istitle(),
        'is_upper': token.isupper(),
        'substr[-2:]': token[-2:],
        'substr[-1:]': token[-1:]
    }

    # Boundary tokens carry a marker only; they get no neighbour features.
    if i == 0:
        features['BOS'] = True
        return features
    if i == len(sent) - 1:
        features['EOS'] = True
        return features

    before = sent[i-1][0]
    after = sent[i+1][0]
    features['prev_is_lower'] = before.islower()
    features['prev_is_title'] = before.istitle()
    features['prev_is_upper'] = before.isupper()
    features['prev_is_delimiter'] = is_delimiter(before)
    features['next_is_lower'] = after.islower()
    features['next_is_title'] = after.istitle()
    features['next_is_upper'] = after.isupper()
    features['next_contains_digit'] = contains_digit(after)
    features['next_is_end'] = is_end(after)

    return features


In [8]:
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair of ``sent``, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

In [9]:
# Turn every record into per-token feature dicts (X) and label sequences (y).
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))
X_validation = list(map(sent2features, validation_sents))
y_validation = list(map(sent2labels, validation_sents))

# Training

In [10]:
# Configure the CRF sequence tagger; parameter notes follow the sklearn-crfsuite
# documentation for CRF.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # train with the L-BFGS optimizer
    c1=0.1,  # coefficient for L1 regularization
    c2=0.1,  # coefficient for L2 regularization
    max_iterations=500,  # cap on optimizer iterations
    all_possible_transitions=True  # also generate transition features for label pairs not seen in training
)
# Fit on the training records only; the validation records are kept for evaluation.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=500,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

# Evaluation

Precision, recall and f1-score are used as evaluation metrics.

A detailed explanation on metrics: https://medium.com/analytics-vidhya/pos-tagging-using-conditional-random-fields-92077e5eaa31

In [11]:
# Labels known to the fitted model.
labels = list(crf.classes_)
y_pred = crf.predict(X_validation)
# F1 over the flattened token sequences of the validation set, combined across
# labels with average='weighted'.
# NOTE(review): `labels` includes the START/END sentinel tags, which are scored
# like any other label — consider excluding them if they inflate the score.
metrics.flat_f1_score(y_validation, y_pred,
                      average='weighted', labels=labels)

0.9906933631825323

Notice that evaluation metrics are applied to each category (AC, NC, PA etc.)

In [12]:
# Per-label precision / recall / F1 on the validation set.
# The sort key orders labels by everything after their first character, then by
# the first character; it only determines the row order of the report.
# NOTE(review): this key looks like it was written for prefixed tags such as
# "B-XXX"/"I-XXX"; for the tags used here a plain sorted(labels) may read better.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_validation, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           D      1.000     0.963     0.981        54
          PA      1.000     1.000     1.000        21
          AC      0.989     1.000     0.995        94
          NC      0.975     1.000     0.988        79
          OC      1.000     0.955     0.977        22
         END      1.000     1.000     1.000        27
       START      1.000     1.000     1.000        27

    accuracy                          0.991       324
   macro avg      0.995     0.988     0.991       324
weighted avg      0.991     0.991     0.991       324



# Apply model to the whole dataset

In [13]:
import glob
# Raw directory records; each line of the file is handled as one record below.
file_path = "/content/nypl_1880_clean_records.txt"
# Decode explicitly as UTF-8: the records contain non-ASCII characters (e.g. the
# curly apostrophe in "B’way"), and the platform default encoding is not
# guaranteed to be UTF-8 outside Colab.
with open(file_path, encoding="utf-8") as f:
  data = f.readlines()

Format record to apply model.

In [14]:
# Tokenise every raw line into [record_id, tokens], where record_id is the first
# whitespace-separated field and tokens are the remaining fields with a trailing
# "." or "," split off into a token of its own.
formatted = []
for record in data:
  ls = record.split()
  # Skip blank lines and lines whose first field reads "***" after its first
  # character.
  if not ls or ls[0][1:] == "***":
    continue
  record_split = []
  for word in ls[1:]:
    if word[-1] == "." or word[-1] == ",":
      # A bare "." or "," has no text before the delimiter; do not emit an
      # empty-string token for it.
      if word[:-1]:
        record_split.append(word[:-1])
      record_split.append(word[-1])
    else:
      record_split.append(word)
  formatted.append([ls[0], record_split])

In [15]:
def predict_label(s):
  """Predict the tag sequence of one tokenised record.

  The tokens of ``s`` are wrapped with the 'START' and 'END' sentinel tokens
  (each paired with a placeholder label 0) before feature extraction.

  Returns:
    The result of ``crf.predict`` for a batch holding this single record, so
    the tags of the record (sentinels included) are at index 0.
  """
  padded = [('START', 0)] + [(word, 0) for word in s] + [('END', 0)]
  return crf.predict([sent2features(padded)])

In [16]:
# Tag every record and store the predicted tags as its third element:
# [record_id, tokens, tags]. A dedicated variable name is used so the
# evaluation label list `labels` is not overwritten by this loop.
for record in formatted:
  predicted = predict_label(record[1])
  # Slice assignment instead of append: re-running this cell replaces the
  # stored tags rather than appending a second copy.
  record[2:] = [predicted[0]]

In [33]:
# Checkpoint the [record_id, tokens, tags] list to Google Drive so the
# post-processing below can be resumed without re-running the prediction.
with open("/content/drive/My Drive/cd_processing/Data/1.working/input/1880/prediction.pkl", "wb") as f:
  pickle.dump(formatted,f)

# Save Prediction into JSON format

In [34]:
# Reload the pickled predictions written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("/content/drive/My Drive/cd_processing/Data/1.working/input/1880/prediction.pkl", 'rb') as f:
  d = pickle.load(f)

In [19]:
# Group the tokens of every record into delimiter-separated segments keyed by tag.
#
# Each record is [record_id, tokens, tags]. `tags` carries one extra entry at
# each end for the START/END sentinels, so tags[i] belongs to tokens[i-1] for
# i in 1..n. A segment runs from `start` up to the token before a "D"
# (delimiter) tag and is filed under the tag of its last token.
output = []
for record in d:
  tokens = record[1]
  tags = record[2]
  d_record = {}
  d_record["index"] = int(record[0])
  n = len(tokens)
  start = 1
  for i in range(1, n + 1):
    if i <  n:
      #skip delimiters
      if tags[i] == "D":
        start = i+1
        continue
      elif tags[i+1] == "D":
        # The next token is a delimiter: close the running segment here.
        d_record.setdefault(tags[i], []).append(tokens[(start-1):i])
        start = i + 2
    else:
      # Last token: everything still open becomes the final segment.
      # NOTE(review): if the last token itself is tagged "D", this segment is
      # filed under the key "D" — confirm that is intended.
      d_record.setdefault(tags[i], []).append(tokens[(start-1):])

  output.append(d_record)


# Generate primary fields

Here 4 primary fields are generated to facilitate the generation of the final output: Occupation, Name, Marriage_Status and Address.

In [28]:
def _occupation_field(segments):
  """Join all occupation segments into one space-separated string."""
  return ' '.join(' '.join(segment) for segment in segments)


def _marriage_status(segment):
  """Return the widow status encoded by a name segment, or None.

  Only a segment that starts with "wid"/"widow" (case-insensitive) counts.
  The abbreviation period after the marker is dropped; any remaining tokens
  are reported as the deceased spouse ("widow of ...").
  """
  if not segment or segment[0].lower() not in ('wid', 'widow'):
    return None
  spouse = segment[1:]
  while spouse and spouse[0] == '.':
    spouse = spouse[1:]
  if spouse:
    return 'widow of ' + ' '.join(spouse)
  return 'widow'


def _address_field(segments):
  """Convert address segments into [kind, text] pairs.

  A segment starting with "h"/"h." is kind 'h' and loses that marker. Any
  other segment is 'assume_h' when it is the only address of the record and
  'w' when the record has several addresses.
  """
  fallback_kind = 'assume_h' if len(segments) == 1 else 'w'
  entries = []
  for segment in segments:
    if segment and segment[0] in ('h', 'h.'):
      entries.append(['h', ' '.join(segment[1:])])
    else:
      entries.append([fallback_kind, ' '.join(segment)])
  return entries


# Build the primary fields of every record, keyed by the record index as a string.
# A field is only present when the record has a segment with the matching tag
# (OC = occupation, NC = name, AC = address).
final = {}
for record in output:
  new = {}
  if 'OC' in record:
    new['Occupation'] = _occupation_field(record['OC'])

  if 'NC' in record:
    name = record['NC']
    new['Name'] = ' '.join(name[0])
    # A second name segment may carry the widow marker; anything else is not
    # treated as a marriage status.
    if len(name) > 1:
      status = _marriage_status(name[1])
      if status is not None:
        new['Marriage_Status'] = status

  if 'AC' in record:
    new['Address'] = _address_field(record['AC'])

  final[str(record['index'])] = new

In [29]:
# Write the primary fields to Google Drive as JSON. The with-block closes the
# file on exit, so no explicit close() is needed.
with open('/content/drive/My Drive/cd_processing/Data/1.working/input/1880/result.json', 'w') as f:
    json.dump(final,f)

In [30]:
# Read the JSON result back to check it. The with-block closes the file on
# exit, so no explicit close() is needed.
with open('/content/drive/My Drive/cd_processing/Data/1.working/input/1880/result.json', 'r') as f:
    d = json.load(f)

In [24]:
# Record ids noted for manual inspection: 11, 115, 450, 711, 1044, 1570
d['1044']

{'Address': [['w', '23 First'], ['h', '104 Seventh']],
 'Name': 'Zacpal John jr',
 'Occupation': 'tailor'}

# Some examples

In [31]:
# Print 10 consecutive records: the tokenised input line followed by its parsed fields.
# The record with id k is read from position k-1 of `formatted`.
for record_id in range(3100, 3110):
  tokens = formatted[record_id - 1][1]
  parsed = final[str(record_id)]
  print(' '.join(tokens))
  print(parsed)
  print('\n')

Woodruff Henry , lawyer , 111 B’way
{'Occupation': 'lawyer', 'Name': 'Woodruff Henry', 'Address': [['assume_h', '111 B’way']]}


Woodruff Henry C . h 70 W . Washn , pl .
{'Name': 'Woodruff Henry C', 'Address': [['h', '70 W . Washn , pl .']]}


Woodruff Henry K . W . police , h 1025 Third av .
{'Occupation': 'police', 'Name': 'Woodruff Henry K . W', 'Address': [['h', '1025 Third av .']]}


Woodruff I . B . sec . 6 Murray , h Ct .
{'Occupation': 'sec', 'Name': 'Woodruff I . B', 'Address': [['w', '6 Murray'], ['h', 'Ct .']]}


Woodruff Isaac D . clerk , h 409 W . 43d
{'Occupation': 'clerk', 'Name': 'Woodruff Isaac D', 'Address': [['h', '409 W . 43d']]}


Woodruff Isaac O . bluing , 18 College pl .
{'Occupation': 'bluing', 'Name': 'Woodruff Isaac O', 'Address': [['assume_h', '18 College pl .']]}


Woodruff J . wid . Edward , h 148 W . 46th
{'Name': 'Woodruff J . wid . Edward', 'Address': [['h', '148 W . 46th']]}


Woodruff Job , clerk , h 39 W . 43d
{'Occupation': 'clerk', 'Name': 'Woodruf

In [32]:
# Print 5 hand-picked challenging records: the tokenised input line followed by its parsed fields.
index = [11, 4123, 219, 1515, 4597]
for record_id in index:
  tokens = formatted[record_id - 1][1]
  parsed = final[str(record_id)]
  print(' '.join(tokens))
  print(parsed)
  print('\n')

Zollinger Caroline , wid . William , h 241 W . 53d
{'Name': 'Zollinger Caroline', 'Marriage_Status': 'widow of William', 'Address': [['h', '241 W . 53d']]}


Wolf Margaret , widow , hr 545 Pearl
{'Name': 'Wolf Margaret', 'Marriage_Status': 'widow', 'Address': [['assume_h', 'hr 545 Pearl']]}


Zinsser William & Co . varnishes , 197 William & 507 W . 58th
{'Occupation': 'varnishes', 'Name': 'Zinsser William & Co', 'Address': [['assume_h', '197 William & 507 W . 58th']]}


Yonge George , agent , 409 B’way , h 479 Willoughby av . B’klyn
{'Occupation': 'agent', 'Name': 'Yonge George', 'Address': [['w', '409 B’way'], ['h', '479 Willoughby av . B’klyn']]}


Witte Francis , conductor , h 817 Tenth av .
{'Occupation': 'conductor', 'Name': 'Witte Francis', 'Address': [['h', '817 Tenth av .']]}


