In [54]:
!pip install sklearn-crfsuite



In [55]:
#get authorization from google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [56]:
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import scipy
import pickle
import json

### Prepare data and features

In [57]:
# Prepare training and test data
def prepare_data(path):
    """Load a token-level CSV and group its rows into tagged records.

    The CSV is expected to provide the columns ``number`` (record id),
    ``pos`` (the token text) and ``tag`` (the token label). Consecutive rows
    that share the same ``number`` are collected into one record.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A list of records in file order, each record being a list of
        ``(token, tag)`` tuples. Returns an empty list when the CSV has no
        data rows.
    """
    df = pd.read_csv(path, engine="python")

    sents = []
    record = []
    cur = None
    # Walk the three columns in lockstep; this avoids the per-row Series that
    # ``iterrows`` builds and also works when the frame is empty.
    for number, token, tag in zip(df['number'], df['pos'], df['tag']):
        # A change of record id closes the record collected so far.
        if record and number != cur:
            sents.append(record)
            record = []
        record.append((token, tag))
        cur = number

    # Flush the last record (skipped for an empty file, so no [] is emitted).
    if record:
        sents.append(record)

    return sents

In [58]:
!ls "input/1880"

1880_train.csv                  Final_Output.ipynb
1880_validation.csv             final_output.json
CRF.ipynb                       prediction.pkl
City_Directory_Formatting.ipynb result.json


In [59]:
# Parse both labelled splits with the same loader.
train_sents, validation_sents = map(
    prepare_data,
    ("input/1880/1880_train.csv", "input/1880/1880_validation.csv"),
)

### Show an example of training data

In [60]:
# Show the first training record: a list of (token, tag) tuples.
train_sents[0]

[('START', 'START'),
 ('Otersen', 'NC'),
 ('Casten', 'NC'),
 (',', 'D'),
 ('produce', 'OC'),
 (',', 'D'),
 ('h', 'PA'),
 ('149', 'AC'),
 ('Franklin', 'AC'),
 ('END', 'END')]

### Define CRF features

Feature Explanation

is_junior_token: does it equal "jr" (case-insensitive)?

is_widow_token: does it equal "wid" or "widow" (case-insensitive)?

contains_digit: does it contain any digit?

is_delimiter: does it contain a delimiter character ("." or ",")?

is_start: start of record?

is_end: end of record?

is_lower: all lowercase letters?

is_upper: all uppercase letters?


In [61]:
def is_junior_token(input):
    """Return True when the token equals "jr", ignoring letter case."""
    return input.lower() == "jr"

def is_widow_token(input):
    """Return True when the token equals "wid" or "widow", ignoring letter case."""
    return input.lower() in ("wid", "widow")

def contains_digit(input):
    """Return True when at least one character of the token is a digit."""
    return any(ch.isdigit() for ch in input)

def is_delimiter(input):
    """Return True when the token contains a "." or "," character anywhere."""
    return any(ch in ('.', ',') for ch in input)

def is_start(input):
    """Return True when the token is the record-start marker "START"."""
    return input == "START"

def is_end(input):
    """Return True when the token is the record-end marker "END"."""
    return input == "END"

### Note that features are generated not only from the current word but also from the previous and next words (for tokens that are neither first nor last in a record).

In [62]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of pairs whose first element is the token text.
    Every token gets features describing itself. The first token is marked
    with ``BOS`` and the last with ``EOS``; every other token additionally
    gets features describing its previous and next neighbours.
    """
    token = sent[i][0]

    # Features derived from the token itself.
    feats = {'bias': 1.0}
    feats['is_junior_token'] = is_junior_token(token)
    feats['is_widow_token'] = is_widow_token(token)
    feats['contains_digit'] = contains_digit(token)
    feats['is_delimiter'] = is_delimiter(token)
    feats['is_start'] = is_start(token)
    feats['is_end'] = is_end(token)
    feats['is_lower'] = token.islower()
    feats['is_title'] = token.istitle()
    feats['is_upper'] = token.isupper()
    feats['substr[-2:]'] = token[-2:]
    feats['substr[-1:]'] = token[-1:]

    # Boundary tokens carry a position marker and no neighbour features.
    if i == 0:
        feats['BOS'] = True
        return feats
    if i == len(sent) - 1:
        feats['EOS'] = True
        return feats

    # Interior tokens: describe the neighbours on both sides.
    before = sent[i - 1][0]
    after = sent[i + 1][0]
    feats['prev_is_lower'] = before.islower()
    feats['prev_is_title'] = before.istitle()
    feats['prev_is_upper'] = before.isupper()
    feats['prev_is_delimiter'] = is_delimiter(before)
    feats['next_is_lower'] = after.islower()
    feats['next_is_title'] = after.istitle()
    feats['next_is_upper'] = after.isupper()
    feats['next_contains_digit'] = contains_digit(after)
    feats['next_is_end'] = is_end(after)
    return feats


In [63]:
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the label of every ``(word, label)`` pair in ``sent``, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

In [64]:
# Turn every record into its per-token feature dicts (X) and label list (y).
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))
X_validation = list(map(sent2features, validation_sents))
y_validation = list(map(sent2labels, validation_sents))

### Training

In [65]:
# Train a CRF sequence tagger on the per-token feature dicts built above.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # L-BFGS training algorithm
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=500,  # upper bound on optimizer iterations
    all_possible_transitions=True  # also model label transitions absent from the training data
)
# The fitted estimator is the cell's last expression, so its repr is displayed.
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=500,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

### Evaluation

Precision, recall and f1-score are used as evaluation metrics.

A detailed explanation on metrics: https://medium.com/analytics-vidhya/pos-tagging-using-conditional-random-fields-92077e5eaa31

In [66]:
# Labels the trained model knows about; reused by the per-label report below.
labels = list(crf.classes_)
# Predict one label sequence per validation record.
y_pred = crf.predict(X_validation)
# Token-level F1 over all records, averaged across labels weighted by support.
metrics.flat_f1_score(y_validation, y_pred,
                      average='weighted', labels=labels)

0.9906933631825323

### Notice that evaluation metrics are applied to each category (AC, NC, PA etc.)

In [67]:
# Metrics by label: precision, recall, F1 and support for each tag.
# The sort key orders labels by everything after their first character,
# then by the first character itself (e.g. "D" -> ("", "D"), "PA" -> ("A", "P")).
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
# flat_classification_report returns text, so print it to keep the table layout.
print(metrics.flat_classification_report(
    y_validation, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           D      1.000     0.963     0.981        54
          PA      1.000     1.000     1.000        21
          AC      0.989     1.000     0.995        94
          NC      0.975     1.000     0.988        79
          OC      1.000     0.955     0.977        22
         END      1.000     1.000     1.000        27
       START      1.000     1.000     1.000        27

    accuracy                          0.991       324
   macro avg      0.995     0.988     0.991       324
weighted avg      0.991     0.991     0.991       324



### Apply model to the whole dataset

In [68]:
import glob  # NOTE(review): not used by any visible cell; kept in case another cell relies on it
file_path = "input/nypl_1880_81_starred_clean.txt"

# Read the raw directory text, one string per line (newlines kept).
# The text contains non-ASCII characters (e.g. the ’ in "B’way"), so the
# encoding is stated explicitly instead of depending on the platform default.
# NOTE(review): UTF-8 is assumed from the correctly rendered outputs — confirm.
with open(file_path, encoding="utf-8") as f:
    data = f.readlines()

### Format record to apply model.

In [69]:
# Turn each raw line into [record_id, tokens]. The first whitespace-separated
# field is the record id; the remaining fields are the record's words.
formatted = []
for line in data:
    fields = line.split()
    # Skip blank lines and lines whose first field is one character followed
    # by "***".
    if not fields or fields[0][1:] == "***":
        continue

    record_split = []
    for word in fields[1:]:
        if word[-1] in ".,":
            # Split a trailing "." or "," into its own token. A word that is
            # only the delimiter has no stem, so no empty-string token is
            # emitted for it.
            stem = word[:-1]
            if stem:
                record_split.append(stem)
            record_split.append(word[-1])
        else:
            record_split.append(word)
    formatted.append([fields[0], record_split])

In [70]:
def predict_label(s):
    """Predict CRF labels for one record given as a list of word strings.

    The words are wrapped in ``START`` / ``END`` marker tokens (each paired
    with a dummy label of 0) before feature extraction.

    Returns:
        The result of ``crf.predict`` for a one-record batch, i.e. a list
        holding a single label sequence that includes the labels of the two
        marker tokens.
    """
    padded = [('START', 0), *((word, 0) for word in s), ('END', 0)]
    return crf.predict([sent2features(padded)])

In [71]:
# Attach the predicted label sequence to every formatted record, giving
# [record_id, tokens, predicted_labels].
for entry in formatted:
    # A separate name is used so the global `labels` (the model's class list
    # used by the evaluation cells) is not overwritten.
    predicted = predict_label(entry[1])
    # predict_label returns a one-record batch; keep its only sequence.
    # Slice assignment replaces a prediction left by an earlier run of this
    # cell, so re-running does not stack a second label list on the record.
    entry[2:] = [predicted[0]]

In [72]:
# Pickle the list to save the preliminary result
# (each entry is [record_id, tokens, predicted_labels]).
with open("input/1880/prediction.pkl", "wb") as f:
  pickle.dump(formatted,f)

### Save Prediction into JSON format

In [73]:
# Retrieve the predictions saved by the previous cell.
# pickle.load can execute arbitrary code, so only load files this notebook wrote.
with open("input/1880/prediction.pkl", 'rb') as f:
  d = pickle.load(f)

In [74]:
def _record_to_fields(record):
    """Group one predicted record's tokens into labelled segments.

    Args:
        record: ``[record_id, tokens, tags]`` where ``tags`` holds one tag per
            token plus the leading START and trailing END tags, so ``tags[i]``
            belongs to ``tokens[i - 1]``.

    Returns:
        A dict with ``"index"`` (the record id as int) and, for each tag that
        closes a segment, a list of token lists. A segment is a run of tokens
        between tokens tagged ``"D"`` (delimiter); it is stored under the tag
        of its last token. Delimiter tokens themselves are never stored.

    Raises:
        ValueError: if the record id cannot be converted to ``int``.
    """
    tokens, tags = record[1], record[2]
    fields = {"index": int(record[0])}
    n = len(tokens)
    start = 1  # 1-based position of the first token of the open segment
    for i in range(1, n + 1):
        tag = tags[i]
        # Skip delimiters, including one in the last position (it used to
        # create a spurious "D" field).
        if tag == "D":
            start = i + 1
            continue
        # Close the segment at the last token or just before a delimiter.
        if i == n or tags[i + 1] == "D":
            fields.setdefault(tag, []).append(tokens[start - 1:i])
            start = i + 2
    return fields


output = [_record_to_fields(record) for record in d]


In [75]:
# Preview only the first few grouped records instead of dumping the whole list.
output[:10]

[{'index': 1,
  'NC': [['Zoller', 'Ignatz']],
  'OC': [['shoes']],
  'AC': [['106', 'W', '.', '24th']]},
 {'index': 2,
  'NC': [['Zoller', 'John']],
  'OC': [['tailor']],
  'AC': [['221', 'Ninth', 'av', '.']]},
 {'index': 3,
  'NC': [['Zoller', 'Robert', 'W']],
  'OC': [['birds']],
  'AC': [['5', 'N', '.', 'William']]},
 {'index': 4,
  'NC': [['Zoller', 'Stephen']],
  'OC': [['molder']],
  'AC': [['h', 'r', '504', 'W', '.', '55th']]},
 {'index': 5,
  'NC': [['Zollfrai', 'Sarah']],
  'OC': [['nurse']],
  'AC': [['h', '934', 'First', 'av', '.']]},
 {'index': 6,
  'NC': [['Zollfroi', 'Abraham']],
  'OC': [['pedlar']],
  'AC': [['h', '202', 'Seventh']]},
 {'index': 7,
  'NC': [['Zollfrey', 'Moses']],
  'OC': [['cutter']],
  'AC': [['h', '208', 'E', '.', '49th']]},
 {'index': 8,
  'NC': [['Zollie', 'Louis']],
  'AC': [['h', 'r', '10', 'Dominick']]},
 {'index': 9,
  'NC': [['Zollikoffer', 'Oscar']],
  'OC': [['pres']],
  'AC': [['1545)', 'B’way'], ['h', '210', 'W', '.', '46th']]},
 {'index':

### Generate primary fields

Here, four primary fields are generated to facilitate the generation of the final output: Occupation, Name, Marriage_Status and Address.

In [76]:
def _marriage_status(segment):
    """Return the marriage status encoded by a secondary name segment, or None.

    ``segment`` is a non-empty token list such as ``['wid', '.', 'William']``.
    Segments that do not start with a widow marker ("wid" or "widow", any
    case) yield None, so other name segments (e.g. "jr") are no longer
    reported as widows.
    """
    marker = segment[0]
    if marker == 'widow':
        return 'widow'
    if marker.lower() not in ('wid', 'widow'):
        return None
    spouse = segment[1:]
    # Drop the delimiter that follows the abbreviation ("wid ." -> "wid").
    if spouse and spouse[0] in ('.', ','):
        spouse = spouse[1:]
    return 'widow of ' + ' '.join(spouse) if spouse else 'widow'


def _primary_fields(record):
    """Build the Occupation / Name / Marriage_Status / Address fields of a record.

    ``record`` is a dict of labelled segments with optional keys ``'OC'``,
    ``'NC'`` and ``'AC'`` (each a list of token lists). Fields whose source
    key is missing are left out of the result.
    """
    new = {}

    occupation = record.get('OC')
    if occupation is not None:
        new['Occupation'] = ' '.join(' '.join(oc) for oc in occupation)

    name = record.get('NC')
    if name is not None:
        new['Name'] = ' '.join(name[0])
        # Deal with widow marriage status
        if len(name) > 1:
            status = _marriage_status(name[1])
            if status is not None:
                new['Marriage_Status'] = status

    address = record.get('AC')
    if address is not None:
        # An address starting with "h" is tagged 'h'. Any other address is
        # tagged 'assume_h' when it is the record's only address and 'w'
        # when the record has several.
        other_kind = 'assume_h' if len(address) == 1 else 'w'
        new['Address'] = [
            ['h', ' '.join(ad[1:])] if ad[0] == 'h' else [other_kind, ' '.join(ad)]
            for ad in address
        ]

    return new


# Map each record id (as a string) to its primary fields. Missing keys are
# handled explicitly instead of with bare `except: pass`, so unexpected errors
# are no longer hidden.
final = {}
for record in output:
    final[str(record['index'])] = _primary_fields(record)

In [77]:
# Save the primary fields as JSON. The `with` block closes the file on exit,
# so no explicit f.close() is needed.
with open('input/1880/result.json', 'w') as f:
    json.dump(final, f)

In [78]:
# Reload the saved result to check the round trip. The `with` block closes
# the file on exit, so no explicit f.close() is needed.
with open('input/1880/result.json', 'r') as f:
    d = json.load(f)

In [79]:
# NOTE(review): the string below is a bare expression used as a scratch note;
# presumably it lists record ids worth inspecting by hand — confirm.
'''
11
115
450
711
1044
1570
'''
# Display the primary fields stored for record 1570.
d['1570']

{'Address': [['assume_h', 'Y*oung Alfred (Rev.) h W . 59th n Ninth av .']]}

### Some examples

In [80]:
# 10 alphabetical examples: the record's tokens followed by its primary fields.
# NOTE(review): assumes record id k sits at position k - 1 of `formatted` — confirm.
for record_no in range(3100, 3110):
    print(' '.join(formatted[record_no - 1][1]), final[str(record_no)], '\n', sep='\n')

Woodruff Job , clerk , h 39 W . 43d
{'Occupation': 'clerk', 'Name': 'Woodruff Job', 'Address': [['h', '39 W . 43d']]}


Woodruff John , carpenter , h 2486 Second av .
{'Occupation': 'carpenter', 'Name': 'Woodruff John', 'Address': [['h', '2486 Second av .']]}


Woodruff Lucius L . stamps , 90 Nassau , h Col .
{'Occupation': 'stamps', 'Name': 'Woodruff Lucius L', 'Address': [['w', '90 Nassau'], ['h', 'Col .']]}


Woodruff Mahlon J . manager , 45 Chambers , h 310 Washn . av . B'klyn
{'Occupation': 'manager', 'Name': 'Woodruff Mahlon J', 'Address': [['w', '45 Chambers'], ['h', "310 Washn . av . B'klyn"]]}


Woodruff Marcus P . mer . 31 B’way , h Larchmont
{'Occupation': 'mer', 'Name': 'Woodruff Marcus P', 'Address': [['w', '31 B’way'], ['h', 'Larchmont']]}


Woodruff Margaret , wid . David , h 653 Hudson
{'Name': 'Woodruff Margaret', 'Marriage_Status': 'widow of David', 'Address': [['h', '653 Hudson']]}


Woodruff Morris , tea . 93 Front , h 27 E . 22d
{'Occupation': 'tea', 'Name': 'Woodr

In [81]:
# 5 challenging examples: the record's tokens followed by its primary fields.
# NOTE(review): assumes record id k sits at position k - 1 of `formatted` — confirm.
index = [11, 4123, 219, 1515, 4597]
for record_no in index:
    print(' '.join(formatted[record_no - 1][1]), final[str(record_no)], '\n', sep='\n')

Zollinger Caroline , wid . William , h 241 W . 53d
{'Name': 'Zollinger Caroline', 'Marriage_Status': 'widow of William', 'Address': [['h', '241 W . 53d']]}


Wolf Mary , wid . William , drygds . 158 E . 4th
{'Occupation': 'drygds', 'Name': 'Wolf Mary', 'Marriage_Status': 'widow of William', 'Address': [['assume_h', '158 E . 4th']]}


Zinsser William & Co . varnishes , 197 William & 507 W . 58th
{'Occupation': 'varnishes', 'Name': 'Zinsser William & Co', 'Address': [['assume_h', '197 William & 507 W . 58th']]}


Yonge George , agent , 409 B’way , h 479 Willoughby av . B’klyn
{'Occupation': 'agent', 'Name': 'Yonge George', 'Address': [['w', '409 B’way'], ['h', '479 Willoughby av . B’klyn']]}


Witte John G . & Brother , importers of hardware cutlery and needles , 75 Chambers
{'Occupation': 'importers of hardware cutlery and needles', 'Name': 'Witte John G . & Brother', 'Address': [['assume_h', '75 Chambers']]}


