In [None]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 4.3MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [None]:
#get authorization from google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import pickle
import json

# Prepare data and features

In [None]:
#prepare training and test data
def prepare_data(path):
  """Load a labelled token CSV and group its rows into records.

  The CSV must provide the columns ``number``, ``pos`` and ``tag``.
  Consecutive rows that share the same ``number`` form one record; a new
  record starts whenever ``number`` differs from the previous row's value.

  Args:
    path: Location of the CSV file (anything ``pd.read_csv`` accepts).

  Returns:
    A list of records, each a list of ``(pos, tag)`` tuples in file order.
    A CSV with a header but no rows yields an empty list.
  """
  df = pd.read_csv(path, engine = "python")
  sents = []
  record = []
  cur = None
  # Walk the three columns in lockstep; this avoids building a Series per row
  # the way ``iterrows`` does.
  for number, pos, tag in zip(df['number'], df['pos'], df['tag']):
    # ``record`` is only empty before the first row, so an empty record is
    # never emitted.
    if record and number != cur:
      sents.append(record)
      record = []
    record.append((pos, tag))
    cur = number
  # Flush the last record (skipped when the file had no rows at all).
  if record:
    sents.append(record)

  return sents

In [None]:
# Input files holding the labelled training and validation records.
TRAIN_CSV = "1880_train.csv"
VALIDATION_CSV = "1880_validation.csv"

train_sents = prepare_data(TRAIN_CSV)
validation_sents = prepare_data(VALIDATION_CSV)

An example of training data

In [None]:
# Display the first training record: a list of (token, tag) tuples.
train_sents[0]

[('START', 'START'),
 ('Otersen', 'NC'),
 ('Casten', 'NC'),
 (',', 'D'),
 ('produce', 'OC'),
 (',', 'D'),
 ('h', 'PA'),
 ('149', 'AC'),
 ('Franklin', 'AC'),
 ('END', 'END')]

# Define CRF features

Feature Explanation

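is_junior_token: does it equal "jr" (case-insensitive)?

is_widow_token: does it equal "wid" or "widow" (case-insensitive)?

contains_digit: does it contain any digit?

is_delimiter: does it contain a delimiter character ("." or ",")?

is_start: is it the START marker at the beginning of a record?

is_end: is it the END marker at the end of a record?

is_lower: all lowercase letters?

is_title: is it title-cased (e.g. "Franklin")?

is_upper: all uppercase letters?

substr[-2:], substr[-1:]: the last two characters and the last character of the token.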


In [None]:
def is_junior_token(input):
    """Return True when the token is "jr", ignoring letter case."""
    return input.lower() == "jr"

def is_widow_token(input):
    """Return True when the token is "wid" or "widow", ignoring letter case."""
    return input.lower() in ("wid", "widow")

def contains_digit(input):
    """Return True when at least one character of the token is a digit."""
    return any(char.isdigit() for char in input)

def is_delimiter(input):
    """Return True when the token contains a '.' or ',' character anywhere."""
    return any(char in ('.', ',') for char in input)

def is_start(input):
    """Return True when the token is exactly the "START" record marker."""
    return input == "START"

def is_end(input):
    """Return True when the token is exactly the "END" record marker."""
    return input == "END"

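Notice that features are generated not only from the current word but also from the previous and next words. The first and last tokens of a record are the exception: they only receive a BOS or EOS marker.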

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of ``(token, label)`` pairs; only the token is read.
        i: Index of the token to describe.

    Returns:
        A dict of features for the token itself, plus either a ``BOS`` marker
        (first position), an ``EOS`` marker (last position), or features of
        the neighbouring tokens (any other position).
    """
    token = sent[i][0]

    features = {
        'bias': 1.0,
        'is_junior_token': is_junior_token(token),
        'is_widow_token': is_widow_token(token),
        'contains_digit': contains_digit(token),
        'is_delimiter': is_delimiter(token),
        'is_start': is_start(token),
        'is_end': is_end(token),
        'is_lower': token.islower(),
        'is_title': token.istitle(),
        'is_upper': token.isupper(),
        'substr[-2:]': token[-2:],
        'substr[-1:]': token[-1:]
    }

    # Boundary positions only get a marker; neighbour features are skipped.
    if i == 0:
        features['BOS'] = True
        return features
    if i == len(sent) - 1:
        features['EOS'] = True
        return features

    before = sent[i-1][0]
    after = sent[i+1][0]

    # Context from the previous token.
    features['prev_is_lower'] = before.islower()
    features['prev_is_title'] = before.istitle()
    features['prev_is_upper'] = before.isupper()
    features['prev_is_delimiter'] = is_delimiter(before)

    # Context from the next token.
    features['next_is_lower'] = after.islower()
    features['next_is_title'] = after.istitle()
    features['next_is_upper'] = after.isupper()
    features['next_contains_digit'] = contains_digit(after)
    features['next_is_end'] = is_end(after)

    return features


In [None]:
def sent2features(sent):
    """Return one feature dict per position of ``sent``, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

In [None]:
# Turn every record into a feature sequence (X) and a label sequence (y).
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))
X_validation = list(map(sent2features, validation_sents))
y_validation = list(map(sent2labels, validation_sents))

# Training

In [None]:
# Hyperparameters handed to the CRF constructor.
crf_params = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 500,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**crf_params)
# Last expression of the cell, so the value returned by ``fit`` is displayed.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=500,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

# Evaluation

Precision, recall and f1-score are used as evaluation metrics.

A detailed explanation on metrics: https://medium.com/analytics-vidhya/pos-tagging-using-conditional-random-fields-92077e5eaa31

In [None]:
# Predict tags for the validation records, then compute the weighted-average
# F1 over every label the fitted model knows.
y_pred = crf.predict(X_validation)
labels = list(crf.classes_)
metrics.flat_f1_score(
    y_validation,
    y_pred,
    labels=labels,
    average='weighted',
)

0.9906933631825323

Notice that evaluation metrics are applied to each category (AC, NC, PA etc.)

In [None]:
# Precision / recall / F1 broken down by label.
def _label_sort_key(name):
    # Order by everything after the first character, then by the first character.
    return (name[1:], name[0])

sorted_labels = sorted(labels, key=_label_sort_key)
report = metrics.flat_classification_report(
    y_validation, y_pred, labels=sorted_labels, digits=3
)
print(report)

              precision    recall  f1-score   support

           D      1.000     0.963     0.981        54
          PA      1.000     1.000     1.000        21
          AC      0.989     1.000     0.995        94
          NC      0.975     1.000     0.988        79
          OC      1.000     0.955     0.977        22
         END      1.000     1.000     1.000        27
       START      1.000     1.000     1.000        27

    accuracy                          0.991       324
   macro avg      0.995     0.988     0.991       324
weighted avg      0.991     0.991     0.991       324



# Apply model to the whole dataset

In [None]:
#import glob
#file_path = "/Users/prajwal/Desktop/Columbia/C4SR/hnyc_cd_processing-master/input/1880/nypl_1880_clean_records.txt"
#with open(file_path) as f:
  #data = f.readlines()  

In [None]:
#import glob
#file_path = "/Users/prajwal/Desktop/Columbia/C4SR/doubt/doubt5 filter11.txt"
#with open(file_path) as f:
  #data = f.readlines()  

In [None]:
import pandas as pd
#read_df = pd.read_excel('/Users/prajwal/Desktop/Columbia/C4SR/doubt/doubt6-filter11.xlsx')
#read_df = pd.read_excel('/Users/prajwal/Desktop/Columbia/C4SR/bk_1850/doubt-bk1850-v7.xlsx')
read_df = pd.read_excel('mn-1880-2.xlsx')
addresses = read_df['modified_clean_address'].tolist()

# Row position -> raw address value, kept under its original name for any
# cell that still inspects it.
read_dict = dict(enumerate(addresses))

# Prefix every address with its row position. A value that is not a string
# (for example an empty spreadsheet cell read as NaN) becomes '' -- the same
# result the earlier bare ``except`` produced, but without hiding unrelated
# errors. Building the list in one pass also avoids re-materialising the
# dict's keys and values on every iteration, and no per-row counter is
# printed.
read_list = [
    str(row_id) + ' ' + address if isinstance(address, str) else ''
    for row_id, address in enumerate(addresses)
]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
284991
284992
284993
284994
284995
284996
284997
284998
284999
285000
285001
285002
285003
285004
285005
285006
285007
285008
285009
285010
285011
285012
285013
285014
285015
285016
285017
285018
285019
285020
285021
285022
285023
285024
285025
285026
285027
285028
285029
285030
285031
285032
285033
285034
285035
285036
285037
285038
285039
285040
285041
285042
285043
285044
285045
285046
285047
285048
285049
285050
285051
285052
285053
285054
285055
285056
285057
285058
285059
285060
285061
285062
285063
285064
285065
285066
285067
285068
285069
285070
285071
285072
285073
285074
285075
285076
285077
285078
285079
285080
285081
285082
285083
285084
285085
285086
285087
285088
285089
285090
285091
285092
285093
285094
285095
285096
285097
285098
285099
285100
285101
285102
285103
285104
285105
285106
285107
285108
285109
285110
285111
285112
285113
285114
285115
285116
285117
285118
285119
285120
285121
285122
285123
2851

In [None]:
# Alias the prefixed address strings as `data`, the name the formatting cells iterate over.
data = read_list

In [None]:
# Number of records in `data`.
len(data)

289991

Format each record so that the model can be applied to it.

In [None]:
# Draft formatting pass: split each record on whitespace and move a trailing
# "." or "," of a word into its own token.
# NOTE(review): the following cell reassigns `formatted = []` and rebuilds it,
# so the list produced here is discarded -- consider removing this cell.
formatted = []

for record in data:
    ls = record.split()
    #if ls:
    #if ls[0][1:] == "***":
    # NOTE(review): `ls[1:]` is a list, so comparing it with the string "***"
    # is always False and the `continue` branch never runs.
    if ls[1:] == "***":
      continue
    else:
      record_split = []
      # The first whitespace token is skipped; only ls[1:] is tokenised.
      for word in ls[1:]:
        if word[-1] == "." or word[-1] == ",":
          record_split.append(word[:-1])
          record_split.append(word[-1])
        else:
          record_split.append(word)
    #formatted.append([ls[0], record_split])
    # Stores the full token list `ls` as the first element (the commented-out
    # line above stored only `ls[0]`).
    formatted.append([ls, record_split])


In [None]:
def _split_trailing_delimiter(words):
    """Move a trailing "." or "," of each word into its own token.

    A word that consists only of the delimiter is kept as that single
    delimiter token instead of being preceded by an empty-string token.
    """
    tokens = []
    for word in words:
        if word[-1] in (".", ","):
            if word[:-1]:
                tokens.append(word[:-1])
            tokens.append(word[-1])
        else:
            tokens.append(word)
    return tokens


# Build one [record id, tokens] entry per record in `data`.
formatted = []
for num1, record in enumerate(data):
    ls = record.split()
    if not ls:
        # Blank record: keep a placeholder (integer position, '' tokens) so
        # `formatted` stays aligned with `data`.
        formatted.append([num1, ''])
        continue
    # Skip records whose first token, minus its first character, is "***".
    if ls[0][1:] == "***":
        continue
    # ls[0] is the record id; the remaining words are the address tokens.
    formatted.append([ls[0], _split_trailing_delimiter(ls[1:])])

In [None]:
# Number of entries in `formatted`.
len(formatted)

289991

In [None]:
def predict_label(s):
  """Tag one tokenised record with the fitted ``crf`` model.

  The tokens are wrapped between the 'START' and 'END' marker tokens, each
  paired with a placeholder label of 0, converted to features, and passed to
  ``crf.predict`` as a batch containing that single record.

  Args:
    s: Iterable of token strings.

  Returns:
    The value returned by ``crf.predict`` for the one-record batch.
  """
  padded = [('START', 0)]
  padded.extend((token, 0) for token in s)
  padded.append(('END', 0))
  return crf.predict([sent2features(padded)])

In [None]:
# Tag every formatted record and store the predicted labels as its third element.
for entry in formatted:
  # Use a dedicated name: reusing `labels` would overwrite the label list that
  # the evaluation cells read.
  predicted = predict_label(entry[1])
  # Slice-assign instead of append so re-running this cell replaces the
  # earlier prediction rather than piling up duplicate label lists.
  entry[2:] = [predicted[0]]

In [None]:
# Preview only the first few entries; displaying the whole list floods the output.
formatted[:10]

[['0', ['h', '11', 'Eldridge'], ['START', 'PA', 'AC', 'AC', 'END']],
 ['1',
  ['97', 'William', ',', 'h', '.', '66', 'Ft', '', '.'],
  ['START', 'NC', 'NC', 'D', 'PA', 'AC', 'AC', 'AC', 'AC', 'AC', 'END']],
 ['2', ['Bâ€™klyn'], ['START', 'AC', 'END']],
 ['3', ['184', 'Forsyth'], ['START', 'AC', 'AC', 'END']],
 ['4', ['60', 'Eldridge'], ['START', 'AC', 'AC', 'END']],
 ['5',
  ['h', '214', 'E', '', '.', '13th'],
  ['START', 'PA', 'AC', 'AC', 'AC', 'AC', 'AC', 'END']],
 ['6',
  ['53', 'Walker', ',', 'h', '.', '70', 'W', '', '.', '48th'],
  ['START', 'NC', 'NC', 'D', 'PA', 'AC', 'AC', 'AC', 'AC', 'AC', 'AC', 'END']],
 ['7', ['h', '171', 'Attorney'], ['START', 'PA', 'AC', 'AC', 'END']],
 ['8', ['197', 'Grand'], ['START', 'AC', 'AC', 'END']],
 ['9',
  ['42', 'Clinton', ',', 'h', '.', '109', 'Hester'],
  ['START', 'NC', 'NC', 'D', 'PA', 'AC', 'AC', 'AC', 'END']],
 ['10',
  ['h', '352', 'Third', 'av', '', '.'],
  ['START', 'PA', 'AC', 'AC', 'AC', 'AC', 'AC', 'END']],
 ['11', ['h', '121', 'Colu

In [None]:
#pickle the list to save preliminary result
#with open("/Users/prajwal/Desktop/Columbia/C4SR/doubt/doubt6-filter11.pkl", "wb") as f:


#with open('/Users/prajwal/Desktop/Columbia/C4SR/mn_1880/not so confident-mn1880-v7.pkl', "wb") as f:
  #pickle.dump(formatted,f)
    
    
# Write the tagged records to a pickle file.
with open('mn-1880-3.pkl', "wb") as pickle_file:
  pickle.dump(formatted, pickle_file)

In [None]:
#pickle the list to save preliminary result
#with open("/Users/prajwal/Desktop/Columbia/C4SR/hnyc_cd_processing-master/input/1880/prediction.pkl", "wb") as f:
  #pickle.dump(formatted,f)