In [6]:
!pip install sklearn-crfsuite



In [7]:
#get authorization from google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [8]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import pickle
import json

#Prepare data and features

In [9]:
#prepare training and test data
def prepare_data(path):
  df = pd.read_csv(path, engine = "python")
  sents = []
  record = []
  cur = df['number'][0]
  for index,row in df.iterrows():
    if row['number'] != cur:
      sents.append(record)
      record = [(row['pos'], row['tag'])]
    else:
      record.append((row['pos'], row['tag']))
    cur = row['number']
  sents.append(record)

  return sents

In [10]:
train_sents = prepare_data("1880_train.csv")
validation_sents = prepare_data("1880_validation.csv")

An example of training data

In [11]:
train_sents[0]

[('START', 'START'),
 ('Otersen', 'NC'),
 ('Casten', 'NC'),
 (',', 'D'),
 ('produce', 'OC'),
 (',', 'D'),
 ('h', 'PA'),
 ('149', 'AC'),
 ('Franklin', 'AC'),
 ('END', 'END')]

#Define CRF features

Feature Explanation

is_junior_token: does it equal "jr"?

is_widow_token: does it equal "widow"?

contains_digit: does it contain any number?

is_delimiter: is it a delimiter?

is_start: start of record?

is_end: end of record?

is_lower: all lowercase letters?

is_upper: all uppercase letters?


In [12]:
def is_junior_token(input):
        dc = input.lower()
        if dc == "jr":
            return True
        return False

def is_widow_token(input):
        dc = input.lower()
        if dc == "wid" or dc == "widow":
            return True
        return False

def contains_digit(input):
        for c in input:
            if c.isdigit():
                return True
        return False

def is_delimiter(input):
        for c in input:
            if c == '.' or c == ',':
                return True
        return False

def is_start(input):
        if input == "START":
            return True
        return False

def is_end(input):
        if input == "END":
            return True
        return False

Notice that features are not only generated on the current word, but also previous and next word.

In [13]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'is_junior_token': is_junior_token(word),
        'is_widow_token': is_widow_token(word),
        'contains_digit': contains_digit(word),
        'is_delimiter': is_delimiter(word),
        'is_start': is_start(word),
        'is_end': is_end(word),
        'is_lower': word.islower(),
        'is_title': word.istitle(),
        'is_upper': word.isupper(),
        'substr[-2:]': word[-2:],
        'substr[-1:]': word[-1:]
    }

    if i == 0:
      features.update({'BOS':True})
    elif i == len(sent) - 1:
      features.update({'EOS':True})
    else:
      word_prev = sent[i-1][0]
      word_next = sent[i+1][0]
      features.update({'prev_is_lower': word_prev.islower(), 
                       'prev_is_title': word_prev.istitle(),
                       'prev_is_upper': word_prev.isupper(),
                       'prev_is_delimiter': is_delimiter(word_prev),
                       'next_is_lower': word_next.islower(),
                       'next_is_title': word_next.istitle(),
                       'next_is_upper': word_next.isupper(),
                       'next_contains_digit': contains_digit(word_next),
                       'next_is_end': is_end(word_next)})
    
    return features


In [14]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for word, label in sent]

In [15]:
#extract features from data
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_validation = [sent2features(s) for s in validation_sents]
y_validation = [sent2labels(s) for s in validation_sents]

#Training

In [16]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=500,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=500,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

#Evaluation

Precision, recall and f1-score are used as evaluation metrics.

A detailed explanation on metrics: https://medium.com/analytics-vidhya/pos-tagging-using-conditional-random-fields-92077e5eaa31

In [17]:
labels = list(crf.classes_)
y_pred = crf.predict(X_validation)
metrics.flat_f1_score(y_validation, y_pred,
                      average='weighted', labels=labels)

0.9906933631825323

Notice that evaluation metrics are applied to each category (AC, NC, PA etc.)

In [18]:
#metrics by label
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_validation, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           D      1.000     0.963     0.981        54
          PA      1.000     1.000     1.000        21
          AC      0.989     1.000     0.995        94
          NC      0.975     1.000     0.988        79
          OC      1.000     0.955     0.977        22
         END      1.000     1.000     1.000        27
       START      1.000     1.000     1.000        27

    accuracy                          0.991       324
   macro avg      0.995     0.988     0.991       324
weighted avg      0.991     0.991     0.991       324



#Apply model to the whole dataset

In [19]:
#import glob
#file_path = "/Users/prajwal/Desktop/Columbia/C4SR/hnyc_cd_processing-master/input/1880/nypl_1880_clean_records.txt"
#with open(file_path) as f:
  #data = f.readlines()  

In [20]:
#import glob
#file_path = "/Users/prajwal/Desktop/Columbia/C4SR/doubt/doubt5 filter11.txt"
#with open(file_path) as f:
  #data = f.readlines()  

In [21]:
import pandas as pd
#read_df = pd.read_excel('/Users/prajwal/Desktop/Columbia/C4SR/doubt/doubt6-filter11.xlsx')
#read_df = pd.read_excel('/Users/prajwal/Desktop/Columbia/C4SR/bk_1850/doubt-bk1850-v7.xlsx')
#read_df = pd.read_excel('bk-1850-2.xlsx')
read_df = pd.read_excel('bk_1850_address2_afterDoubt.xlsx')
read_dict = {}
for i in range(len(read_df)):
    read_dict[i] = read_df['modified_clean_address'][i]
read_list = []
for i in range(0,len(read_dict)):
    print(i)
    try:
        read_list.append(str(list(read_dict.keys())[i]) + ' ' + list(read_dict.values())[i])
    except:
        read_list.append('')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831

In [22]:
data = read_list

In [23]:
len(data)

15676

Format record to apply model.

In [24]:
formatted = []

for record in data:
    ls = record.split()
    #if ls:
    #if ls[0][1:] == "***":
    if ls[1:] == "***":
      continue
    else:
      record_split = []
      for word in ls[1:]:
        if word[-1] == "." or word[-1] == ",":
          record_split.append(word[:-1])
          record_split.append(word[-1])
        else:
          record_split.append(word)
    #formatted.append([ls[0], record_split])
    formatted.append([ls, record_split])


In [25]:
formatted = []
num1=-1
for record in data:
    num1+=1
    ls = record.split()
    if ls:
        if ls[0][1:] == "***":
          continue
        else:
          record_split = []
          for word in ls[1:]:
            if word[-1] == "." or word[-1] == ",":
              record_split.append(word[:-1])
              record_split.append(word[-1])
            else:
              record_split.append(word)
        formatted.append([ls[0], record_split])
    else:
        #print([num1,['']])
        formatted.append([num1,''])

In [26]:
len(formatted)

15676

In [27]:
def predict_label(s):
  sents = [[(word,0) for word in s]]
  sents[0].insert(0, ('START',0))
  sents[0].append(('END', 0))
  X = [sent2features(s) for s in sents]
  y = crf.predict(X)
  return y

In [28]:
n = len(formatted)
for i in range(n):
  labels = predict_label(formatted[i][1])
  formatted[i].append(labels[0])

In [29]:
formatted

[[0, '', ['START', 'END']],
 [1, '', ['START', 'END']],
 [2, '', ['START', 'END']],
 [3, '', ['START', 'END']],
 ['4', ['merchant'], ['START', 'AC', 'END']],
 [5, '', ['START', 'END']],
 ['6', ['Court'], ['START', 'AC', 'END']],
 ['7', ['141', 'Front'], ['START', 'AC', 'AC', 'END']],
 [8, '', ['START', 'END']],
 [9, '', ['START', 'END']],
 ['10', ['2954', 'Pearl'], ['START', 'AC', 'AC', 'END']],
 [11, '', ['START', 'END']],
 ['12', ['Myrtle', 'c', 'Jay'], ['START', 'AC', 'AC', 'AC', 'END']],
 [13, '', ['START', 'END']],
 [14, '', ['START', 'END']],
 [15, '', ['START', 'END']],
 [16, '', ['START', 'END']],
 [17, '', ['START', 'END']],
 [18, '', ['START', 'END']],
 ['19', ['Y'], ['START', 'NC', 'END']],
 [20, '', ['START', 'END']],
 [21, '', ['START', 'END']],
 [22, '', ['START', 'END']],
 [23, '', ['START', 'END']],
 ['24', ['73', 'William'], ['START', 'AC', 'AC', 'END']],
 ['25', ['15', 'Cortlandt'], ['START', 'AC', 'AC', 'END']],
 [26, '', ['START', 'END']],
 ['27', ['23', 'Myrtle', '

In [30]:
#pickle the list to save preliminary result
#with open("/Users/prajwal/Desktop/Columbia/C4SR/doubt/doubt6-filter11.pkl", "wb") as f:


#with open('/Users/prajwal/Desktop/Columbia/C4SR/mn_1880/not so confident-mn1880-v7.pkl', "wb") as f:
  #pickle.dump(formatted,f)
    
    
with open('bk-1850-3-address2.pkl', "wb") as f:
  pickle.dump(formatted,f)

In [31]:
#pickle the list to save preliminary result
#with open("/Users/prajwal/Desktop/Columbia/C4SR/hnyc_cd_processing-master/input/1880/prediction.pkl", "wb") as f:
  #pickle.dump(formatted,f)