# Imports

In [1]:
import pandas as pd
import spacy 

from collections import Counter
from tqdm import tqdm

from labeling_functions import RuleAnnotator, DictionaryAnnotator, NameDisambiguationAnnotator, FrequencyDetector
from labeling_functions import AllCapsDetector, NameCaseStructureDetector, combine_lfs

from neat_preprocess import preprocess
from process_doc import create_doc, get_docs, store_doc_list

from skweak.generative import HMM

: 

: 

In [None]:
# Paths to the resources used by this notebook.
# They are all relative to the working directory and live under "src/".

data_path = "src/HTName.csv"

namelist_path = "src/nameslist.csv"
dictionary_path = "src/weights.json"
# Kept relative like its siblings: a leading "/" would resolve against the
# filesystem root instead of the project's "src/" folder.
expanded_dictionary_path = "src/generatedFemaleNamesPlusOriginalDict30000 (1).json"

model_path = "ht_bert_v3"

# 1. Process data into spaCy docs

In [None]:
# Sample input: a single raw string, run through preprocess() before parsing.
text = ["HI MIA HERE  FIRST TIME IN THIS CITY,WOULD LIKE TO MEET NICE GUYS...  COME MEET ME TO HAVE UNFORGETTHABLE TIME TOGETHER...NEVER RUSH  OPEN - MINDED MENU  CALL TEXT... 6472055427 EGLINTON AVE E SCARBOROUGH"]
text = [preprocess(raw) for raw in text]

# spaCy pipeline used to build the docs
nlp = spacy.load("en_core_web_sm")

# turn the preprocessed strings into docs via the project helper
data_docs = create_doc(text, nlp)

In [None]:
# Show the list of strings returned by preprocess() for a quick visual check.
print(text)

# Create LFs

In [None]:
# Build the list of labeling functions (LFs) selected by the switches below,
# then merge them into a single annotator.
lfs = []

# NEAT LFs
# Switch values: 0 - don't use, 1 - use, 2 - threshold variants (if applicable)
rule = 1
dictionary = 0
exp_dictionary = 1
disambiguation = 0
cap_disambiguation = 0  # NOTE(review): not consulted anywhere in this cell
frequency = 0
all_caps = 0
name_structure = 0

if rule == 1:
    # one RuleAnnotator per rule index 0..26
    lfs.extend(RuleAnnotator(i) for i in range(27))

if dictionary == 1:
    lfs.append(DictionaryAnnotator(dictionary_path, "full_dictionary"))
elif dictionary == 2:
    for quartile in ("q1", "q2", "q3", "q4"):
        lfs.append(DictionaryAnnotator(dictionary_path, quartile + "_thr_dictionary"))

# The expanded-dictionary LFs read the expanded dictionary file; pointing them
# at dictionary_path would just duplicate the plain dictionary LFs under a
# different name.
if exp_dictionary == 1:
    lfs.append(DictionaryAnnotator(expanded_dictionary_path, "full_expanded_dictionary"))
elif exp_dictionary == 2:
    for quartile in ("q1", "q2", "q3", "q4"):
        lfs.append(
            DictionaryAnnotator(expanded_dictionary_path, quartile + "_thr_expanded_dictionary")
        )

# The weights dictionary is taken from dictionary_path, i.e. the same file the
# "full_dictionary" LF above loads.
# NOTE(review): confirm this is the weights file NameDisambiguationAnnotator expects.
if disambiguation == 1:
    lfs.append(
        NameDisambiguationAnnotator(
            thr=0.1, add_bound=0.05, upper_bound=False, weights_dict_path=dictionary_path
        )
    )
elif disambiguation == 2:
    for bound in (0.1, 0.2, 0.3, 0.4):
        lfs.append(
            NameDisambiguationAnnotator(
                thr=bound, add_bound=0.05, upper_bound=False, weights_dict_path=dictionary_path
            )
        )

# Extra Rule LFs

# Both frequency modes (1 = single threshold, 2 = several thresholds) need the
# corpus word frequencies, so the guard must admit frequency == 1 as well.
if frequency in (1, 2):
    # count every token that is neither a stop word nor punctuation, over the
    # docs the HMM will later be fitted on
    word_freq = Counter(
        token.text
        for doc in tqdm(data_docs)
        for token in doc
        if not token.is_stop and not token.is_punct
    )

    # all distinct tokens, most frequent first
    sorted_word_freq = [word for word, _ in word_freq.most_common()]

    if frequency == 1:
        lfs.append(FrequencyDetector(sorted_word_freq, 0.01))
    else:
        for cutoff in (0.01, 0.02, 0.03, 0.04, 0.05):
            lfs.append(FrequencyDetector(sorted_word_freq, cutoff))

# capital detectors
if all_caps == 1:
    lfs.append(AllCapsDetector())

if name_structure == 1:
    lfs.append(NameCaseStructureDetector())

# combine annotators
combined_annotator = combine_lfs(lfs)

# Run LFs

In [None]:
# Apply the combined labeling functions to every doc and collect the results;
# the aggregation step fits on annotated_docs, so it must not stay empty.
annotated_docs = []
for doc in tqdm(data_docs):
    # Call the annotator built by combine_lfs(lfs), not the combine_lfs
    # factory itself (which expects a list of LFs, not a doc).
    # NOTE(review): assumes the annotator returns the annotated doc, the same
    # call-and-append pattern used for the HMM model in the aggregation cell.
    annotated_docs.append(combined_annotator(doc))

# Aggregate

In [None]:
# Fit an HMM over the LF outputs, using the two labels listed here.
unified_model = HMM("hmm", labels=["PERSON_NAME", "NOT_NAME"])
unified_model.fit(annotated_docs)

# Run the fitted model on each annotated doc.
unified_docs = [unified_model(annotated) for annotated in tqdm(annotated_docs)]

# Promote the spans stored under the "hmm" key to the doc's entities.
for doc in unified_docs:
    doc.ents = doc.spans["hmm"]

# Spacy docs -> list of strings

In [None]:
# One result string per doc: every PERSON_NAME entity text followed by "|",
# concatenated in order; docs without such an entity yield 'N'.
hmm_preds = [
    "".join(ent.text + "|" for ent in doc.ents if ent.label_ == "PERSON_NAME") or 'N'
    for doc in unified_docs
]