In [1]:
!python -m spacy validate
import spacy
import srsly
import os
from spacy import displacy
import json
from spacy.pipeline import EntityRuler
from spacy.language import Language
from random import shuffle

⠙ Loading compatibility table...[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.7/dist-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.2.5[0m   [38;5;2m✔[0m
link      en               en_core_web_sm   [38;5;2m2.2.5[0m   [38;5;2m✔[0m



In [None]:
# Directory whose entries create_jsonl() lists and reads as JSON, one file at a time.
DUMP_PATH = r'/content/drive/MyDrive/4th year project/tyn_dump - Copy'
# JSONL file written by create_jsonl() and read back when building training data.
JSONL_PATH = r'/content/drive/MyDrive/4th year project/training_data.jsonl'
# Location of train.json; the writer functions in this notebook target this same path.
OUTPUT_PATH = r'/content/drive/MyDrive/4th year project/train.json'

def create_jsonl():
  """Collect every JSON file in DUMP_PATH into a single JSONL file at JSONL_PATH.

  Each entry of DUMP_PATH is read as JSON and reduced to its 'sentences' and
  'tyn' fields; one such record is written per line of the output file.

  Raises:
    KeyError: if a file lacks the 'sentences' or 'tyn' key.
  """
  # os.listdir returns names in arbitrary order; sort so that the record order
  # in the generated JSONL file is the same on every run.
  file_names = sorted(os.listdir(DUMP_PATH))
  records = []
  for file_name in file_names:
    article = srsly.read_json(os.path.join(DUMP_PATH, file_name))
    records.append({'sentences': article['sentences'], 'tyn': article['tyn']})
  srsly.write_jsonl(JSONL_PATH, records)

In [None]:
# Round-trip check: parse the first sentence of the first JSONL record and
# serialise the resulting Doc to disk.
# NOTE(review): relies on `nlp` already existing in the kernel; in the visible
# cell order it is only created further down.
first_record = next(srsly.read_jsonl(JSONL_PATH))
first_sentence = first_record['sentences'][0]
doc = nlp(first_sentence)

doc.to_disk('/content/drive/MyDrive/4th year project/to_disc_test/doc1.txt')

In [None]:
# Load one article dump (index 10 of the directory listing) and keep its 'tyn' list.
articles = os.listdir(DUMP_PATH)
article_path = os.path.join(DUMP_PATH, articles[10])
with open(article_path) as f:
  obj = json.load(f)
TYN = obj['tyn']

In [None]:
# Show the components currently registered on the pipeline.
nlp.pipeline

[]

In [None]:
# One phrase pattern per entry of TYN, each labelled "TYN".
patterns = [
  {"label": "TYN", "pattern": phrase}
  for phrase in TYN
]
# Token-level alternative, currently unused:
#[{"LOWER": token} for token in nlp(thing)]

In [None]:
# Build the ruler from the TYN patterns, passing 'LOWER' as the phrase matcher attribute.
ruler = EntityRuler(nlp, phrase_matcher_attr='LOWER', patterns=patterns)

In [None]:
# Remove the 'ner' component; the call returns the removed (name, component)
# pair, which is what the cell output shows.
nlp.remove_pipe('ner')


('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fe84c910bb0>)

In [None]:
# Remove the 'loc_ruler' component.
# NOTE(review): the cell that adds 'loc_ruler' appears below this one, so this
# only succeeds if that cell has already been executed in the current kernel.
nlp.remove_pipe("loc_ruler")

('loc_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x7fe849238110>)

In [None]:
# Register a small rule-based matcher for two city names under the name "loc_ruler".
loc_patterns = [{"label": "LOC", "pattern": city} for city in ("London", "Berlin")]
entity_ruler = EntityRuler(nlp)
entity_ruler.add_patterns(loc_patterns)
nlp.add_pipe(entity_ruler, name="loc_ruler")

# Run a sample sentence through the pipeline and show the entities found.
doc = nlp('I like London and Berlin.')
print(doc.ents)

(London, Berlin)


In [None]:
# Print the character span of each entity, tagged with the fixed label 'TYN'.
for ent in doc.ents:
  span_start = doc[ent.start].idx
  span_end = span_start + len(ent.text)
  print(span_start, span_end, 'TYN')

7 13
18 24


In [None]:
# Display the contents of the generated train.json file.
with open('/content/drive/MyDrive/4th year project/train.json') as f:
  train_examples = json.load(f)
print(train_examples)

[['Who is Shaka Khan?', {'entities': [[7, 17, 'PERSON']]}], ['I like London and Berlin.', {'entities': [[7, 13, 'LOC'], [18, 24, 'LOC']]}]]


In [None]:
def create_training_data():
  """Return the training examples for every record in the JSONL_PATH file.

  Each record is converted with get_training_data_from_obj() and the
  per-record lists are concatenated, in file order, into one flat list.
  """
  per_record = (
    get_training_data_from_obj(record)
    for record in srsly.read_jsonl(JSONL_PATH)
  )
  return flatten(per_record)
    
def flatten(l):
  """Concatenate the items of every sub-iterable in ``l`` into one new list."""
  merged = []
  for sub in l:
    merged.extend(sub)
  return merged

def get_training_data_from_obj(obj):
  """Build NER training examples for one article record.

  A temporary EntityRuler built from ``obj['tyn']`` is added to the
  module-level ``nlp`` pipeline under the name 'tyn_ruler', every sentence in
  ``obj['sentences']`` is run through the pipeline, and the ruler is removed
  again before returning.

  Args:
    obj: mapping with a 'tyn' entry (phrases to match) and a 'sentences'
      entry (iterable of sentence strings).

  Returns:
    A list of ``[sentence, {"entities": [[start_char, end_char, 'TYN'], ...]}]``
    items, one for each sentence in which at least one entity was found;
    sentences without entities are skipped.
  """
  nlp.add_pipe(get_ruler(obj['tyn']), name='tyn_ruler')
  training_data = []
  try:
    for sentence in obj['sentences']:
      doc = nlp(sentence)
      # Character offsets of each matched span within the sentence.
      entities = [[ent.start_char, ent.end_char, 'TYN'] for ent in doc.ents]
      if entities:
        training_data.append([sentence, {"entities": entities}])
  finally:
    # Always detach the temporary ruler: if a sentence fails to process, a
    # leftover 'tyn_ruler' would make the next add_pipe call fail on the
    # duplicate component name.
    nlp.remove_pipe('tyn_ruler')
  return training_data

def get_ruler(tyn):
  """Create an EntityRuler holding one 'TYN'-labelled phrase pattern per item of ``tyn``.

  The ruler is bound to the module-level ``nlp`` and constructed with
  phrase_matcher_attr 'LOWER' and validation switched on.
  """
  tyn_patterns = []
  for phrase in tyn:
    tyn_patterns.append({"label": "TYN", "pattern": phrase})
  return EntityRuler(nlp, phrase_matcher_attr='LOWER', validate=True, patterns=tyn_patterns)

def write_training_data_to_json():
  """Generate all training examples and write them to train.json as one JSON document."""
  srsly.write_json(
    '/content/drive/MyDrive/4th year project/train.json',
    create_training_data(),
  )

def write_training_data_to_json_splits(seed=None):
  """Generate the training examples, shuffle them and write a 90/10 train/dev split.

  The first 90% (rounded down) of the shuffled examples go to train.json and
  the remainder to dev.json.

  Args:
    seed: optional seed for the shuffle. With the default ``None`` the
      module-level ``shuffle`` is used, exactly as before; any other value
      shuffles with a dedicated ``random.Random(seed)`` so that the split is
      reproducible across runs.
  """
  data = create_training_data()
  split = int(0.9*len(data))
  if seed is None:
    shuffle(data)
  else:
    # Local import: only `shuffle` is imported from `random` at file level.
    from random import Random
    Random(seed).shuffle(data)
  srsly.write_json('/content/drive/MyDrive/4th year project/train.json', data[:split])
  srsly.write_json('/content/drive/MyDrive/4th year project/dev.json', data[split:])

# Blank English pipeline; the helper functions above use it as a module-level global.
nlp = spacy.blank('en')

In [None]:
# Generate the training examples and write them to train.json.
write_training_data_to_json()

In [None]:
# NOTE(review): `training_data` is never assigned at module level in any visible
# cell (only as a local inside functions), so this depends on leftover kernel state.
len(training_data)

5804

In [None]:
!pip install -U featuretools



In [None]:
# Generate the training examples and write the shuffled train/dev split files.
write_training_data_to_json_splits()

# Filtered categories data

In [2]:
import spacy
import srsly
import os
from spacy import displacy
import json
from spacy.pipeline import EntityRuler
from spacy.language import Language
from random import shuffle

# NOTE(review): DUMP_PATH is not read by any code visible in this section.
DUMP_PATH = r'/content/drive/MyDrive/4th year project/tyn_dump'
# JSONL input read by create_training_data().
JSONL_PATH = r'/content/drive/MyDrive/4th year project/filtered_training_data.jsonl'
# Output files written by write_training_data_to_json_splits().
TRAIN_OUTPUT_PATH = r'/content/drive/MyDrive/4th year project/train_filtered.json'
DEV_OUTPUT_PATH = r'/content/drive/MyDrive/4th year project/dev_filtered.json'
# NOTE(review): not referenced by any visible code; presumably the filtering that
# produced the JSONL input happened elsewhere. Confirm.
DESIRED_WIKI_CATEGORIES = ['Home and Garden', 'Hobbies and Crafts', 'Food and Entertaining']

def create_training_data():
  """Return one flat list with the training examples of every record in JSONL_PATH.

  Records are processed in file order via get_training_data_from_obj().
  """
  examples = []
  for record in srsly.read_jsonl(JSONL_PATH):
    examples.extend(get_training_data_from_obj(record))
  return examples

def get_training_data_from_obj(obj):
  """Build NER training examples for one article record.

  A temporary EntityRuler built from ``obj['tyn']`` is added to the
  module-level ``nlp`` pipeline under the name 'tyn_ruler', every sentence in
  ``obj['sentences']`` is run through the pipeline, and the ruler is removed
  again before returning.

  Args:
    obj: mapping with a 'tyn' entry (phrases to match) and a 'sentences'
      entry (iterable of sentence strings).

  Returns:
    A list of ``[sentence, {"entities": [[start_char, end_char, 'TYN'], ...]}]``
    items, one for each sentence in which at least one entity was found;
    sentences without entities are skipped.
  """
  nlp.add_pipe(get_ruler(obj['tyn']), name='tyn_ruler')
  training_data = []
  try:
    for sentence in obj['sentences']:
      doc = nlp(sentence)
      # Character offsets of each matched span within the sentence.
      entities = [[ent.start_char, ent.end_char, 'TYN'] for ent in doc.ents]
      if entities:
        training_data.append([sentence, {"entities": entities}])
  finally:
    # Always detach the temporary ruler: if a sentence fails to process, a
    # leftover 'tyn_ruler' would make the next add_pipe call fail on the
    # duplicate component name.
    nlp.remove_pipe('tyn_ruler')
  return training_data

def get_ruler(tyn):
  """Create an EntityRuler holding one 'TYN'-labelled phrase pattern per item of ``tyn``.

  The ruler is bound to the module-level ``nlp`` and constructed with
  phrase_matcher_attr 'LOWER' and validation switched on.
  """
  tyn_patterns = [dict(label="TYN", pattern=phrase) for phrase in tyn]
  ruler = EntityRuler(
    nlp,
    phrase_matcher_attr='LOWER',
    validate=True,
    patterns=tyn_patterns,
  )
  return ruler

def write_training_data_to_json_splits(seed=None):
  """Generate the training examples, shuffle them and write a 90/10 train/dev split.

  The first 90% (rounded down) of the shuffled examples are written to
  TRAIN_OUTPUT_PATH and the remainder to DEV_OUTPUT_PATH.

  Args:
    seed: optional seed for the shuffle. With the default ``None`` the
      module-level ``shuffle`` is used, exactly as before; any other value
      shuffles with a dedicated ``random.Random(seed)`` so that the split is
      reproducible across runs.
  """
  data = create_training_data()
  split = int(0.9*len(data))
  if seed is None:
    shuffle(data)
  else:
    # Local import: only `shuffle` is imported from `random` at file level.
    from random import Random
    Random(seed).shuffle(data)
  srsly.write_json(TRAIN_OUTPUT_PATH, data[:split])
  srsly.write_json(DEV_OUTPUT_PATH, data[split:])

# Blank English pipeline; the helper functions above use it as a module-level global.
nlp = spacy.blank("en")
# Build the examples from the filtered JSONL file and write the train/dev split files.
write_training_data_to_json_splits()