In [20]:
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/ import string
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")

Reference: https://www.kaggle.com/rohitsingh9990/ner-training-using-spacy-ensemble

In [6]:
BASE_PATH = '../../data/'

train_df = pd.read_csv(BASE_PATH + 'train.csv')
test_df = pd.read_csv( BASE_PATH + 'test.csv')
submission_df = pd.read_csv( BASE_PATH + 'sample_submission.csv')

train_df.dropna(inplace=True)

In [8]:
def save_model(output_dir, nlp, new_model_name):
    output_dir = f'../../models/{output_dir}'
    if output_dir is not None:        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [9]:
def train(train_data, output_dir, n_iter=20, model=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    ""
    if model is not None:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    
    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))    
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,   # dropout - make it harder to memorise data
                    losses=losses, 
                )
            
            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')

In [11]:
def get_model_out_path(sentiment):
    model_out_path = None
    if sentiment == 'positive':
        model_out_path = 'model_pos'
    elif sentiment == 'negative':
        model_out_path = 'model_neg'
    else:
        model_out_path = 'model_neu'
    return model_out_path

In [22]:
def get_training_data(sentiment):
    train_data = []
    for index, row in train_df.iterrows():
        if row.sentiment == sentiment:
            selected_text = row.selected_text
            text = row.text
            start = text.find(selected_text)
            end = start + len(selected_text)
            train_data.append((text, {"entities": [[start, end, selected_text]]}))
    return train_data

---------------

In [23]:
train_data = get_training_data("positive")
model_path = get_model_out_path("positive")

train(train_data, model_path, n_iter=2, model=None)

Created blank 'en' model


  0%|                                                                                                                                                                            | 0/2 [05:18<?, ?it/s]


KeyboardInterrupt: 

In [24]:
train_data

[('is hoping that she left her bb at home and didn`t lose it on the Metro',
  {'entities': [[3, 9, 'hoping']]}),
 ('Is happy...Skool just let out..! Today was my last day..! Yippee! R.I.P Jackson Tyler Morris..! Always Loved And Never Forgotten..!',
  {'entities': [[103, 108, 'Loved']]}),
 (' You`re cycling tho` that`s good. Healthy eating  Healthy and eating are a contradiction in terms.',
  {'entities': [[1,
     98,
     'You`re cycling tho` that`s good. Healthy eating  Healthy and eating are a contradiction in terms.']]}),
 ('_Y_Yankees changed my default pic since you`ve been showing so much love!',
  {'entities': [[40, 73, 'you`ve been showing so much love!']]}),
 (' wish i could  want to stay and play with u guys and enjoy a block party for once',
  {'entities': [[51, 58, 'd enjoy']]}),
 (' so glad, I hope yall make it back down near New Orleans',
  {'entities': [[12, 16, 'hope']]}),
 ('_aureole Oh my gosh, so cute!!!', {'entities': [[24, 30, 'cute!!']]}),
 (' oooooooh I KNO  EE