In [4]:
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/ import string
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")
import pandas as pd

Reference: https://www.kaggle.com/rohitsingh9990/ner-training-using-spacy-ensemble

In [5]:
BASE_PATH = '../../data/'

train_df = pd.read_csv(BASE_PATH + 'train.csv')
test_df = pd.read_csv( BASE_PATH + 'test.csv')
submission_df = pd.read_csv( BASE_PATH + 'sample_submission.csv')

train_df.dropna(inplace=True)

In [6]:
def save_model(output_dir, nlp, new_model_name):
    output_dir = f'../../models/{output_dir}'
    if output_dir is not None:        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

In [12]:
def train(train_data, output_dir, n_iter=20, model=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    ""
    if model is not None:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    
    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))    
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                print(f"Texts: {texts}")
                print(f"Annotations: {annotations}")
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,   # dropout - make it harder to memorise data
                    losses=losses, 
                )
            
            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')

In [8]:
def get_model_out_path(sentiment):
    model_out_path = None
    if sentiment == 'positive':
        model_out_path = 'model_pos'
    elif sentiment == 'negative':
        model_out_path = 'model_neg'
    else:
        model_out_path = 'model_neu'
    return model_out_path

In [9]:
def get_training_data(sentiment):
    train_data = []
    for index, row in train_df.iterrows():
        if row.sentiment == sentiment:
            selected_text = row.selected_text
            text = row.text
            start = text.find(selected_text)
            end = start + len(selected_text)
            train_data.append((text, {"entities": [[start, end, selected_text]]}))
    return train_data

---------------

In [11]:
train_data = get_training_data("positive")
model_path = get_model_out_path("positive")

train_data

[('2am feedings for the baby are fun when he is all smiles and coos',
  {'entities': [[30, 33, 'fun']]}),
 (' Journey!? Wow... u just became cooler.  hehe... (is that possible!?)',
  {'entities': [[11, 39, 'Wow... u just became cooler.']]}),
 ('I really really like the song Love Story by Taylor Swift',
  {'entities': [[16, 20, 'like']]}),
 ('Playing Ghost Online is really interesting. The new updates are Kirin pet and Metamorph for third job.  Can`t wait to have a dragon pet',
  {'entities': [[31, 43, 'interesting.']]}),
 ('the free fillin` app on my ipod is fun, im addicted',
  {'entities': [[0,
     51,
     'the free fillin` app on my ipod is fun, im addicted']]}),
 ('juss came backk from Berkeleyy ; omg its madd fun out there  havent been out there in a minute . whassqoodd ?',
  {'entities': [[46, 49, 'fun']]}),
 ('I`m going home now. Have you seen my new twitter design? Quite....heavenly isn`****?',
  {'entities': [[57, 74, 'Quite....heavenly']]}),
 ('i hope unni will make the aud

In [13]:
train(train_data, model_path, n_iter=2, model=None)

Created blank 'en' model


  0%|                                                                                                                                                                            | 0/2 [00:00<?, ?it/s]

Texts: ('Soon my new job starts  I`m so happy!', 'i`m going to try to get some sleeeeeeep. goonight twitter  have a nice mother`s day.', 'Zzzz... I`m taking my mom out for breakfast tomorrow!  Shall be quite a treat.', ' at first i thought bar life meant you were partying nonstop to catch up for the last 2 years...lol shoulda known better')
Annotations: ({'entities': [[24, 37, 'I`m so happy!']]}, {'entities': [[59, 84, 'have a nice mother`s day.']]}, {'entities': [[55, 78, 'Shall be quite a treat.']]}, {'entities': [[114, 120, 'better']]})
Texts: (' Well it`s nice to `see` you. Have a great day and tweet again soon', '_MEXICO Hey hey. No problem.', ' Wish I could be there to enjoy a girls day!!!', 'listening to  after last nights Up from the Underground screening, awesome job ')
Annotations: ({'entities': [[1, 67, 'Well it`s nice to `see` you. Have a great day and tweet again soon']]}, {'entities': [[17, 28, 'No problem.']]}, {'entities': [[26, 31, 'enjoy']]}, {'entities': [[67, 74, 'a

Texts: (' cant say i havee, im at james` but i did some rs yesterdayy!. im learning my quotess', 'Just watched the movie The Holiday. I had forgotten what a feel good movie it was!  Lovely evening.', ' thanks for the add  I`ll try again signing in later when I get home. If still no such luck then I`ll email someone', '_13 Awe, THX so much! Neither of need 2b sick on Friday!!  Do u have any prayer requests?')
Annotations: ({'entities': [[61, 85, '. im learning my quotess']]}, {'entities': [[59, 99, 'feel good movie it was!  Lovely evening.']]}, {'entities': [[1, 7, 'thanks']]}, {'entities': [[4, 21, 'Awe, THX so much!']]})
Texts: (' Aural goodness', 'Sitting in mels house, just finished eating mcdicks, laughing at all the dumb stuff I said tonight!! Ready for bed? I think so', '_Reviews THANKS!', 'Hi this is http://gayorbispace.com We hope everybody is having a safe & fun weekend')
Annotations: ({'entities': [[7, 15, 'goodness']]}, {'entities': [[53, 85, 'laughing at all the dumb stuff 

  0%|                                                                                                                                                                            | 0/2 [00:12<?, ?it/s]


KeyboardInterrupt: 