# In [7]:
import pandas as pd
from __future__ import unicode_literals,print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

class TrainFoodNER:
    """Train a custom spaCy NER model from an annotated review spreadsheet.

    The annotation table must provide the columns 'Review', 'Start', 'End'
    and 'Tag'. Each row describes one entity span inside a review; several
    consecutive rows sharing the same 'Review' text describe several
    entities of that one review.

    NOTE(review): the training code uses nlp.create_pipe / nlp.begin_training /
    nlp.update(texts, annotations), i.e. the spaCy 2.x style API - confirm the
    installed spaCy version before running.
    """

    def __init__(self, df_path):
        """Remember where the annotation data comes from.

        Parameters:
           df_path -> Path of the Excel file containing the annotations. An
                      already loaded pandas DataFrame is accepted as well.
        """
        self.df_path = df_path

    def generate_train_from_df(self):
        """Build the training data in the format accepted by spaCy.

        Returns:
           A list of (review_text, {'entities': [(start, end, TAG), ...]})
           tuples, one per run of consecutive rows sharing the same review
           text. Rows whose Start, End or Tag is missing contribute no
           entity, so a review without annotations yields an empty
           'entities' list. Tags are upper-cased.
        """
        if isinstance(self.df_path, pd.DataFrame):
            df = self.df_path
        else:
            df = pd.read_excel(self.df_path)

        train = []
        entities = None  # entity list of the group currently being filled
        previous_review = None
        rows = zip(df['Review'], df['Start'], df['End'], df['Tag'])
        for review, start, end, tag in rows:
            # A new group starts whenever the review text changes. NaN never
            # compares equal to itself, so rows with a missing review are
            # never merged with each other.
            if entities is None or review != previous_review:
                entities = []
                train.append((review, {'entities': entities}))
                previous_review = review
            if pd.notnull(start) and pd.notnull(end) and pd.notnull(tag):
                entities.append((int(start), int(end), str(tag).upper()))
        return train

    def train_ner(self, output_dr, n_iter=100):
        """Train the NER model and save it to the given directory.

        Parameters:
           output_dr -> Path where you want the model to be saved, must be
                        string (Ex. '/users/xyz/desktop'). Missing parent
                        directories are created.
           n_iter    -> Number of passes over the training data (default 100).

        Raises:
           ValueError if no training example could be generated.
        """
        TRAIN_DATA = self.generate_train_from_df()
        if not TRAIN_DATA:
            raise ValueError("No training examples were generated from the input data")

        # Hard-coded: always start from a blank English pipeline. The load
        # branch is kept for the case where a base model name is set here.
        model = None
        if model is not None:
            nlp = spacy.load(model)  # load existing spaCy model
            print("Loaded model '%s'" % model)
        else:
            nlp = spacy.blank('en')  # create blank Language class
            print("Created blank 'en' model")

        if 'ner' not in nlp.pipe_names:
            ner = nlp.create_pipe('ner')
            nlp.add_pipe(ner, last=True)
        # otherwise, get it so we can add labels
        else:
            ner = nlp.get_pipe('ner')

        # add labels
        for _, annotations in TRAIN_DATA:
            for ent in annotations.get('entities'):
                ner.add_label(ent[2])

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
        with nlp.disable_pipes(*other_pipes):  # only train NER
            optimizer = nlp.begin_training()
            for itn in range(n_iter):
                random.shuffle(TRAIN_DATA)
                losses = {}
                print(itn)
                for text, annotations in tqdm(TRAIN_DATA):
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.35,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                print(losses)

        output_dir = Path(output_dr)
        # parents/exist_ok: nested or already existing target directories
        # must not abort the save after a full training run.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
            
def generate_food_model(df_path,output_dir):
    """Generate the FOOD NER model; this is the only function that needs to be called.

    Parameters:
       df_path : Path of excel file containing training data
       output_dir : Directory where you want the model to be saved
                    (must be string, example : '/Users/XYZ/desktop')
    """
    # Build the trainer and run training in one chained call.
    TrainFoodNER(df_path=df_path).train_ner(output_dir)