## Using spaCy's built-in multiprocessing

In [5]:
import spacy  # Spacy Module
import os  # To create the folder structure
import pandas as pd  # To read the csv file
import timeit  # To calculate the time taken to run the code
from termcolor import colored  # Color the output

ModuleNotFoundError: No module named 'termcolor'

In [None]:

def spacy_runner(model: str = 'medium') -> None:
    '''
        Description: Run spaCy named-entity recognition over the 'text' column of
                     ./data.csv (read in chunks of 32 rows) and store every entity
                     labelled PERSON, ORG or PER.
        Input: model - 'small' or 'large'; any other value selects the medium model
        Output: Inside the file ./data/spacy_<model>.csv, one CSV row per entity in
                the following format: entity text, entity label
        Model Used: Spacy Small/Medium/Large
        Default: Spacy Medium
    '''
    # Local import keeps this block self-contained; csv handles quoting of
    # entity text that itself contains commas, quotes or line breaks.
    import csv

    wanted_labels = {'PERSON', 'ORG', 'PER'}

    # Determine and Load the spacy model
    model_names = {'small': 'en_core_web_sm', 'large': 'en_core_web_lg'}
    nlp = spacy.load(model_names.get(model, 'en_core_web_md'))

    # Make sure the target folder exists before opening the output file
    filename = f'./data/spacy_{model}.csv'
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Write the data to a file
    # newline='' lets the csv writer control line endings itself
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for df in pd.read_csv('./data.csv', chunksize=32):
            # str() keeps the original behaviour for non-string cells
            # (e.g. a missing value is processed as the text 'nan')
            texts = [str(text) for text in df['text'].tolist()]
            # nlp.pipe processes the chunk as a batch and yields docs in order
            for doc in nlp.pipe(texts):
                writer.writerows(
                    (ent.text, ent.label_)
                    for ent in doc.ents
                    if ent.label_ in wanted_labels
                )


In [None]:
def process_model(f) -> None:
    '''
        Description: This function runs one NLP runner on the dataset, times it and
                     appends a summary row to ./results.csv.
        Input: f - dict with the keys
                   'function':   the runner to call (named <package>_runner)
                   'parameters': keyword arguments for the runner; must contain 'model'
        Output: None. On success the row "package, model, seconds, number_of_tags"
                is appended to ./results.csv. A runner that raises is reported and
                no row is written for it.
    '''
    runner = f['function']
    parameters = f['parameters']
    model = parameters['model']

    # Derive the package name by dropping the '_runner' suffix (if present)
    suffix = '_runner'
    package = runner.__name__
    if package.endswith(suffix):
        package = package[:-len(suffix)]

    # Run the function and calculate the time
    print('Started {f}'.format(f=f))
    start_time = timeit.default_timer()
    try:
        runner(**parameters)
    except Exception as e:
        # A failed run leaves a partial or stale output file; recording it
        # would put a misleading timing and tag count into the results.
        elapsed = timeit.default_timer() - start_time
        print(colored(f'[!] {runner} failed after {elapsed} seconds: {e}', 'red'))
        return
    elapsed = timeit.default_timer() - start_time
    print(colored(f'[*] {runner} completed in {elapsed} seconds', 'green'))

    # Calculate the number of tags (one extracted tag per line of the output file)
    filename = f'./data/{package}_{model}.csv'
    with open(filename, 'r') as file:
        num_tags = sum(1 for _ in file)

    # Store the results to the target file: (package, model, time, number_of_tags)
    # Opened only now so the file is not held open while the model runs
    with open('./results.csv', 'a') as result_file:
        print(f'{package}, {model}, {elapsed}, {num_tags}', file=result_file)


In [None]:
def time_counter(functions: list) -> None:
    '''
        Description: Runs and times every entry of the given list, one after another,
                     by handing each entry to process_model.
        Input: List of entries accepted by process_model
        Output: None
    '''
    # Entries are processed sequentially, in list order
    for spec in functions:
        process_model(spec)

In [None]:
def main():
    '''
        Description: Driver function. Builds the list of runs to benchmark
                     (currently only spaCy with the small model) and passes it
                     to time_counter.
    '''
    small_spacy_run = {'function': spacy_runner, 'parameters': {'model': 'small'}}
    time_counter([small_spacy_run])


In [2]:
# Enforce the main function
if __name__ == '__main__':
    main()
    # Exit with status 0. SystemExit is raised directly because the `exit`
    # helper is added by the `site` module for interactive use and is not
    # available in every interpreter set-up (e.g. `python -S`).
    raise SystemExit(0)

NameError: name 'main' is not defined