## Data Augmentation

**Author:** Benjamin Aw, Shaun Khoo  
**Date:** 13 Oct 2021  
**Context:** Trying to address the data quality issue (lack of quality labelled data)  
**Objective:** Develop a function to create more synthetic examples using data augmentation for text data

### A) Setting up

In [1]:
import os
os.chdir('..')

In [2]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action



In [56]:
import math
import random
import time

import nltk
import numpy as np
import regex as re
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\benjamin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
# Download fasttext model, only run once
#from nlpaug.util.file.download import DownloadUtil
#DownloadUtil.download_fasttext(model_name = 'wiki-news-300d-1M', dest_dir = 'Models')

In [5]:
#import nltk
#nltk.download('averaged_perceptron_tagger')

In [6]:
model_dir = 'Models/'

In [7]:
import pandas as pd
SSOC_2020 = pd.read_csv('Data/Processed/Training/train-aws/SSOC_2020.csv')
data = pd.read_csv('Data/Processed/Training/train-aws/train_full.csv')
extra_info = pd.read_csv('Data/Processed/MCF_Training_Set_Full.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
# with open('ssoc_autocoder/sentaugment/data/sentences.txt', 'w') as f:
#     for item in SSOC_2020['Description'][295:296]:
#         f.write("%s\n" % ''.join([i if ord(i) < 128 else ' ' for i in item]))
#         f.write("%s\n" % ''.join([i if ord(i) < 128 else ' ' for i in item]))

### B) Testing different types of augmentation

In [None]:
text = SSOC_2020['Description'][SSOC_2020['SSOC 2020'] == 25121].values[0]

In [None]:
print(text)

#### 1. Using pretrained word embeddings (`fasttext`)

In [None]:
fasttext_aug = naw.WordEmbsAug(model_type = 'fasttext', 
                               model_path = model_dir + 'wiki-news-300d-1M.vec',
                               action = "substitute",
                               top_k = 5,
                               aug_p = 0.5,
                               aug_min = 10,
                               aug_max = None)

In [None]:
fasttext_augmented_text = fasttext_aug.augment(text, num_thread = 4)
print(fasttext_augmented_text)

#### 2. Using back translation
Back translation means translating the whole text to another language and back to English.

In [None]:
back_translation_aug = naw.BackTranslationAug(from_model_name='facebook/wmt19-en-de', 
                                              to_model_name='facebook/wmt19-de-en',
                                              device = 'cuda',
                                              max_length = 2000)

In [None]:
backtransl_augmented_text = back_translation_aug.augment(text, num_thread = 4)
print(backtransl_augmented_text)

#### 3. Using synonyms

In [None]:
synonym_aug = naw.SynonymAug(aug_src = 'ppdb', 
                             model_path = model_dir + 'ppdb-2.0-tldr',
                             aug_p = 0.5,
                             aug_min = 10,
                             aug_max = None)

In [None]:
synonym_augmented_text = synonym_aug.augment(text, num_thread = 4)
print(synonym_augmented_text)

#### 4. Using contextual word embeddings

In [None]:
distilbert_aug = naw.ContextualWordEmbsAug(model_path = 'distilbert-base-uncased', 
                                           action = "substitute",
                                           top_k = 10,
                                           aug_p = 0.7,
                                           aug_min = 5,
                                           aug_max = None,
                                           device = 'cpu')

In [None]:
distilbert_augmented_text = distilbert_aug.augment(text, num_thread = 4)
print(distilbert_augmented_text)

#### 5. Using sentence augmentation

In [None]:
sentence_aug = nas.ContextualWordEmbsForSentenceAug(model_path = 'distilgpt2',
                                                    min_length = 100,
                                                    max_length = 300,
                                                    top_k = 50,
                                                    top_p = .9,
                                                    device = 'cuda')

In [None]:
sentence_augmented_text = sentence_aug.augment(text, num_thread = 4)
print(sentence_augmented_text)

#### 6. Using summarisation

In [None]:
summ_aug = nas.AbstSummAug(model_path = 't5-base', 
                           min_length = 50,
                           max_length = 100,
                           top_k = 20)

In [None]:
summ_augmented_text = summ_aug.augment(text, num_thread = 4)

In [None]:
summ_augmented_text

#### 7. Adding spelling mistakes

In [None]:
spl_aug = naw.SpellingAug(dict_path=None, 
                          name='Spelling_aug',
                          aug_min=1, 
                          aug_max=10, 
                          aug_p=0.3)

In [None]:
spl_augmented_text = spl_aug.augment(text)

In [None]:
spl_augmented_text

In [None]:
nas.

### C) Using GloVE embeddings to find and label more examples

In [None]:
import spacy
from spacy.language import Language
nlp = spacy.load('en_core_web_lg', disable = ['tagger', 'parser', 'ner', 'lemmatizer'])
stopwords = nlp.Defaults.stop_words

Add in additional preprocessing to remove the stop words

In [None]:
@Language.component("additional_preprocessing")
def additional_preprocessing(doc):
    """Keep only the alphabetic, non-stop-word tokens of `doc`.

    Args:
        doc: The iterable of tokens handed over by the pipeline.

    Returns:
        A plain list of the tokens that are alphabetic and whose lower-cased
        text is not in the module-level `stopwords` set.
    """
    # NOTE(review): this returns a list of tokens, not the doc itself. The
    # cells that consume the pipeline output call len() on it and iterate over
    # it, so the list is kept; spaCy components conventionally return the Doc,
    # so confirm this still works if the pipeline is called outside nlp.pipe.
    kept_tokens = []
    for token in doc:
        if not token.is_alpha:
            continue
        if token.text.lower() in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens
nlp.add_pipe('additional_preprocessing', last = True)

Run the `nlp` processing pipeline over the two corpora and convert the job postings into vectors

In [None]:
SSOC_2020_nlp = list(nlp.pipe(SSOC_2020['Description']))
data_nlp = list(nlp.pipe(data['Cleaned_Description']))

In [None]:
# Build one vector per job posting by averaging the vectors of its tokens.
target_vecs = []
for position, tokens in enumerate(data_nlp):
    # Overwrite a single progress line every 100 postings.
    if position % 100 == 0:
        print(f'Job posting {position}/{len(data_nlp)}...\r', end = '')
    if len(tokens) == 0:
        # No usable tokens left after preprocessing: fall back to a zero vector.
        # NOTE(review): 300 presumably matches the vector width of the loaded
        # spaCy model - confirm if the model is changed.
        posting_vector = np.array([0]*300)
    else:
        posting_vector = np.mean([token.vector for token in tokens], axis = 0)
    target_vecs.append(posting_vector)

In [None]:
detailed_definitions_raw = pd.read_excel('Data/Raw/SSOC2020 Detailed Definitions.xlsx', skiprows = 4)

In [None]:
detailed_definitions = detailed_definitions_raw[(~detailed_definitions_raw['SSOC 2020'].astype('str').str.contains('X')) & (detailed_definitions_raw['SSOC 2020'].astype('str').apply(len) >= 5)]

In [None]:
# Regular-expression patterns mapped to the text that replaces them.
# The bracket pattern is a raw string so that `\(` and `\)` reach the regex
# engine unchanged instead of being treated as (invalid) string escapes.
to_replace = {
    '•': '',
    '\n': '.',
    '<Blank>': '',
    r'\([A-Za-z0-9 ]+\)': ''
}

# detailed_definitions is a filtered selection of detailed_definitions_raw;
# take an explicit copy so the new column is added to an independent frame.
detailed_definitions = detailed_definitions.copy()
detailed_definitions['Jobs Cleaned'] = detailed_definitions['Examples of Job Classified Under this Code']

for k, v in to_replace.items():
    # regex = True is passed explicitly: the default differs between pandas
    # versions, and without it the bracket pattern is matched literally.
    detailed_definitions['Jobs Cleaned'] = detailed_definitions['Jobs Cleaned'].str.replace(k, v, regex = True)

In [None]:
detailed_definitions['Jobs Cleaned']

Write a simple function to identify the top `n` jobs that are closest to the selected SSOC

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def identify_top_n(selected,
                   data,
                   extra_info,
                   target_vecs,
                   top_n = 10,
                   threshold = 0.9):
    """Print the job postings whose vectors are closest to the selected description.

    The tokens in `selected` are averaged into one vector, compared against
    every vector in `target_vecs` by cosine similarity, and the `top_n` most
    similar postings that reach `threshold` are printed, best match first.

    Args:
        selected: Iterable of tokens, each exposing a `.vector` attribute.
        data: DataFrame with 'SSOC 2020' and 'Cleaned_Description' columns.
            Rows are looked up by label using positions in `target_vecs`, so
            the index labels must line up with those positions.
        extra_info: DataFrame with a 'title' column, looked up with the same labels.
        target_vecs: Sequence of vectors, one per row of `data`.
        top_n: Maximum number of postings to report.
        threshold: Minimum cosine similarity a posting needs to be reported.

    Returns:
        None. The results are printed.
    """
    token_vectors = [token.vector for token in selected]
    if len(token_vectors) == 0:
        # Averaging no vectors gives NaN, which cannot be compared with anything.
        print('Selected description has no tokens to compare.')
        return

    source_vec = np.mean(token_vectors, axis = 0).reshape(1, -1)
    # Only one source vector is compared, so keep just that row of the result.
    similarities = cosine_similarity(source_vec, target_vecs)[0]

    # Positions of the top_n highest similarities, highest first.
    ranked = similarities.argsort()[-top_n:][::-1]
    indices = [idx for idx in ranked if similarities[idx] >= threshold]
    if len(indices) == 0:
        print('None meet the threshold required.')
        return

    scores = similarities[indices]
    for (i, row), score in zip(data.loc[indices, :].iterrows(), scores):
        print(f'Index: {i}')
        print(f'Cosine similarity: {score}')
        print(f'Predicted SSOC: {row["SSOC 2020"]}')
        print(f'Job title: {extra_info["title"][i]}')
        print(f'Description: {row["Cleaned_Description"]}')
        print('================================================================')

In [None]:
def find_matching_job_title(data,
                            include,
                            exclude):
    """Return the index labels of rows whose job title matches the search phrases.

    Matching is case-insensitive and by literal substring (not regex).

    Args:
        data: DataFrame with a 'title' column. It is not modified.
        include: Iterable of phrases. A title is selected when it contains
            every word of at least one phrase.
        exclude: Iterable of phrases. A title is dropped when it contains
            any single word of any phrase.

    Returns:
        List of index labels of the matching rows, in their original order.
        Rows with a missing title never match. An empty `include` selects
        nothing.
    """
    # Lower-casing yields a new Series, so the caller's frame stays untouched
    # and no copy of the whole frame is needed.
    titles = data['title'].str.lower()

    def contains(word):
        # regex = False: words such as "c++" or "(apac)" are matched literally.
        # na = False: missing titles count as "does not contain".
        return titles.str.contains(word, regex = False, na = False)

    # Start from real boolean Series so the result is well defined even when
    # `include` and/or `exclude` are empty.
    include_mask = pd.Series(False, index = titles.index)
    for words in include:
        entry_mask = pd.Series(True, index = titles.index)
        # split() without an argument drops the empty strings that repeated
        # spaces would otherwise produce.
        for word in words.lower().split():
            entry_mask = entry_mask & contains(word)
        include_mask = include_mask | entry_mask

    for words in exclude:
        for word in words.lower().split():
            include_mask = include_mask & ~contains(word)

    job_titles_idx = titles.index[include_mask.to_numpy()].tolist()
    return job_titles_idx
            

In [None]:
pd.set_option('display.max_rows', 500)
import copy
import json
# Run this to initialise the dictionary object
# with open('manual_tagging.json', 'r') as outfile:
#     manual_tagging1 = json.load(outfile)

In [None]:
# Run this to export the manual tagging to the JSON file
# with open('manual_tagging.json', 'w') as outfile:
#     json.dump(manual_tagging, outfile)

Set the SSOC you are scanning for here

In [None]:
ssoc = 12221
include_job_titles = [17640, 30290, 34491, 36065, 36409, 37141, 42499]
for detailed_def_job in detailed_definitions['Jobs Cleaned'][detailed_definitions['SSOC 2020'] == str(ssoc)].values[0].split('.'):
    print(detailed_def_job.strip())
    include_job_titles.append(detailed_def_job.strip())

In [None]:
job_titles_idx = find_matching_job_title(extra_info,
                                         include = ['public relations manager'],
                                         exclude = [])

In [None]:
print(job_titles_idx)
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (index label, value) pairs and also exists in older releases.
for i, title in extra_info.loc[job_titles_idx, 'title'].items():
    print(f"{i}: {title}")

In [None]:
print(extra_info.loc[34491, 'description'])

In [None]:
ssoc_index = SSOC_2020[SSOC_2020['SSOC 2020'] == ssoc].index[0]
identify_top_n(SSOC_2020_nlp[ssoc_index], data, extra_info, target_vecs, top_n = 15, threshold = 0.85)

Use this to find job postings with the exact job title

In [None]:
words = ['admin', 'manager'] # what words to include
exclude = ['account', 'database', 'it', 'project'] # what words to exclude
output = copy.deepcopy(extra_info)
for word in words:
    output = output[output['title'].str.lower().str.contains(word)]
for word in exclude:
    output = output[~output['title'].str.lower().str.contains(word)]
job_titles_idx = output.index.tolist()
print(job_titles_idx)

In [None]:
#extra_info.loc[25834, 'description']

Change the list `inputting` here to input the indices of the job postings that you want to manually tag as that SSOC

In [None]:
# Record the manually chosen job-posting indices for the current SSOC.
# NOTE(review): `manual_tagging` must already exist as a dict; the commented-out
# loading cell above assigns to `manual_tagging1` instead - confirm which name is intended.
manual_tagging[ssoc] = []  # start afresh for this SSOC on every run
inputting = []

# Keep only the indices that are not already tagged under some SSOC. The kept
# list is built separately: removing items from a list while looping over it
# skips the element that follows each removal.
inputting_dedup = []
for new_idx in set(inputting):
    for key, tagged in manual_tagging.items():
        if new_idx in tagged:
            print(f'Duplicate detected for index {new_idx} which has already been marked for SSOC {key}')
            break
    else:
        # No SSOC holds this index yet.
        inputting_dedup.append(new_idx)

manual_tagging[ssoc].extend(inputting_dedup)
print(f'SSOC: {ssoc}')
print(manual_tagging[ssoc])

In [None]:
# ssoc_index = SSOC_2020[SSOC_2020['SSOC 2020'] == ssoc].index[0]
# identify_top_n(SSOC_2020_nlp[ssoc_index], data, extra_info, target_vecs, top_n = 15, threshold = 0.85)

### D) Trying out lambada

Ref: https://github.com/makcedward/nlpaug/blob/master/example/lambada-train_model.ipynb

In [None]:
test_data = data.sample(100)

test_data = test_data[['SSOC 2020', 'Cleaned_Description']]

In [None]:
test_data.rename({'SSOC 2020': 'label', 'Cleaned_Description': 'text'}, axis=1, inplace=True)

In [None]:
test_data = test_data[['text', 'label']]

In [None]:
test_data.to_csv('Data/test/classification.csv', index = False)

Training classifier

Download the LAMBADA files from nlpaug.
Copy the scripts from nlpaug into the `Models` folder.
Create the folder path `model/lambada/cls` inside the `Models` folder.
Uploaded c

In [None]:
!python Models/scripts/lambada/train_cls.py  \
    --train_data_path Data/test/classification.csv \
    --val_data_path Data/test/classification.csv \
    --output_dir Models/model/lambada/cls \
    --device cpu \
    --num_epoch 2

Output processing data as mlm_data.txt

In [None]:
!python Models/scripts/lambada/data_processing.py \
    --data_path Data/test/classification.csv \
    --output_dir Data/test

In [None]:
!python Models/scripts/lambada/run_clm.py \
    --tokenizer_name Models/model/lambada/cls \
    --model_name_or_path gpt2 \
    --model_type gpt2 \
    --train_file Data/test/mlm_data.txt \
    --output_dir Models/scripts/lambada/gen \
    --do_train \
    --overwrite_output_dir \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --save_steps=10000 \
    --num_train_epochs 2

Not tested yet

In [None]:
aug = nas.LambadaAug(model_dir='../model/lambada', threshold=0.3, batch_size=4)

In [None]:
aug.augment(['24111', '23619'], n=10)

This entry gave errors: a character is not UTF-8 compliant, but it is not clear which one.

#### E) Implementing Data Augmentation

Create a function that runs all of the augmentation methods and collects their outputs in a dictionary.

The methods are:

1) Using pretrained word embeddings
2) Using back translation
3) Using synonyms
4) Using contextual word embeddings
5) Using sentence augmentation
6) Using summarisation

In [9]:
text = '''Deliver and develop a robust communications strategy and implementation plan which encompass PR and other communication vehicles, including new social platforms, video and other multi-media tools to understand AHL's multiple stakeholders and customers and build equity of AHL brand. Develop powerful narratives and compelling content to promote the suite of festivals and programmes under the initiative (e g media materials, profile stories and videos) to various target audiences, e g media, public, partners etc. Manage media relations and take the lead in handling media queries, drafting media materials and analysing media coverage. Responsible for Communications Planning & Message Development: To develop and execute PR plans craft media materials including press releases, speeches, message houses, media responses and interview talking points. Grow media partnerships organically and coordinate media interest in AHL and ensure regular contact with target media and appropriate response to media requests. Act as AHL's representative with the media and develop strong relationships with media representatives, locally and internationally. Establish and drive internal and external communications strategy that positively affects employees' and customers' understanding, engagement and commitment with the result of an even stronger connection to AHL. Work closely with marketing colleagues on integrating media communications initiatives within marketing strategies and campaigns/activities for key festivals and programmes. Assist Head of Marcomms to develop the department's strategic short and long term plan, and address critical issues in the communications area. The plan will set out objectives for each strategic area of the department and outlines plans and budget for achieving those objectives. Keep abreast of the development in the field of communications and public relations, not-for-profit management and governance. 
To carry out and assume any other duties and responsibilities as and when assigned by the AHL Management.'''

General Parameters explained:
- top_k: 
    - Controls the size of the candidate pool: only the `top_k` highest-scoring tokens are considered for augmentation, so a larger `top_k` allows more candidate tokens. The default value is 100. If the value is `None`, all possible tokens are used.
- top_p: 
    - If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.
- aug_p:
    - Percentage of words that will be augmented.
- aug_min:
    - Minimum number of words that will be augmented.
- aug_max:
    - Maximum number of words that will be augmented. If `None` is passed, the number of augmentations is calculated from `aug_p`. If the number calculated from `aug_p` is smaller than `aug_max`, the calculated number is used; otherwise `aug_max` is used.
- device:
    - If GPU is present use CUDA, if not rely on default value CPU.
- min_length: 
    - For context sentence augmentation, the min length of output text.
- max_length:
    - For context sentence augmentation, the max length of output text.

Specific parameters:
1) Pretrained embeddings substitution
    - asd
2) Back-Translation
    - asd
3) Synonyms
    - asd
4) Contextual word embeddings
    - asd
5) Sentence augmentation
    - asd
6) Summarisation
    - asd

In [44]:
params = {
    'top_k': 5,
    "top_p": 0.9,
    'aug_p': 0.5,
    'aug_min': 10,
    'device': 'cpu',
    'min_length': 100,
    'max_length': 340,
    'wrd_embd': {
        'model_type': 'glove',
        'model_path': 'Models/glove.840B.300d.txt',
        'action': 'substitute'
    },
    'bk_trans': {
        'from_model_name': 'facebook/wmt19-en-de',
        'to_model_name': 'facebook/wmt19-de-en',
    },
    'synonym': {
        'aug_src': 'ppdb',
        'model_path': 'Models/ppdb-2.0-tldr'
    },
    'context_emb': {
        'model_path': 'distilbert-base-uncased',
        'action': 'substitute'
    },
    'sent_aug': {
        'model_path': 'distilgpt2'
    },
    'summ_aug': {
        'model_path': 't5-base'
    }
}

In [45]:
_AUG_SEPARATOR = "==================================================================================================="


def _timed_augment(label, build_augmenter, text):
    """Build one augmenter, run it on `text`, print the result and the time taken.

    Args:
        label: Name of the method, used as the prefix of the printed heading.
        build_augmenter: Zero-argument callable returning the augmenter. It is
            called inside the timed section so model loading is included.
        text: The text to augment.

    Returns:
        Whatever the augmenter's `augment` call returns.
    """
    tic = time.perf_counter()
    augmented = build_augmenter().augment(text, num_thread = 4)
    toc = time.perf_counter()

    print(f"{label} convertion:\n{augmented}")
    print(f"\nTime taken: {toc - tic:0.4f}")
    print(_AUG_SEPARATOR)
    return augmented


def data_aug_collated(text, params):
    """Run every augmentation method on `text` and collect the outputs.

    The methods run one after another: word-embedding substitution, back
    translation, synonym substitution, contextual word embeddings, sentence
    augmentation and summarisation. Each result and its own elapsed time are
    printed as the method finishes.

    Args:
        text: The text to augment.
        params: Dictionary of settings. The shared keys read here are 'top_k',
            'top_p', 'aug_p', 'aug_min', 'device', 'min_length' and
            'max_length'; the per-method keys are 'wrd_embd', 'bk_trans',
            'synonym', 'context_emb', 'sent_aug' and 'summ_aug'.

    Returns:
        Dictionary with the original text under 'orginal_text' (the spelling is
        kept so existing consumers of the key keep working) and one entry per
        method: 'wrd_emb_out', 'bk_trans_out', 'synonym_out',
        'context_emb_out', 'sent_out' and 'summ_out'.
    """
    start = time.perf_counter()
    output = {'orginal_text': text}

    print(f"Original text:\n{text}")
    print(_AUG_SEPARATOR)

    output["wrd_emb_out"] = _timed_augment(
        "Word embedding",
        lambda: naw.WordEmbsAug(model_type = params['wrd_embd']['model_type'],
                                model_path = params['wrd_embd']['model_path'],
                                action = params['wrd_embd']['action'],
                                top_k = params['top_k'],
                                aug_p = params['aug_p'],
                                aug_min = params['aug_min']),
        text)

    output["bk_trans_out"] = _timed_augment(
        "Back translation",
        lambda: naw.BackTranslationAug(from_model_name = params['bk_trans']['from_model_name'],
                                       to_model_name = params['bk_trans']['to_model_name'],
                                       device = params['device'],
                                       max_length = params['max_length']),
        text)

    output["synonym_out"] = _timed_augment(
        "Synonym",
        lambda: naw.SynonymAug(aug_src = params['synonym']['aug_src'],
                               model_path = params['synonym']['model_path'],
                               aug_p = params['aug_p'],
                               aug_min = params['aug_min']),
        text)

    # aug_min takes the configured minimum word count; it used to be given the
    # aug_p fraction by mistake.
    output["context_emb_out"] = _timed_augment(
        "Context embedding",
        lambda: naw.ContextualWordEmbsAug(model_path = params['context_emb']['model_path'],
                                          action = params['context_emb']['action'],
                                          top_k = params['top_k'],
                                          aug_p = params['aug_p'],
                                          aug_min = params['aug_min'],
                                          device = params['device']),
        text)

    # The last two methods are now timed on their own; they used to report
    # timings carried over from the preceding sections.
    output["sent_out"] = _timed_augment(
        "Sentence augmentation",
        lambda: nas.ContextualWordEmbsForSentenceAug(model_path = params['sent_aug']['model_path'],
                                                     min_length = params['min_length'],
                                                     max_length = params['max_length'],
                                                     top_k = params['top_k'],
                                                     top_p = params['top_p'],
                                                     device = params['device']),
        text)

    output["summ_out"] = _timed_augment(
        "Summarisation",
        lambda: nas.AbstSummAug(model_path = params['summ_aug']['model_path'],
                                min_length = params['min_length'],
                                max_length = params['max_length'],
                                top_k = params['top_k']),
        text)

    end = time.perf_counter()
    print("Done convertion")
    print(f"\nTotal time taken: {end - start:0.4f}")

    return output
    

In [46]:
data_aug_collated(text, params)

Original text:
Deliver and develop a robust communications strategy and implementation plan which encompass PR and other communication vehicles, including new social platforms, video and other multi-media tools to understand AHL's multiple stakeholders and customers and build equity of AHL brand. Develop powerful narratives and compelling content to promote the suite of festivals and programmes under the initiative (e g media materials, profile stories and videos) to various target audiences, e g media, public, partners etc. Manage media relations and take the lead in handling media queries, drafting media materials and analysing media coverage. Responsible for Communications Planning & Message Development: To develop and execute PR plans craft media materials including press releases, speeches, message houses, media responses and interview talking points. Grow media partnerships organically and coordinate media interest in AHL and ensure regular contact with target media and appropria

Input length of input_ids is 340, but ``max_length`` is set to 340.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


Context embedding convertion:
deliver and develop a robust communications strategy and implementation plan which encompass pr and other communication vehicles, including new social platforms, video, other multi - media tools to understand ahl's multiple stakeholders and customers and build equity of ahl brand. develop powerful narratives and compelling content to promote the suite of festivals and programmes under the initiative ( e g media materials, news stories and videos ) to various target audiences, e g media, broadcasters, partners etc. manage media relations and take the lead in handling media queries, drafting media materials and analysing media trends. responsible for communications planning & message development : to develop and execute pr plans craft media materials including press releases, speeches, message houses, media responses and interview talking points. grow media partnerships organically and coordinate media interest in ahl and ensure direct contact with existing 

{'orginal_text': "Deliver and develop a robust communications strategy and implementation plan which encompass PR and other communication vehicles, including new social platforms, video and other multi-media tools to understand AHL's multiple stakeholders and customers and build equity of AHL brand. Develop powerful narratives and compelling content to promote the suite of festivals and programmes under the initiative (e g media materials, profile stories and videos) to various target audiences, e g media, public, partners etc. Manage media relations and take the lead in handling media queries, drafting media materials and analysing media coverage. Responsible for Communications Planning & Message Development: To develop and execute PR plans craft media materials including press releases, speeches, message houses, media responses and interview talking points. Grow media partnerships organically and coordinate media interest in AHL and ensure regular contact with target media and approp

Because fastText builds its embeddings from character n-grams, its substitutions often come out as misspelt words, which is not what we want to see. We therefore use the GloVe model instead.

We take a look at the common job description phrases that can be found (non-exhaustive)

In [51]:
common_phrases = [' We are looking for/searching for a candidate who is',
                  ' We are looking for a candidate who can',
                  ' Are you passionate about this job',
                  ' Do you love a job that',
                  ' We are a company that',
                  ' We are a startup that',
                  ' We are a agency that',
                  ' Would you like to work for a company that is',
                  ' Are you interested in working for a company that',
                  ' In this role, you will be responsible for',
                  ' One of your key responsibilities in this job will be',
                  ' If you love this role, then you’ll fit right in our team of',
                  ' If you would like to be part of our team, apply today by',
                  ' Sound like you? Then, send your resumé/CV and cover letter to',
                  ' If this sounds like you, then apply by clicking the button below',
                  ' Description of the duties and responsibilities of the job includes',
                  ' This job is ideal for someone who is',
                  ' Top skills and proficiencies include',
                  ' Dynamic work environment',
                  ' Proven track record',
                  ' Self-starter']

Do random injection of phrases into sentences

In [78]:
def random_insert(text, common_phrases, prob, edit_phrase):
    """Insert a random selection of common phrases between the sentences of `text`.

    The text is split on '.', a share of `common_phrases` is drawn at random,
    each drawn phrase is inserted at a random position, and the pieces are
    joined with '.' again. Phrases are never placed after the final piece.

    Args:
        text: The text to inject phrases into.
        common_phrases: Sequence of candidate phrases.
        prob: Share of `common_phrases` to insert. The resulting count is
            rounded up and limited to between 0 and len(common_phrases).
        edit_phrase: If true, each drawn phrase is first rewritten with a
            DistilBERT contextual word-embedding augmenter, then stripped of
            non-letter characters and capitalised.

    Returns:
        The text with the selected phrases inserted.
    """
    text_list = text.split(".")

    # Limit the sample size so that prob > 1 (or < 0) cannot make
    # random.sample raise.
    sample_size = math.ceil(len(common_phrases) * prob)
    sample_size = max(0, min(len(common_phrases), sample_size))
    selected_phrases = random.sample(common_phrases, sample_size)

    if edit_phrase and selected_phrases:
        # Build the augmenter once instead of once per phrase.
        phrase_aug = naw.ContextualWordEmbsAug(model_path = 'distilbert-base-uncased', action = "substitute")

        edited_phrases = []
        for phrase in selected_phrases:
            augmented = phrase_aug.augment(phrase, num_thread = 4)
            # NOTE(review): some nlpaug releases appear to return a list even
            # for a single string - confirm against the installed version.
            if isinstance(augmented, list):
                augmented = augmented[0] if augmented else phrase
            # Keep letters and spaces only, capitalise, collapse whitespace.
            cleaned = re.sub('[^a-zA-Z ]+', '', augmented).capitalize().split()
            edited_phrases.append(" " + " ".join(cleaned))
        selected_phrases = edited_phrases

    print(f"Phrases that were added in: {selected_phrases}")

    for phrase in selected_phrases:
        # randrange excludes len(text_list), so a phrase never goes after the
        # final piece of the text.
        text_list.insert(random.randrange(len(text_list)), phrase)

    return '.'.join(text_list)

In [81]:
random_insert(text, common_phrases, 0.2, True)

Phrases that were added in: [' This approach is ideal for someone else is', ' We created a agency that', ' We are looking toward any candidate who can', ' If you would like to be part within an organization apply today by', ' Self defense']


" We are looking toward any candidate who can.Deliver and develop a robust communications strategy and implementation plan which encompass PR and other communication vehicles, including new social platforms, video and other multi-media tools to understand AHL's multiple stakeholders and customers and build equity of AHL brand. Develop powerful narratives and compelling content to promote the suite of festivals and programmes under the initiative (e g media materials, profile stories and videos) to various target audiences, e g media, public, partners etc. Manage media relations and take the lead in handling media queries, drafting media materials and analysing media coverage. Responsible for Communications Planning & Message Development: To develop and execute PR plans craft media materials including press releases, speeches, message houses, media responses and interview talking points. If you would like to be part within an organization apply today by. We created a agency that. Grow m