# Natural Languaging Processing 
### This will perform a Name Entity Recognition to extract key geological terms
### This script will also extract key words from each ASX announcement
### Brendan Garner - April 2021

#### Import the necessary modules for NLP

In [2]:
import json
import random
import spacy
from spacy.util import minibatch
from spacy.training import Example
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords

#### Load the Spacy English model

In [4]:
nlp = spacy.load('en_core_web_sm')

#### Get the pipeline from Spacy

In [5]:
ner = nlp.get_pipe("ner")

#### Supply the custom training data

In [6]:
train = [
    ("the drilling has shown an apparent rolling of the eridanus granodiorite at depth",
     {"entities": [(50, 71, "GEOLOGY")]}),
    ("the drill hole failed to intersect any fault breccia at the targeted depth but did intersect significant"
     " high grade gold mineralisation associated with quartz veining in the footwall felsic porphyry below the"
     " sirdar formation",
     {"entities": [(209, 225, "GEOLOGY")]}),
    ("four drill rigs are currently on site at the bardoc tectonic zone targeting areas outside of existing resources",
     {"entities": [(45, 65, "GEOLOGY")]}),
    ("it is interpreted as being on a northerly striking shear that cuts across the kilimanjaro shear zone",
     {"entities": [(78, 100, "GEOLOGY")]}),
    ("there are four main deposits and a multitude of smaller projects within the km land holding providing a large"
     " resource base and excellent exploration potential within the prolific norseman wiluna greenstone belt and"
     " junction of the bardoc tectonic zone btz and the black flag fault",
     {"entities": [(265, 281, "GEOLOGY")]}),
    ("deeper rc drilling will target along the latecomer fault and the confluence of these trends in the june quarter",
     {"entities": [(41, 56, "GEOLOGY")]}),
    ("the mineralisation in the zoroastrian area is predominately associated with a complex array of multiple"
     " dimensional and variable orientated quartz veins and stock works within the differentiated zoroastrian dolerite",
     {"entities": [(195, 215, "GEOLOGY")]}),
    ("the scotia basalt is a known host of mineralisation at prospects such as jackorite successfully mined by"
     " excelsior gold",
     {"entities": [(4, 17, "GEOLOGY")]}),
    ("the yowereena area north of contessa includes the unexplored northern margin of the contessa granite and under"
     " explored prospective archaean greenstone terrane within a region of major gold endowment and production",
     {"entities": [(84, 100, "GEOLOGY")]}),
    ("the company is pleased to provide this exploration report for its owned woodline project in the fraser range"
     " western australia",
     {"entities": [(96, 108, "GEOLOGY")]}),
    ("the jerdacuttup fault a regional scale structure that is the interpreted boundary between the yilgarn craton and"
     " the albany fraser orogen",
     {"entities": [(4, 21, "GEOLOGY"), (94, 108, "GEOLOGY"), (117, 137, "GEOLOGY")]}),
    ("an extensive soil sampling campaign has commenced at polelle to define drill targets with a particular focus on"
     " the albury heath shear zone",
     {"entities": [(116, 139, "GEOLOGY")]}),
]

#### Add a new label for geological terms

In [7]:
ner.add_label("GEOLOGY")

1

#### Disable the other pipes

In [8]:
disable_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

#### Train the model 100 times on the training data

In [9]:
with nlp.disable_pipes(*disable_pipes):
    optimizer = nlp.resume_training()

    for iteration in range(100):
        random.shuffle(train)
        losses = {}

        for batch in minibatch(train, size=8):
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # print(losses)
                nlp.update([example], drop=0.35, sgd=optimizer, losses=losses)

#### This function will extract key geological terms

In [30]:
def find_geology_terms(announcements, random_numbers):
    for number in range(len(random_numbers)):
        announcement = announcements[number]
        doc = nlp(announcement['text'])
        geology_terms = list(set([ent.text for ent in doc.ents]))
        geology_terms = [term for term in geology_terms if not any(c.isdigit() for c in term)]  # remove any strings with digits
        print(geology_terms)
    return geology_terms

#### Import all of the ASX announcements that were saved to json

In [31]:
with open('all_announcements.json') as f:
    announcements = json.load(f)  # list of dictionaries

#### randomly choose 10 ASX announcements to assess model performance

In [32]:
random_numbers = random.sample(range(0, len(announcements)), 10)

#### find geology_terms using the trained model

In [33]:
geology_terms = find_geology_terms(announcements, random_numbers)

['Metallurgy.', 'Attachments)', 'Eridanus Block Model', 'Appropriate maps', 'RC  ', 'physically received', 'Eridanus Granodiorite', 'RC chips', 'Deeper exploratory', 'Penny', 'pit  ', 'Eridanus,', 'NSR denotes', 'samples.', 'Eridanus consists', 'PO Box', 'Kevin  ', 'Kevin Seymour', 'spacing  ', 'Die  Hardy  ', 'RAB holes', 'Quality control', 'analysis  ', 'Core samples', 'Hole ID   ', 'Marda gold', 'NQ diamond', 'Notes', 'appropriate  ', 'Tampia Hill', 'Penny deposits', 'ASX since', 'start to', 'main  ', 'and/or repetitions', 'Richard Jones', 'actual results', 'assay laboratory', 'Penny Granodiorite', 'hole  ', 'Hole ID', 'AAS finish', 'mineralised granodiorite', 'main zone', 'DBA aware', 'first results', 'Diamond core', 'ERIDANUS DEEPS', 'holes  ', '-sericite', 'XRF instruments', 'Eridanus mine', 'sample preparation', 'Notes       ', 'Die Hardy Indicated']
['inclusion  ', 'Penny North', 'Metallurgy.', 'Mount  Magnet', 'laboratory  ', 'Attachments)', 'length  ', 'Tampia was', 'Holleton

['ASX ANNOUNCEMENT', 'Rietfontein  ', 'TGME tailings', 'TGME Gold Project', 'TGME,', 'samples  ', 'Frankfort,', 'Jack  ', 'TGME  ', 'base  ', 'RC  ', 'JORC category', 'Pre‐Feasibility  ', 'Pilgrim’s', 'TGME Tailings Dam', 'Beta  ', 'solid  ', 'Pilgrim’s Rest', 'TGME CIL Plant', ',  ', 'first few', 'Rob Thomson', 'Figure  ', 'TGME Pre‐feasibility Study', 'Theta Hill', 'upper  metres', 'Bridge Street', 'Columbia Hill', 'ASX,', '“Dukes Hill', 'Bevett’s']
['ASX ANNOUNCEMENT', 'TGME,', 'Phil  ', 'Project  Bentley', 'CIL gold', 'TGME gold', 'NYSE/', 'RC  ', 'ASX release', 'Project  ', 'Pilgrim’s', 'OMT  ', 'TGME)', 'first phase', 'Company’s', 'Asanko open', 'RC rig', 'Theta Hill      ', 'AIM  ', 'shallowest gold', ')  ', 'Rob Thomson', 'Theta Hill', 'CIL  ', 'Bridge Street', 'Columbia Hill', 'solid  ', 'Columbia  ']
['main haulages', 'ASX ANNOUNCEMENT', 'backfill paddocks', 'blast  ', 'size fractions', 'Frankfort:', 'Nestor  ', 'samples  ', 'Vaalhoek', 'Southern section', 'Columbia,', 'Malie

### NLP task 2 - extract keywords from each ASX announcement

#### Sort tf-idf scores in descending order

In [35]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

#### This function extracts the feature names with the top 25 tf-idf scores

In [36]:
def extract_topn_from_vector(feature_names, sorted_items, topn=25):
    # Use only top n items from vector
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []

    # Word index and corresponding tf-idf score
    for idx, score in sorted_items:
        # keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    # ceate tuples of feature,score
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    return results

#### Import the corpus saved earlier in json

In [38]:
with open('corpus.json') as f:
    corpus = json.load(f)

#### Tokenize the text and build a vocabulary of known words

In [39]:
keywords_list = []
stop_words = set(stopwords.words("english"))  # create a set of stopwords
cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=15000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

#### Get tf-idf score to identify important words

In [40]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

TfidfTransformer()

#### Get feature names

In [41]:
feature_names = cv.get_feature_names()

#### Print the top 10 keywords for the same 10 randomly selected ASX announcements used in the geology_term extraction

In [42]:
for number in range(len(random_numbers)):
    announcement = announcements[number]
    # Generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([announcement['text']]))

    # Sort the tf-idf vectors by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # Eetract only the top 10
    keywords = extract_topn_from_vector(feature_names, sorted_items, 10)  # dictionary with word: tf-idf score
    keywords = list(keywords.keys())
    print(keywords)

['buckshot', 'penny', 'eridanus', 'awaited assays awaited', 'awaited assays', 'assays awaited assays', 'awaited', 'assays awaited', 'au', 'nsr']
['penny', 'penny west', 'magenta', 'ramelius', 'au', 'eridanus', 'hole', 'edna may', 'edna', 'boomer']
['shannon', 'vivien', 'ramelius', 'yandan', 'hole', 'au', 'sample', 'drill', 'reported', 'aircore']
['theta', 'reef', 'sample', 'minxcon', 'exploration target', 'theta hill', 'bentley', 'data', 'stonewall', 'abandoned due']
['theta', 'theta hill', 'stonewall', 'hill', 'stonewall resources limited', 'stonewall resources', 'columbia hill', 'columbia', 'stonewallresources com', 'stonewallresources']
['reef', 'columbia hill', 'columbia', 'sample', 'minxcon', 'exploration target', 'dec', 'data', 'hill', 'stonewall']
['rock chips rc', 'chips rc rock', 'chips rc', 'rc rock', 'reef', 'rock chips', 'rc rock chips', 'theta', 'sample', 'minxcon']
['tgme', 'hill', 'stonewall', 'theta', 'columbia hill', 'columbia', 'rietfontein', 'dam', 'theta hill', 'tai