In [2]:
import spacy
import numpy as np
import pandas as pd
import textacy
from collections import Counter

In [3]:
# Load the small English spaCy pipeline used by every later cell.
nlp = spacy.load("en_core_web_sm")

# Read the training sentences; the file handle is only needed for the read.
with open("data/preprocessed/train/sentences.txt", encoding="utf8") as text:
    data = text.readlines()

# Flatten the lines into one string: every newline becomes a space and
# backslashes are dropped. The whole corpus is then parsed in a single pass.
string = "".join(
    str(line).replace("\n", " ").replace("\\", "") for line in data
)
doc = nlp(string)


## 1. Tokenization

In [4]:
# Tokens: every element of the parsed doc counts as one token.
count = len(doc)

print(f"Number of tokens {count}")

Number of tokens 15209


### Types

In [5]:
# Types: distinct surface forms among all tokens (no filtering is applied).
count = 0  # reset here; not read by this cell
unique = np.unique([words.text for words in doc])

print(f"The number of types {len(unique)}")

The number of types 3744


### Number of words

In [6]:
# NOTE: method from practice notebook
# Tally every token that is not punctuation, one sentence at a time.
word_frequencies = Counter()

for sentence in doc.sents:
    words = [token.text for token in sentence if not token.is_punct]
    word_frequencies.update(words)

# tokens = everything in doc, words = non-punctuation occurrences,
# types = distinct non-punctuation surface forms
num_tokens = len(doc)
num_words = sum(word_frequencies.values())
num_types = len(word_frequencies.keys())

print(num_tokens, num_words, num_types)

15209 13242 3721


In [7]:
# Collect the text of every token that is not flagged as punctuation.
skip = ["PUNCT"]  # not consulted by the filter below
word_list = [token.text for token in doc if not token.is_punct]

print(len(word_list))

13242


In [8]:
# Report the current length of word_list as the total word count.
print(f"The total number of words {len(word_list)}")

The total number of words 13242


In [9]:
# OLD METHOD
# Keep a token only if its coarse POS is not in `skip` and its lemma is purely
# alphabetic.
# NOTE(review): this rebinds word_list, so every later cell that reads
# word_list (sentence-length and word-length averages) uses this selection
# rather than the is_punct-based one built earlier — confirm that is intended.
skip = ["PUNCT", "SYM", "X", "SPACE", "NUM"]
word_list = [
    token.text
    for token in doc
    if token.pos_ not in skip and token.lemma_.isalpha()
]

print(len(word_list))

12707


In [10]:
# Report the length of word_list again, now that the preceding cell rebuilt it.
print(f"The total number of words {len(word_list)}")

The total number of words 12707


### Average number of words per sentence

In [11]:
# Mean sentence length: size of word_list divided by the number of sentences
# spaCy segmented in doc.
sentcount = sum(1 for _sent in doc.sents)
wordcount = len(word_list)
print(f"The average words per sentence is {wordcount/sentcount}")

The average words per sentence is 18.937406855439644


### Average word length

In [12]:
# Character length of every entry in word_list, in the same order.
word_lengths = [len(word) for word in word_list]

In [13]:
# Mean and standard deviation of the character lengths in word_lengths.
print(f"The average word length is {np.mean(word_lengths)} +/- {np.std(word_lengths)}")

The average word length is 4.973007004013536 +/- 2.585102110898612


## 2. Word Classes

In [14]:
# Fine-grained POS tag (token.tag_) of every token, in document order.
tokens = [token.tag_ for token in doc]

In [15]:
# Reduce the tag list to its distinct values.
tokens = np.unique(np.array(tokens))

In [16]:
# Start a zero counter for every distinct tag.
token_count = dict.fromkeys(tokens, 0)

In [17]:
# Tally how often each fine-grained tag occurs in the document.
for tag in (token.tag_ for token in doc):
    token_count[tag] = token_count[tag] + 1

In [18]:
# The ten tags with the highest counts, most frequent first.
my_keys = sorted(token_count, key=token_count.get, reverse=True)[:10]

In [19]:
# Display the ten most frequent tags.
my_keys

['NN', 'NNP', 'IN', 'DT', 'JJ', 'NNS', ',', 'VBD', '.', 'VBN']

In [20]:
# One entry per top tag; the placeholder 0 is overwritten by the next cell.
uberdict = dict.fromkeys(my_keys, 0)

In [21]:
for i in my_keys:

    # every token that carries the fine-grained tag i
    tagged = [token for token in doc if token.tag_ == i]

    # universal POS labels seen together with this tag (deduplicated)
    uniPOS = list(set([token.pos_ for token in tagged]))

    # surface forms for this tag and how many occurrences there are in total
    all_words = [token.text for token in tagged]
    fineg_count = len(all_words)

    # zero-initialised tally keyed by the distinct forms
    words = np.unique(np.array(all_words))
    word_dict = dict.fromkeys(words, 0)
    for tok in all_words:
        word_dict[tok] = word_dict[tok] + 1

    # three most frequent forms and the single least frequent one
    keys_freqtokens = sorted(word_dict, key=word_dict.get, reverse=True)[:3]
    keys_unfreqtokens = sorted(word_dict, key=word_dict.get)[:1]

    uberdict[i] = [uniPOS, fineg_count, keys_freqtokens, keys_unfreqtokens]
    

In [22]:
# Display the collected per-tag summary.
uberdict

{'NN': [['PRON', 'NOUN'], 2055, ['year', 'report', 'time'], ['A.']],
 'NNP': [['PROPN', 'AUX'], 1793, ['US', 'President', 'U.S.'], ['-']],
 'IN': [['ADP', 'SCONJ'], 1744, ['of', 'in', 'to'], ['About']],
 'DT': [['PRON', 'DET'], 1379, ['the', 'a', 'The'], ['An']],
 'JJ': [['ADJ'], 872, ['other', 'Russian', 'presidential'], ['21st']],
 'NNS': [['NOUN'], 781, ['ants', 'troops', 'people'], ['1970s']],
 ',': [['PUNCT'], 699, [',', ';', '…'], [';']],
 'VBD': [['VERB', 'AUX'], 658, ['was', 'were', 'said'], ['acknowledged']],
 '.': [['PUNCT'], 655, ['.', '?', '!'], ['!']],
 'VBN': [['VERB', 'AUX'], 501, ['been', 'accused', 'killed'], ['-']]}

In [23]:
# Share of each listed tag relative to the combined count of the listed tags.
# NOTE(review): the denominator only covers the tags present in uberdict,
# not every token in doc — confirm this is the intended baseline.
total_tags = sum(entry[1] for entry in uberdict.values())

for i in uberdict.keys():
    # Rebuild the entry from its first four fields
    # [uniPOS, count, most frequent, least frequent] so that re-running this
    # cell replaces the share instead of appending another copy of it.
    uberdict[i] = uberdict[i][:4] + [uberdict[i][1] / total_tags]

In [35]:
#uberdict

In [24]:
# One row per tag, one positional column per element of the tag's list.
df = pd.DataFrame.from_dict(uberdict,orient="index")

In [25]:
# Display the per-tag summary table.
df

Unnamed: 0,0,1,2,3,4
NN,"[PRON, NOUN]",2055,"[year, report, time]",[A.],0.18452
NNP,"[PROPN, AUX]",1793,"[US, President, U.S.]",[-],0.160995
IN,"[ADP, SCONJ]",1744,"[of, in, to]",[About],0.156595
DT,"[PRON, DET]",1379,"[the, a, The]",[An],0.123821
JJ,[ADJ],872,"[other, Russian, presidential]",[21st],0.078298
NNS,[NOUN],781,"[ants, troops, people]",[1970s],0.070127
",",[PUNCT],699,"[,, ;, …]",[;],0.062764
VBD,"[VERB, AUX]",658,"[was, were, said]",[acknowledged],0.059082
.,[PUNCT],655,"[., ?, !]",[!],0.058813
VBN,"[VERB, AUX]",501,"[been, accused, killed]",[-],0.044985


## 3. N-Grams

In [175]:
def ngram(doc, n, pos = False):
    """Build all consecutive n-grams over the alphabetic tokens of ``doc``.

    Tokens whose ``is_alpha`` is false are dropped before windowing, and the
    token stream is not reset between sentences, so an n-gram can bridge
    removed punctuation and sentence boundaries.

    Args:
        doc: iterable of tokens exposing ``is_alpha`` and ``tag_``.
        n: window size.
        pos: when true, each n-gram holds the tokens' ``tag_`` values;
            otherwise it holds the token objects themselves.

    Returns:
        A list of n-grams, each a list with ``n`` items, in document order.
        Empty when fewer than ``n`` alphabetic tokens are available.
    """
    # Keep either the tag or the token itself for every alphabetic token.
    kept = [
        (token.tag_ if pos else token)
        for token in doc
        if token.is_alpha
    ]

    # Slide a window of width n across the kept items.
    window_starts = range(len(kept) - (n - 1))
    return [
        [kept[start + offset] for offset in range(n)]
        for start in window_starts
    ]

def count(ngrams):
    """Count how often each n-gram occurs.

    Args:
        ngrams: iterable of n-grams, each an iterable of items.

    Returns:
        dict mapping the space-joined string form of an n-gram to its number
        of occurrences, in order of first appearance.
    """
    tally = {}
    for gram in ngrams:
        # Items are stringified so token objects and tags share one key format.
        key = ' '.join(str(item) for item in gram)
        tally[key] = tally.get(key, 0) + 1
    return tally

def plot_dist(grams):
    """Plot n-gram counts ordered from most to least frequent.

    Args:
        grams: mapping from n-gram to its count; it is turned into a
            single-column frame indexed by n-gram.
    """
    table = pd.DataFrame.from_dict(grams, orient="index")
    ranked = table.sort_values(by=0, ascending=False)
    # x tick labels are rotated 90 degrees
    ranked.plot(rot=90)

In [176]:
# Bigrams over the tokens themselves, with their frequency table.
bigrams = ngram(doc, n=2)
bigram_counts = count(bigrams)

# The same over fine-grained POS tags.
bigrams_types = ngram(doc, n=2, pos=True)
bigram_counts_types = count(bigrams_types)

In [181]:
# Rank token bigrams by count and show the three most frequent.
df = pd.DataFrame.from_dict(bigram_counts, orient="index")
df = df.sort_values(by=0, ascending=False)
df.head(3)

Unnamed: 0,0
of the,82
in the,54
to the,43


In [180]:
# Rank POS-tag bigrams by count and show the three most frequent.
df = pd.DataFrame.from_dict(bigram_counts_types, orient="index")
df = df.sort_values(by=0, ascending=False)
df.head(3)

Unnamed: 0,0
NNP NNP,699
DT NN,679
IN DT,597


In [183]:
# Trigrams over the tokens themselves, with their frequency table.
trigrams = ngram(doc, n=3)
trigram_counts = count(trigrams)

# The same over fine-grained POS tags.
trigrams_types = ngram(doc, n=3, pos=True)
trigram_counts_types = count(trigrams_types)

# Rank token trigrams by count and show the three most frequent.
df = pd.DataFrame.from_dict(trigram_counts, orient="index")
df = df.sort_values(by=0, ascending=False)
df.head(3)

Unnamed: 0,0
in response to,7
Republican Party presidential,6
local time UTC,5


In [184]:
# Rank POS-tag trigrams by count and show the three most frequent.
df = pd.DataFrame.from_dict(trigram_counts_types, orient="index")
df = df.sort_values(by=0, ascending=False)
df.head(3)

Unnamed: 0,0
IN DT NN,303
NNP NNP NNP,271
DT NN IN,219


## 4. Lemmatization

In [26]:
# Lemma of every token, in document order.
lemmas = [token.lemma_ for token in doc]

In [27]:
# Display the lemma list.
lemmas

['child',
 'be',
 'think',
 'to',
 'be',
 'age',
 'three',
 ',',
 'eight',
 ',',
 'and',
 'ten',
 'year',
 ',',
 'alongside',
 'an',
 'eighteen',
 '-',
 'month',
 '-',
 'old',
 'baby',
 '.',
 'we',
 'mix',
 'different',
 'concentration',
 'of',
 'ROS',
 'with',
 'the',
 'spore',
 ',',
 'plate',
 'they',
 'out',
 'on',
 'petridishe',
 'with',
 'an',
 'agar',
 '-',
 'solution',
 'where',
 'fungus',
 'can',
 'grow',
 'on',
 '.',
 'they',
 'feel',
 'they',
 'be',
 'under',
 '-',
 'represent',
 'in',
 'high',
 'education',
 'and',
 'be',
 'suffer',
 'in',
 'a',
 'regional',
 'economic',
 'downturn',
 '.',
 'especially',
 'as',
 'it',
 'concern',
 'a',
 'third',
 'party',
 'build',
 'up',
 'its',
 'military',
 'presence',
 'near',
 'our',
 'border',
 '.',
 'Police',
 'say',
 'three',
 'child',
 'be',
 'hospitalise',
 'for',
 '"',
 'severe',
 'dehydration',
 '"',
 '.',
 'Virginia',
 'Álvarez',
 ',',
 'who',
 'write',
 'the',
 'report',
 ',',
 'note',
 ',',
 '"',
 'instead',
 'of',
 'listen',


In [28]:
# Materialise the sentence spans and show the first one.
test = [sent for sent in doc.sents]
test[0]

children are thought to be aged three , eight , and ten years , alongside an eighteen-month-old baby .

In [29]:
# lemmas maps lemma -> {surface form -> [indices of sentences containing it]}.
# lemma_set / token_set record each lemma and each (lemma, form) pairing the
# first time it is seen.
lemmas = {}
lemma_set = []
token_set = []

sentences = list(doc.sents)
for i in range(len(sentences)):
    for token in sentences[i]:
        if token.is_punct:
            continue

        # first sighting of this lemma
        if token.lemma_ not in lemmas:
            lemmas[token.lemma_] = {}
            lemma_set.append(token.lemma_)

        forms = lemmas[token.lemma_]
        if token.text in forms:
            # known form: remember one more sentence index (repeats are kept)
            forms[token.text].append(i)
        else:
            # first sighting of this form under this lemma
            forms[token.text] = [i]
            token_set.append(token.text)

In [30]:
# List every lemma that appears with more than three distinct surface forms.
for k, forms in lemmas.items():
    if len(forms) > 3:
        print(k)
        

be
say
report
start
do
early
challenge
fall
move
fly
call
use
close
appear
find
go
seek
kill
measure
claim
result
man
include
give
respond
name
see
add
make
run
take
remain
election
average


In [31]:
# Surface forms recorded for the lemma 'challenge'.
print(lemmas['challenge'].keys())
print()

# One sentence index per form: the first recorded sentence for three of them,
# the second recorded sentence for the base form.
lem1tok1 = lemmas['challenge']['challenged'][0]
lem1tok2 = lemmas['challenge']['challenging'][0]
lem1tok3 = lemmas['challenge']['challenges'][0]
lem1tok4 = lemmas['challenge']['challenge'][1]

# Print the four example sentences separated by blank lines.
print(
    sentences[lem1tok1],
    sentences[lem1tok2],
    sentences[lem1tok3],
    sentences[lem1tok4],
    sep="\n\n",
)

dict_keys(['challenged', 'challenging', 'challenges', 'challenge'])

That decision was the one challenged unsuccessfully in the High Court .

Next week , on January 24 , the Supreme Court is due to deliver a decision in a case challenging the government 's right to issue Article 50 — which starts the Brexit negotiations — without the consultation of Parliament .

U.S. presidential candidate Mark Everson challenges debate exclusion U.S. Republican Party presidential candidate Mark Everson , former commissioner of the Internal Revenue Service ( IRS ) , filed a complaint on Monday with the Federal Election Commission ( FEC ) to challenge his exclusion from Thursday 's first Fox News Republican Party presidential debate .

Election law expert Richard Winger , publisher of Ballot Access News , says Everson is " completely correct " in his challenge .


## 5. Named Entity Recognition

In [33]:
# Create sentence object and an array for entity information
doc_sent = doc.sents
array = np.zeros(len(list(doc.sents))).tolist()

In [34]:
# Loop over sentences save entities in array
counter = 0
first_five = []
for sent in doc.sents:
    first_five.append(sent)
    temp_entity = []
    sent_text = nlp(str(sent))
    
    for ent in sent_text.ents:
        temp_entity.append([ent.text, ent.label_])
    
    array[counter] = temp_entity
    counter += 1

In [135]:
# Get entities and entity labels back from array
named_entities = 0
entity_labels = []

for i in range(len(array)):
    
    temp_array = np.array(array[i]).T
    
    if temp_array.size != 0:
        named_entities += len(temp_array[0]) + 1
        entity_labels.append(temp_array[1].tolist())
    
print(f"Number of named entities: {named_entities}")    
print(f"Number of different entity labels: {len(set(np.hstack(entity_labels).tolist()))}")

Number of named entities: 2108
Number of different entity labels: 17


In [150]:
# Analyse first five sentences
for i in range(0,5):
    print(f"Sentence {i+1}:")
    print(first_five[i])
    if np.array(array[i]).T.size != 0:
        print(f"Entities: {np.array(array[i]).T[0]}")
        print(f"Labels: {np.array(array[i]).T[1]}")
    
    print()

Sentence 1:
children are thought to be aged three , eight , and ten years , alongside an eighteen-month-old baby .
Entities: ['three , eight' 'ten years' 'eighteen-month-old']
Labels: ['DATE' 'DATE' 'DATE']

Sentence 2:
We mixed different concentrations of ROS with the spores , plated them out on petridishes with an agar-solution where fungus can grow on .
Entities: ['ROS']
Labels: ['GPE']

Sentence 3:
They feel they are under-represented in higher education and are suffering in a regional economic downturn .

Sentence 4:
Especially as it concerns a third party building up its military presence near our borders .
Entities: ['third']
Labels: ['ORDINAL']

Sentence 5:
Police said three children were hospitalised for " severe dehydration " .
Entities: ['three']
Labels: ['CARDINAL']

