## Import and Install

In [1]:
import pandas as pd
import requests

In [2]:
# Install and import spacy and plotly.
!pip install spaCy
!pip install plotly
!pip install nbformat==5.1.2



In [3]:

# Import spacy
import spacy

# Install English language model
!spacy download en_core_web_sm

# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.graph_objects as go
import plotly.express as px

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 660.6 kB/s eta 0:00:20
     -- ------------------------------------- 0.7/12.8 MB 8.3 MB/s eta 0:00:02
     ---- ----------------------------------- 1.6/12.8 MB 12.4 MB/s eta 0:00:01
     -------- ------------------------------- 2.8/12.8 MB 16.1 MB/s eta 0:00:01
     ----------- ---------------------------- 3.8/12.8 MB 17.3 MB/s eta 0:00:01
     --------------- ------------------------ 4.9/12.8 MB 18.2 MB/s eta 0:00:01
     ------------------- -------------------- 6.1/12.8 MB 19.6 MB/s eta 0:00:01
     ---------------------- ----------------- 7.1/12.8 MB 19.8 MB/s eta 0:00:01
     ------------------------- -------------- 8.2/12.8 MB 20.1 MB/s eta 0:00:01
     ----------------------------

## Download corpus

In [4]:
response1 = requests.get("https://www.gutenberg.org/cache/epub/66446/pg66446.txt")
response2 = requests.get("https://www.gutenberg.org/cache/epub/67160/pg67160.txt")
response3 = requests.get("https://www.gutenberg.org/cache/epub/67173/pg67173.txt")


In [5]:
text1 = response1.text
text2 = response2.text
text3 = response3.text

Create empty lists for file names and contents

In [6]:
texts = [text1, text2, text3]
file_names = ["The Plymouth Express Affair", "The Hunter's Lodge Case","The Missing Will"]
print(os.getcwd())
     

C:\Users\70794


Create dictionary object associating each file name with its text


In [7]:
d = {'Filename':file_names,'Text':texts}

Turn dictionary into a dataframe


In [8]:
agatha_df = pd.DataFrame(d)

In [54]:
agatha_df.head()

Unnamed: 0,Filename,Text,Doc,Tokens,Lemmas,POS,Proper_Nouns,Named_Entities,NE_Words
0,The Plymouth Express Affair,﻿The Project Gutenberg eBook of The Plymouth E...,"(﻿The, Project, Gutenberg, eBook, of, The, Ply...","[﻿The, Project, Gutenberg, eBook, of, The, Ply...","[﻿the, Project, Gutenberg, eBook, of, the, Ply...","[(NOUN, NN), (PROPN, NNP), (PROPN, NNP), (PROP...","[Project, Gutenberg, eBook, Plymouth, Express,...","[PERSON, ORG, GPE, ORG, GPE, PRODUCT, PERSON, ...","[(Project, Gutenberg, eBook), (The, Plymouth, ..."
1,The Hunter's Lodge Case,﻿The Project Gutenberg eBook of The Hunter's L...,"(﻿The, Project, Gutenberg, eBook, of, The, Hun...","[﻿The, Project, Gutenberg, eBook, of, The, Hun...","[﻿the, Project, Gutenberg, eBook, of, the, Hun...","[(NOUN, NN), (PROPN, NNP), (PROPN, NNP), (PROP...","[Project, Gutenberg, eBook, Hunter, Lodge, Cas...","[PERSON, PERSON, GPE, ORG, GPE, PRODUCT, PERSO...","[(Project, Gutenberg, eBook), (Lodge, Case), (..."
2,The Missing Will,﻿The Project Gutenberg eBook of The Missing Wi...,"(﻿The, Project, Gutenberg, eBook, of, The, Mis...","[﻿The, Project, Gutenberg, eBook, of, The, Mis...","[﻿the, Project, Gutenberg, eBook, of, the, Mis...","[(NOUN, NN), (PROPN, NNP), (PROPN, NNP), (PROP...","[Project, Gutenberg, eBook, Missing, United, S...","[PERSON, GPE, ORG, GPE, PRODUCT, PERSON, DATE,...","[(Project, Gutenberg, eBook), (the, United, St..."


## Clean the text

In [12]:

agatha_df['Text'] = agatha_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()
agatha_df.head()

Unnamed: 0,Filename,Text
0,The Plymouth Express Affair,﻿The Project Gutenberg eBook of The Plymouth E...
1,The Hunter's Lodge Case,﻿The Project Gutenberg eBook of The Hunter's L...
2,The Missing Will,﻿The Project Gutenberg eBook of The Missing Wi...


In [16]:
# Remove .txt from title of each paper
agatha_df['Filename'] = agatha_df['Filename'].str.replace('.txt', '', regex=True)

# Rename column from paper ID to Title
agatha_df.rename(columns={"PAPER ID": "Filename"}, inplace=True)

## Creating Doc

In [22]:
# Load nlp pipeline
nlp = spacy.load('en_core_web_sm')

# Check what functions it performs
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [23]:
# Define a function that runs the nlp pipeline on any given input text
def process_text(text):
    return nlp(text)

In [55]:
# Loop through each token in doc object
for token in doc:
    # Print text and part of speech for each
    print(token.text, token.pos_)

﻿The NOUN
Project PROPN
Gutenberg PROPN
eBook PROPN
of ADP
The DET
Hunter PROPN
's PART
Lodge PROPN
Case PROPN
This DET
ebook NOUN
is AUX
for ADP
the DET
use NOUN
of ADP
anyone PRON
anywhere ADV
in ADP
the DET
United PROPN
States PROPN
and CCONJ
most ADJ
other ADJ
parts NOUN
of ADP
the DET
world NOUN
at ADP
no DET
cost NOUN
and CCONJ
with ADP
almost ADV
no PRON
restrictions NOUN
whatsoever ADV
. PUNCT
You PRON
may AUX
copy VERB
it PRON
, PUNCT
give VERB
it PRON
away ADV
or CCONJ
re VERB
- VERB
use VERB
it PRON
under ADP
the DET
terms NOUN
of ADP
the DET
Project PROPN
Gutenberg PROPN
License PROPN
included VERB
with ADP
this DET
ebook NOUN
or CCONJ
online NOUN
at ADP
www.gutenberg.org NOUN
. PUNCT
If SCONJ
you PRON
are AUX
not PART
located VERB
in ADP
the DET
United PROPN
States PROPN
, PUNCT
you PRON
will AUX
have VERB
to PART
check VERB
the DET
laws NOUN
of ADP
the DET
country NOUN
where SCONJ
you PRON
are AUX
located VERB
before ADP
using VERB
this DET
eBook PROPN
. PUNCT
Title NOUN


In [25]:
# Define a function that runs the nlp pipeline on any given input text
def process_text(text):
    return nlp(text)

In [26]:
# Apply the function to the "Text" column, so that the nlp pipeline is called on each student essay
agatha_df['Doc'] = agatha_df['Text'].apply(process_text)

## Processing 

In [27]:
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    return [(token.text) for token in doc]

In [28]:
# Run the token retrieval function on the doc objects in the dataframe
agatha_df['Tokens'] = agatha_df['Doc'].apply(get_token)
agatha_df.head()

Unnamed: 0,Filename,Text,Doc,Tokens
0,The Plymouth Express Affair,﻿The Project Gutenberg eBook of The Plymouth E...,"(﻿The, Project, Gutenberg, eBook, of, The, Ply...","[﻿The, Project, Gutenberg, eBook, of, The, Ply..."
1,The Hunter's Lodge Case,﻿The Project Gutenberg eBook of The Hunter's L...,"(﻿The, Project, Gutenberg, eBook, of, The, Hun...","[﻿The, Project, Gutenberg, eBook, of, The, Hun..."
2,The Missing Will,﻿The Project Gutenberg eBook of The Missing Wi...,"(﻿The, Project, Gutenberg, eBook, of, The, Mis...","[﻿The, Project, Gutenberg, eBook, of, The, Mis..."


In [29]:
tokens = agatha_df[['Text', 'Tokens']].copy()
tokens.head()

Unnamed: 0,Text,Tokens
0,﻿The Project Gutenberg eBook of The Plymouth E...,"[﻿The, Project, Gutenberg, eBook, of, The, Ply..."
1,﻿The Project Gutenberg eBook of The Hunter's L...,"[﻿The, Project, Gutenberg, eBook, of, The, Hun..."
2,﻿The Project Gutenberg eBook of The Missing Wi...,"[﻿The, Project, Gutenberg, eBook, of, The, Mis..."


In [30]:
# Define a function to retrieve lemmas from a doc object
def get_lemma(doc):
    return [(token.lemma_) for token in doc]

# Run the lemma retrieval function on the doc objects in the dataframe
agatha_df['Lemmas'] = agatha_df['Doc'].apply(get_lemma)

I want to count the occurrences of the word murder.

In [63]:
print(f'"murder" appears in the text tokens column ' + str(agatha_df['Tokens'].apply(lambda x: x.count('murder')).sum()) + ' times.')
print(f'"murder" appears in the lemmas column ' + str(agatha_df['Lemmas'].apply(lambda x: x.count('murder')).sum()) + ' times.')

"murder" appears in the text tokens column 6 times.
"murder" appears in the lemmas column 9 times.


## Annotation

In [64]:
# Define a function to retrieve lemmas from a doc object
def get_pos(doc):
    #Return the coarse- and fine-grained part of speech text for each token in the doc
    return [(token.pos_, token.tag_) for token in doc]

# Define a function to retrieve parts of speech from a doc object
agatha_df['POS'] = agatha_df['Doc'].apply(get_pos)

In [65]:
# Create a list of part of speech tags
list(agatha_df['POS'])

[[('NOUN', 'NN'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('AUX', 'VBZ'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('PRON', 'NN'),
  ('ADV', 'RB'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('CCONJ', 'CC'),
  ('ADJ', 'JJS'),
  ('ADJ', 'JJ'),
  ('NOUN', 'NNS'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('CCONJ', 'CC'),
  ('ADP', 'IN'),
  ('ADV', 'RB'),
  ('PRON', 'DT'),
  ('NOUN', 'NNS'),
  ('ADV', 'RB'),
  ('PUNCT', '.'),
  ('PRON', 'PRP'),
  ('AUX', 'MD'),
  ('VERB', 'VB'),
  ('PRON', 'PRP'),
  ('PUNCT', ','),
  ('VERB', 'VB'),
  ('PRON', 'PRP'),
  ('ADV', 'RB'),
  ('CCONJ', 'CC'),
  ('VERB', 'VB'),
  ('VERB', 'VB'),
  ('VERB', 'VB'),
  ('PRON', 'PRP'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NNS'),
  ('A

In [66]:
# Define function to extract proper nouns from Doc object
def extract_proper_nouns(doc):
    return [token.text for token in doc if token.pos_ == 'PROPN']

# Apply function to Doc column and store resulting proper nouns in new column
agatha_df['Proper_Nouns'] = agatha_df['Doc'].apply(extract_proper_nouns)

In [67]:
# Get all NE labels and assign to variable
labels = nlp.get_pipe("ner").labels

# Print each label and its description
for label in labels:
    print(label + ' : ' + spacy.explain(label))

CARDINAL : Numerals that do not fall under another type
DATE : Absolute or relative dates or periods
EVENT : Named hurricanes, battles, wars, sports events, etc.
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
LANGUAGE : Any named language
LAW : Named documents made into laws.
LOC : Non-GPE locations, mountain ranges, bodies of water
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
ORDINAL : "first", "second", etc.
ORG : Companies, agencies, institutions, etc.
PERCENT : Percentage, including "%"
PERSON : People, including fictional
PRODUCT : Objects, vehicles, foods, etc. (not services)
QUANTITY : Measurements, as of weight or distance
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.


In [68]:
# Define function to extract named entities from doc objects
def extract_named_entities(doc):
    return [ent.label_ for ent in doc.ents]

# Apply function to Doc column and store resulting named entities in new column
agatha_df['Named_Entities'] = agatha_df['Doc'].apply(extract_named_entities)
agatha_df['Named_Entities']

0    [PERSON, ORG, GPE, ORG, GPE, PRODUCT, PERSON, ...
1    [PERSON, PERSON, GPE, ORG, GPE, PRODUCT, PERSO...
2    [PERSON, GPE, ORG, GPE, PRODUCT, PERSON, DATE,...
Name: Named_Entities, dtype: object

In [69]:
# Define function to extract text tagged with named entities from doc objects
def extract_named_entities(doc):
    return [ent for ent in doc.ents]

# Apply function to Doc column and store resulting text in new column
agatha_df['NE_Words'] = agatha_df['Doc'].apply(extract_named_entities)
agatha_df['NE_Words']

0    [(Project, Gutenberg, eBook), (The, Plymouth, ...
1    [(Project, Gutenberg, eBook), (Lodge, Case), (...
2    [(Project, Gutenberg, eBook), (the, United, St...
Name: NE_Words, dtype: object

In [70]:
# Extract the first Doc object
doc = agatha_df['Doc'][1]

# Visualize named entity tagging in a single paper
displacy.render(doc, style='ent', jupyter=True)

In [72]:
entity_freq = Counter()

for index, row in agatha_df.iterrows():
    doc = nlp(row['Text'])
    for ent in doc.ents:
        entity_freq[ent.label_] += 1

for ent_type, freq in entity_freq.items():
    print(f"{ent_type}: {freq} times")

PERSON: 463 times
ORG: 212 times
GPE: 149 times
PRODUCT: 22 times
DATE: 104 times
LAW: 23 times
NORP: 12 times
ORDINAL: 26 times
TIME: 57 times
EVENT: 3 times
WORK_OF_ART: 21 times
FAC: 5 times
MONEY: 5 times
CARDINAL: 111 times
PERCENT: 3 times
LOC: 3 times
QUANTITY: 4 times


## Output Dataset

In [45]:
agatha_df.to_csv('Agatha_with_spaCy_tags.csv')