In [1]:
import spacy

!spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ----- ---------------------------------- 1.8/12.8 MB 11.2 MB/s eta 0:00:01
     ----------- ---------------------------- 3.7/12.8 MB 9.5 MB/s eta 0:00:01
     ----------------- ---------------------- 5.5/12.8 MB 9.1 MB/s eta 0:00:01
     ---------------------- ----------------- 7.1/12.8 MB 8.9 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 8.9 MB/s eta 0:00:01
     ---------------------------------- ----- 11.0/12.8 MB 9.2 MB/s eta 0:00:01
     ---------------------------------------  12.6/12.8 MB 9.2 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 8.9 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.express as px

In [8]:
# Create empty lists for file names and contents
texts = []
file_names = []

# Iterate through each file in the folder
for _file_name in os.listdir('Wilfred_Owen_Poems'):
# Look for only text files
    if _file_name.endswith('.txt'):
    # Append contents of each text file to text list
        texts.append(open('Wilfred_Owen_Poems' + '/' + _file_name, 'r', encoding='utf-8').read())
        # Append name of each file to file name list
        file_names.append(_file_name)

In [11]:
# Create dictionary object associating each file name with its text
d = {'Filename':file_names,'Text':texts}

In [12]:
# Turn dictionary into a dataframe
poem_df = pd.DataFrame(d)

In [13]:
poem_df.head()

Unnamed: 0,Filename,Text
0,A Terre.txt,A Terre\n\n (Being the philosophy of m...
1,Anthem for Doomed Youth.txt,Anthem for Doomed Youth\n\n\n\n What passi...
2,Apologia pro Poemate Meo.txt,"Apologia pro Poemate Meo\n\n\n\n I, too, s..."
3,Arms and the Boy.txt,Arms and the Boy\n\n\n\n Let the boy try a...
4,Conscious.txt,"Conscious\n\n\n\n His fingers wake, and fl..."


In [14]:
# Remove extra spaces from papers
poem_df['Text'] = poem_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()
poem_df.head()

  poem_df['Text'] = poem_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()


Unnamed: 0,Filename,Text
0,A Terre.txt,A Terre (Being the philosophy of many Soldiers...
1,Anthem for Doomed Youth.txt,Anthem for Doomed Youth What passing-bells for...
2,Apologia pro Poemate Meo.txt,"Apologia pro Poemate Meo I, too, saw God throu..."
3,Arms and the Boy.txt,Arms and the Boy Let the boy try along this ba...
4,Conscious.txt,"Conscious His fingers wake, and flutter up the..."


In [15]:
# Remove .txt from title of each paper
poem_df['Filename'] = poem_df['Filename'].str.replace('.txt', '', regex=True)

In [16]:
poem_df.head()

Unnamed: 0,Filename,Text
0,A Terre,A Terre (Being the philosophy of many Soldiers...
1,Anthem for Doomed Youth,Anthem for Doomed Youth What passing-bells for...
2,Apologia pro Poemate Meo,"Apologia pro Poemate Meo I, too, saw God throu..."
3,Arms and the Boy,Arms and the Boy Let the boy try along this ba...
4,Conscious,"Conscious His fingers wake, and flutter up the..."


In [17]:
# Load nlp pipeline
nlp = spacy.load('en_core_web_sm')

# Check what functions it performs
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [18]:
# Define a function that runs the nlp pipeline on any given input text
def process_text(text):
    return nlp(text)

In [19]:
# Apply the function to the "Text" column, so that the nlp pipeline is called on each student essay
poem_df['Doc'] = poem_df['Text'].apply(process_text)

In [20]:
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    return [(token.text) for token in doc]

In [21]:
# Run the token retrieval function on the doc objects in the dataframe
poem_df['Tokens'] = poem_df['Doc'].apply(get_token)
poem_df.head()

Unnamed: 0,Filename,Text,Doc,Tokens
0,A Terre,A Terre (Being the philosophy of many Soldiers...,"(A, Terre, (, Being, the, philosophy, of, many...","[A, Terre, (, Being, the, philosophy, of, many..."
1,Anthem for Doomed Youth,Anthem for Doomed Youth What passing-bells for...,"(Anthem, for, Doomed, Youth, What, passing, -,...","[Anthem, for, Doomed, Youth, What, passing, -,..."
2,Apologia pro Poemate Meo,"Apologia pro Poemate Meo I, too, saw God throu...","(Apologia, pro, Poemate, Meo, I, ,, too, ,, sa...","[Apologia, pro, Poemate, Meo, I, ,, too, ,, sa..."
3,Arms and the Boy,Arms and the Boy Let the boy try along this ba...,"(Arms, and, the, Boy, Let, the, boy, try, alon...","[Arms, and, the, Boy, Let, the, boy, try, alon..."
4,Conscious,"Conscious His fingers wake, and flutter up the...","(Conscious, His, fingers, wake, ,, and, flutte...","[Conscious, His, fingers, wake, ,, and, flutte..."


In [22]:
tokens = poem_df[['Text', 'Tokens']].copy()
tokens.head()

Unnamed: 0,Text,Tokens
0,A Terre (Being the philosophy of many Soldiers...,"[A, Terre, (, Being, the, philosophy, of, many..."
1,Anthem for Doomed Youth What passing-bells for...,"[Anthem, for, Doomed, Youth, What, passing, -,..."
2,"Apologia pro Poemate Meo I, too, saw God throu...","[Apologia, pro, Poemate, Meo, I, ,, too, ,, sa..."
3,Arms and the Boy Let the boy try along this ba...,"[Arms, and, the, Boy, Let, the, boy, try, alon..."
4,"Conscious His fingers wake, and flutter up the...","[Conscious, His, fingers, wake, ,, and, flutte..."


In [23]:
# Define a function to retrieve lemmas from a doc object
def get_lemma(doc):
    return [(token.lemma_) for token in doc]

# Run the lemma retrieval function on the doc objects in the dataframe
poem_df['Lemmas'] = poem_df['Doc'].apply(get_lemma)

In [25]:
# Define a function to retrieve lemmas from a doc object
def get_pos(doc):
    #Return the coarse- and fine-grained part of speech text for each token in the doc
    return [(token.pos_, token.tag_) for token in doc]

# Define a function to retrieve parts of speech from a doc object
poem_df['POS'] = poem_df['Doc'].apply(get_pos)

In [26]:
list(poem_df['POS'])

[[('DET', 'DT'),
  ('PROPN', 'NNP'),
  ('PUNCT', '-LRB-'),
  ('AUX', 'VBG'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('ADJ', 'JJ'),
  ('NOUN', 'NNS'),
  ('PUNCT', '.'),
  ('PUNCT', '-RRB-'),
  ('VERB', 'VB'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('PUNCT', ':'),
  ('PRON', 'PRP'),
  ('AUX', 'VBP'),
  ('ADJ', 'JJ'),
  ('PUNCT', ','),
  ('CCONJ', 'CC'),
  ('NUM', 'CD'),
  ('NOUN', 'NNS'),
  ('NOUN', 'NN'),
  ('PUNCT', ','),
  ('AUX', 'VB'),
  ('ADJ', 'JJ'),
  ('PUNCT', ':'),
  ('AUX', 'MD'),
  ('PART', 'RB'),
  ('VERB', 'VB'),
  ('NOUN', 'NNS'),
  ('ADV', 'RB'),
  ('PUNCT', ':'),
  ('ADV', 'RB'),
  ('AUX', 'MD'),
  ('PUNCT', '.'),
  ('DET', 'DT'),
  ('NOUN', 'NNS'),
  ('AUX', 'VBP'),
  ('VERB', 'VBN'),
  ('ADP', 'IN'),
  ('PRON', 'PRP'),
  ('PUNCT', ':'),
  ('NOUN', 'NNS'),
  ('PUNCT', '.'),
  ('PRON', 'PRP$'),
  ('NOUN', 'NNS'),
  ('VERB', 'VBP'),
  ('INTJ', 'UH'),
  ('NUM', 'CD'),
  ('ADJ', 'JJ'),
  ('NOUN', 'NNS'),
  ('PUNCT', '.'),
  ('PRON', 'PRP'),
  (

In [31]:
# Define function to extract proper nouns from Doc object
def extract_proper_nouns(doc):
    return [token.text for token in doc if token.pos_ == 'PROPN']

# Apply function to Doc column and store resulting proper nouns in new column
poem_df['Proper_Nouns'] = poem_df['Doc'].apply(extract_proper_nouns)

In [33]:
list(poem_df.loc[[3, 2], 'Proper_Nouns'])

[['Blue', 'Sharp', 'God'],
 ['Poemate',
  'Meo',
  'I',
  'God',
  'War',
  'Merry',
  'exultation--',
  'Faces',
  'Shine',
  'Seraphic',
  'fellowships--',
  'Untold',
  'Joy',
  'slips,--',
  'Knit',
  'rifle',
  'thong',
  'heaven',
  'November']]

In [34]:
# Get all NE labels and assign to variable
labels = nlp.get_pipe("ner").labels

# Print each label and its description
for label in labels:
    print(label + ' : ' + spacy.explain(label))

CARDINAL : Numerals that do not fall under another type
DATE : Absolute or relative dates or periods
EVENT : Named hurricanes, battles, wars, sports events, etc.
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
LANGUAGE : Any named language
LAW : Named documents made into laws.
LOC : Non-GPE locations, mountain ranges, bodies of water
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
ORDINAL : "first", "second", etc.
ORG : Companies, agencies, institutions, etc.
PERCENT : Percentage, including "%"
PERSON : People, including fictional
PRODUCT : Objects, vehicles, foods, etc. (not services)
QUANTITY : Measurements, as of weight or distance
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.


In [35]:
# Define function to extract named entities from doc objects
def extract_named_entities(doc):
    return [ent.label_ for ent in doc.ents]

# Apply function to Doc column and store resulting named entities in new column
poem_df['Named_Entities'] = poem_df['Doc'].apply(extract_named_entities)
poem_df['Named_Entities']

0     [CARDINAL, CARDINAL, CARDINAL, PERSON, DATE, D...
1                                         [ORG, PERSON]
2     [GPE, PERSON, PERSON, TIME, PERSON, ORG, PERSO...
3                                                 [ORG]
4                                        [PERSON, TIME]
5     [ORG, DATE, CARDINAL, CARDINAL, ORG, DATE, NOR...
6       [ORG, PERSON, ORG, ORG, PERSON, PERSON, PERSON]
7                        [TIME, PERSON, TIME, CARDINAL]
8                                           [GPE, TIME]
9     [ORG, NORP, WORK_OF_ART, PERSON, WORK_OF_ART, ...
10    [PERSON, PERSON, PERSON, ORDINAL, DATE, TIME, ...
11    [GPE, ORG, WORK_OF_ART, NORP, ORG, ORG, PERSON...
12          [PERSON, ORDINAL, PERSON, GPE, PERSON, ORG]
13                   [ORG, DATE, ORG, GPE, GPE, PERSON]
14    [PERSON, PERSON, PERSON, PERSON, DATE, ORG, TI...
15    [ORG, PERSON, DATE, PERSON, GPE, PERSON, ORDIN...
16    [ORG, ORG, DATE, ORG, ORG, TIME, PRODUCT, CARD...
17    [PERSON, ORG, CARDINAL, CARDINAL, DATE, TI

In [36]:
# Define function to extract text tagged with named entities from doc objects
def extract_named_entities(doc):
    return [ent for ent in doc.ents]

# Apply function to Doc column and store resulting text in new column
poem_df['NE_Words'] = poem_df['Doc'].apply(extract_named_entities)
poem_df['NE_Words']

0     [(three), (ten), (One), (medals?--Discs), (fif...
1                                [(choirs,--), (Shall)]
2     [(Apologia), (Shine), (Seraphic), (an, hour), ...
3                                             [(Sharp)]
4                                    [(Helped), (dusk)]
5     [(grey, ,, Legless), (last, year), (half), (On...
6     [(Dulce, et, Decorum), (Bent), (Drunk), (Fitti...
7               [(the, night), (Dawn), (hours), (half)]
8                           [(France), (this, morning)]
9     [(Greater, Love, Red), (English), (Love), (Tre...
10    [(Losses), (Longer), (Chance), (first), (large...
11    [(Drooping), (Misery), (Sleeping), (--These), ...
12    [(Isaac), (first), (Behold), (stretch\ed), (lo...
13    [(English, Poetry), (War), (Poetry), (War), (P...
14    [(S., I., W.), (Discipline), (W., B., Yeats), ...
15    [(Smile), (Smile, Head), (Yesterday), (Vast, B...
16    [(Spring, Offensive, Halted), (fed), (May), (S...
17    [(Down), (Titanic), (one), (a, thousand), 

In [37]:
# Extract the first Doc object
doc = poem_df['Doc'][1]

# Visualize named entity tagging in a single paper
displacy.render(doc, style='ent', jupyter=True)

In [38]:
# Save DataFrame as csv (in Google Drive)
# Use this step only to save  csv to your computer's working directory
poem_df.to_csv('Poems_with_spaCy_tags.csv')