# Code Demo Two for Text Analysis with Python, covering:

* Review of loading and cleaning text data
* Lexicon-based sentiment analysis
* Part-of-Speech (POS) Tagging
* Named Entity Recognition (NER)

## Preparing Text for Bag-of-Words Analysis

In [None]:
# Reading in the text data from the text file

import os

# Folder that holds the Grimms text files read throughout this notebook
grimms_folder = "/content/drive/MyDrive/TRIADS_workshops/grimms"

# Using "with" guarantees the file is closed as soon as we have read it
with open(os.path.join(grimms_folder, "rapunzel.txt"), encoding="utf-8") as rapunzel_file:
    rapunzel = rapunzel_file.read()



In [None]:
# Print the full raw text of Rapunzel that was read in from the file
print(rapunzel)

In [None]:
# Split the Rapunzel text into word tokens with NLTK

import nltk
from nltk.tokenize import word_tokenize

# Fetch the "punkt_tab" resource before tokenizing
nltk.download('punkt_tab')

tokens = word_tokenize(rapunzel)
print(tokens)



In [None]:
# Removing stopwords

from nltk.corpus import stopwords
nltk.download('stopwords')

# Keep the word list under its own name so the imported "stopwords" corpus
# object is not replaced by a plain list. A set makes each membership check fast.
stop_words = set(stopwords.words("english"))

# The tokens have not been lowercased yet (that happens in a later cell), so
# compare the lowercase form of each token; otherwise capitalized stopwords
# such as "The" or "And" would slip through the filter
tokens = [token for token in tokens if token.lower() not in stop_words]

print(tokens)

In [None]:
# Strip out tokens that are nothing but a single punctuation mark

import string

# ASCII punctuation characters plus the two curly single quotes
punctuation = [*string.punctuation, "‘", "’"]

kept_tokens = []
for token in tokens:
    if token in punctuation:
        continue
    kept_tokens.append(token)
tokens = kept_tokens

print(tokens)

In [None]:
# Count how many tokens remain at this point
len(tokens)

In [None]:
# Convert every token to lowercase

tokens = list(map(str.lower, tokens))

print(tokens)

# Demo: Sentiment Analysis--Lexicon/Dictionary Approach

In [None]:
# Set up NLTK's sentiment analyzer tool

# Fetch the VADER lexicon
nltk.download('vader_lexicon')

# Bring in the analyzer class from NLTK
from nltk.sentiment import SentimentIntensityAnalyzer

# Create the analyzer object that the following cells use
sentiment_analyzer = SentimentIntensityAnalyzer()



In [None]:
# Store the VADER lexicon as a list of (key, value) pairs and peek at a slice of it

vader_lexicon = [*sentiment_analyzer.lexicon.items()]

# Entries 2000 up to (but not including) 2050
lexicon_sample = vader_lexicon[2000:2050]
print(lexicon_sample)


In [None]:
# Calculating sentiment scores for Rapunzel
# The whole story is passed in as one string; the result is looked up by key
# (for example "compound") in the next cell

sentiment_scores = sentiment_analyzer.polarity_scores(rapunzel)
print(sentiment_scores)

In [None]:
# Extracting just the overall sentiment score, which is stored under the "compound" key

sentiment_scores["compound"]

# Exercise: Sentiment Scores

1. Calculate the negative sentiment score for Hansel and Gretel.

2. Calculate the compound sentiment score for each story in Grimms' Fairy Tales by looping through the files in your Grimms folder.



In [None]:
# Calculate the negative sentiment score for Hansel and Gretel

# Read the story inside a "with" block so the file is closed once it has been read
with open(os.path.join(grimms_folder, "hansel_and_gretel.txt"), encoding="utf-8") as hansel_gretel_file:
    hansel_gretel = hansel_gretel_file.read()

# Score the whole story
sentiment_scores_HG = sentiment_analyzer.polarity_scores(hansel_gretel)

# Pull out just the negative score
sentiment_scores_HG["neg"]


In [None]:
# Calculating compound sentiment scores for ALL text files in a folder

# Each entry will be a [file name, compound score] pair
results = []

# os.listdir returns names in arbitrary order, so sort them to get the
# results in the same order on every run
for file in sorted(os.listdir(grimms_folder)):
    # Skip anything that is not a .txt file
    if not file.endswith(".txt"):
        continue

    # "with" closes each file as soon as it is read, so open file handles do
    # not pile up while looping over the folder
    with open(os.path.join(grimms_folder, file), encoding="utf-8") as story_file:
        text = story_file.read()

    # Use a separate name here so the Rapunzel scores stored in
    # sentiment_scores by an earlier cell are not overwritten
    story_scores = sentiment_analyzer.polarity_scores(text)
    results.append([file, story_scores["compound"]])

print(results)

## Demo: Part-of-Speech (POS) Tagging

In [None]:
# Import sentence tokenizer from NLTK
# You must tokenize by sentence for POS tagging, as the tool uses the sentence structure to determine the POS

from nltk import sent_tokenize

rapunzel_sentences = sent_tokenize(rapunzel)
# Show the first two sentences
rapunzel_sentences[:2]

In [None]:
# Swap each new line character (\n) inside a sentence for a space using .replace()

rapunzel_sentences_cleaned = [sentence.replace("\n", " ") for sentence in rapunzel_sentences]
rapunzel_sentences = rapunzel_sentences_cleaned

# Show the first two cleaned sentences
rapunzel_sentences[:2]

In [None]:
# Preparing to POS tag ONE of the sentences

# Take the first sentence of Rapunzel as the example
example_sentence = rapunzel_sentences[0]

# Split the example sentence into word tokens
words_example = nltk.word_tokenize(example_sentence)

print(words_example)



In [None]:
# POS tagging our example sentence

# Download the tagger resource before calling pos_tag
nltk.download('averaged_perceptron_tagger_eng')

# Tag the word tokens of the example sentence
pos_example = nltk.pos_tag(words_example)

print(pos_example)

In [None]:
# Run the POS tagger over EVERY sentence in Rapunzel

# One entry per sentence: word-tokenize the sentence, then tag its words
pos = [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in rapunzel_sentences]

# Show the tagged versions of the first five sentences
pos[:5]

In [None]:
# Tally the part-of-speech tags of each sentence in Rapunzel, one Counter per sentence

from collections import Counter

# Only the tag of each word/tag pair is needed for the tally
pos_counts_per_sentence = [
    Counter(tag for _, tag in tagged_sentence)
    for tagged_sentence in pos
]

print(pos_counts_per_sentence)


In [None]:
# Count only the VERBS in each sentence of Rapunzel

# A word is counted as a verb when its tag begins with "VB"
verb_totals = [
    sum(1 for _, tag in tagged_sentence if tag.startswith("VB"))
    for tagged_sentence in pos
]

print(verb_totals)

In [None]:
# Combine the per-sentence POS counts into a single overall count for Rapunzel

total_counts = Counter()

# Add every sentence's count for a tag onto the running total for that tag
for sentence_counts in pos_counts_per_sentence:
    for tag, tag_count in sentence_counts.items():
        total_counts[tag] += tag_count

print(total_counts)


In [None]:
# Put the overall POS tag counts into a dataframe

import pandas as pd

# One row per tag, ordered from the largest count to the smallest,
# with the index renumbered from 0 after sorting
pos_total_df = (
    pd.DataFrame(total_counts.items(), columns=['POS', 'Count'])
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)

pos_total_df

In [None]:
# Saving our results to a CSV!
# The column names are written as a header row and the index is left out

pos_total_df.to_csv("rapunzel_pos_data.csv", header=True, index=False)

## Exercise: POS Tagging

1.   Calculate the total number of nouns in The Frog Prince. (Try writing down the steps you need to follow on paper first!)




In [None]:
# Read in the text data ("with" closes the file once it has been read)

with open(os.path.join(grimms_folder, "the_frog-prince.txt"), encoding="utf-8") as frog_file:
    frog = frog_file.read()

# Tokenize by sentence

frog_sentences = sent_tokenize(frog)

# Word tokenize and POS tag each sentence

frog_pos = []

for sentence in frog_sentences:
    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    frog_pos.append(pos_tags)

# Loop through the pos tags for each sentence and count each instance of a noun
# (a word is counted as a noun when its tag starts with "NN")

frog_noun_totals = []

for tagged_sentence in frog_pos:
    noun_count = 0
    for word, tag in tagged_sentence:
        if tag.startswith("NN"):
            noun_count += 1
    frog_noun_totals.append(noun_count)

# Sum together all of the noun counts

frog_aggregate = sum(frog_noun_totals)

print(frog_aggregate)

## Demo: Named Entity Recognition

In [None]:
# Loading one sentence to use as an example
# (index 1 is the second sentence of Rapunzel, since list indexes start at 0)

one_sentence = rapunzel_sentences[1]
print(one_sentence)

In [None]:
# Importing the necessary nltk packages, and running the NER tool on our example sentence

from nltk import ne_chunk
nltk.download('maxent_ne_chunker_tab')
# Download the same tagger resource as the POS tagging demo
# ('averaged_perceptron_tagger_eng'), so pos_tag() below has its model even
# when this cell is run without that earlier download
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('words')

# Step 1: tokenize the sentence
x = word_tokenize(one_sentence)

# Step 2: POS tag the sentence
y = nltk.pos_tag(x)

# Step 3: Apply NER to the pos tags

ner = ne_chunk(y)

In [None]:
# Let's take a look at what we end up with:
# (ner holds the result of running ne_chunk on the POS-tagged example sentence)

print(ner)

In [None]:
# What the heck is that? Let's use type() to find out...
# type() reports the class of the object that ne_chunk gave back

type(ner)

In [None]:
# Use svgling package to visualize the POS and NER tree

!pip install svgling
import svgling

tree_svg = svgling.draw_tree(ner)
display(tree_svg)

In [None]:
# Extract out only the subtree with a Named Entity in it
# NOTE: position 6 is hard-coded for this example sentence; a different
# sentence would need a different index (check the tree printed earlier)

subtree = ner[6]
print(subtree)

In [None]:
# Print that Named Entity and its POS tag
# (leaves() lists the items that sit under this subtree)

print(subtree.leaves())

In [None]:
# Let's get just the Named Entity

# An entity can span several words (one leaf per word), so join the word from
# every leaf rather than keeping only the first one
entity_name = " ".join(word for word, tag in subtree.leaves())
print(entity_name)

In [None]:
# And now let's just get the label, or type of Named Entity it is
# (label() gives the label attached to the subtree)

entity_type = subtree.label()
print(entity_type)

In [None]:
# Apply NER to each sentence of Rapunzel!!

# For every sentence: word tokenize -> POS tag -> NER chunk
ner_sentences = [
    ne_chunk(nltk.pos_tag(word_tokenize(sentence)))
    for sentence in rapunzel_sentences
]


In [None]:
# Let's visualize one of the sentences and the results
# (index 24 picks the 25th sentence of Rapunzel)

ner_sentences[24]

In [None]:
# Create a list of named entities, for each entity type in Rapunzel

from nltk import Tree

# Maps each entity type to the list of entity names found with that type
named_entities = {}

for sentence in ner_sentences:
    for part in sentence:
        # Only the Tree parts of a chunked sentence are treated as named entities
        if not isinstance(part, Tree):
            continue

        # Join every word of the entity so multi-word names are kept whole
        # instead of being cut down to their first word
        entity_name = " ".join(word for word, tag in part.leaves())
        entity_type = part.label()

        # Start a new list the first time an entity type is seen, then add the name
        named_entities.setdefault(entity_type, []).append(entity_name)


print("Named Entities:", named_entities)

In [None]:
# Calculate how many times each PERSON named entity appears in Rapunzel

from collections import Counter

# .get() falls back to an empty list if there is no 'PERSON' key
person_counts = Counter(named_entities.get('PERSON', []))

# Display the result
print("Person Counts:", person_counts)

In [None]:
# Let's get each unique Named Entity, its type, and how many times it appears in the text

# Three parallel lists: position i of each list describes the same entity
entities = []
types = []
counts = []

for entity_type, names in named_entities.items():
    # Counter collapses repeated names into one entry with its frequency
    for name, name_count in Counter(names).items():
        entities.append(name)
        types.append(entity_type)
        counts.append(name_count)

print(entities)
print(types)
print(counts)


In [None]:
# Build a dataframe from our entity data

# One column per list built in the prior cell
entity_columns = {
    'Entity': entities,
    'Type': types,
    'Count': counts,
}

# Order the rows from the largest count to the smallest and renumber the index from 0
entities_df = (
    pd.DataFrame(entity_columns)
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)

entities_df

In [None]:
# Save our entity data to a CSV!
# The column names are written as a header row and the index is left out

entities_df.to_csv("rapunzel_NER.csv", header=True, index=False)


## Exercise: Named Entity Recognition



1.  Named Entity Recognition requires three pre-processing steps. List them.
2.  Make a list of the Named Entities in Snow White ("snow-white_and_rose-red.txt"). Do not include the entity types, and your list should not have any repeat values.




Main Pre-Processing Steps for Named Entity Recognition:

1.   Tokenize by sentence
2.   Tokenize each sentence by word
3.   Part-of-speech tag the words



In [None]:
# Load text file and tokenize ("with" closes the file once it has been read)

with open(os.path.join(grimms_folder, "snow-white_and_rose-red.txt"), encoding="utf-8") as snow_white_file:
    snow_white = snow_white_file.read()

snow_sentences = sent_tokenize(snow_white)

# Apply NER to the sentences

snow_ner_sentences = []

for sentence in snow_sentences:
    # Named "words" (not "tokens") so the cleaned Rapunzel token list built at
    # the start of the notebook is not overwritten
    words = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    ner_tree = ne_chunk(pos_tags)
    snow_ner_sentences.append(ner_tree)

# Extract out the entities and save them to a list, skipping the entities we've already added

entity_list = []
seen_entities = set()  # fast "have we added this one already?" lookups

for sentence in snow_ner_sentences:
    for part in sentence:
        # Only the Tree parts of a chunked sentence are treated as named entities
        if not isinstance(part, Tree):
            continue

        # Join every word of the entity so multi-word names stay whole
        entity_name = " ".join(word for word, tag in part.leaves())

        if entity_name not in seen_entities:
            seen_entities.add(entity_name)
            entity_list.append(entity_name)

entity_list