# Code Demo One for Text Analysis with Python, covering:

*   Loading text data
*   Tokenization
*   Text cleaning (incl. removing stopwords, punctuation and capital letters)
*   Basic word frequency analysis



In [None]:
# Mount Google Drive at /content/drive so that files stored there can be opened from the notebook
from google.colab import drive
drive.mount('/content/drive')

# Reading and Processing Text Files

In [None]:
# Folder on the mounted Drive that contains the text files opened below
grimms_folder = "/content/drive/MyDrive/TRIADS_workshops/grimms"

In [None]:
import os

# Build the path to the story inside the workshop folder.
snow_white_path = os.path.join(grimms_folder, "snow-white_and_rose-red.txt")

# Read the whole text into one string. The `with` block closes the file
# handle as soon as reading is finished, even if read() raises an error.
with open(snow_white_path, encoding="utf-8") as snow_white_file:
    snow_white = snow_white_file.read()

In [None]:
# Printing the full text exactly as it was read from the file
print(snow_white)

## Cleaning and Pre-processing ##

In [None]:
# Dropping the bibliographic text: cut the text at the row of asterisks and keep only the first piece

text_parts = snow_white.split("*****")
snow_white_cleaned = text_parts[0]


In [None]:
# .split() breaks the text into parts on the designated string, and creates a list of strings out of the parts.
# Index 1 is the part that comes right after the first run of asterisks, i.e. text that was left out of snow_white_cleaned.

snow_white.split("*****")[1]

In [None]:
# Printing the text cleaned of bibliographic data (everything before the asterisks)

print(snow_white_cleaned)

### Tokenization

In [None]:
# Importing necessary libraries (Natural Language Toolkit (NLTK))

import nltk
# Download the 'punkt_tab' data package that NLTK's tokenizers rely on
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize, word_tokenize



In [None]:
# Tokenization
# Tokenize the cleaned text rather than the raw file contents, so that the
# bibliographic text after the asterisks is not included in the word counts.

tokens = word_tokenize(snow_white_cleaned)

print(tokens)

In [None]:
# Counting the number of tokens in the text (punctuation marks still count as tokens at this point)

len(tokens)

### Token Cleaning

In [None]:
# Building a list of punctuation marks, one character per element,
# from the punctuation constant in the string library

import string

punctuation = [*string.punctuation]

punctuation


In [None]:
# Adding curly (typographic) single and double quotation marks to the punctuation list,
# since string.punctuation only contains the straight ASCII versions

punctuation += ["‘", "’", "“", "”"]

print(punctuation)

In [None]:
# Removing punctuation from the tokens list.
# Calling tokens.remove() while looping over the same list makes the loop skip
# the element that follows each removal, so some punctuation would be left
# behind. Building a new list that keeps the non-punctuation tokens avoids that.

tokens = [token for token in tokens if token not in punctuation]


In [None]:
# Token count after the punctuation-removal step
len(tokens)

In [None]:
# Inspecting the tokens that remain after the punctuation-removal step
print(tokens)

In [None]:
# Alternative way to remove punctuation--looping through the tokens list and keeping only alphabetic tokens.
# (isalpha() is True only when every character is a letter, so tokens containing digits, hyphens or apostrophes are dropped too.)

filtered_tokens = []

for token in tokens:
    if not token.isalpha():
        continue  # skip anything that is not made up purely of letters
    filtered_tokens.append(token)

tokens = filtered_tokens

In [None]:
# Token count after keeping only alphabetic tokens
len(tokens)

In [None]:
# Alternative way to keep only alphabetic tokens--using a list comprehension!
# (isalpha() is True only when every character is a letter, so tokens containing digits are dropped as well.)

tokens = [token for token in tokens if token.isalpha()]

In [None]:
# Making all tokens lowercase using a list comprehension,
# so that capitalised and lowercase forms of a word are counted together

tokens = [token.lower() for token in tokens]
print(tokens)

### Basic Word Frequency Stats

In [None]:
# Calculate final word count (number of tokens left after cleaning and lowercasing)

word_count = len(tokens)
print(word_count)

In [None]:
# Calculate word frequencies using Counter, which maps each distinct token to the number of times it occurs

from collections import Counter

word_counts = Counter(tokens)
print(word_counts)

In [None]:
# Load stopword list from NLTK

# Import the stopwords corpus reader under an alias, so that the `stopwords`
# set created below does not overwrite the object it is built from.
from nltk.corpus import stopwords as nltk_stopwords

nltk.download('stopwords') # Download stopword list (this is a quirk of NLTK)

# A set makes the "is this token a stopword?" checks below fast.
stopwords = set(nltk_stopwords.words('english'))

print(stopwords)


In [None]:
# Using a list comprehension to loop through tokens and keep the tokens that are NOT in the stopword list
# (the order and repetitions of the remaining tokens are preserved)

content_words = [token for token in tokens if token not in stopwords]

print(content_words)

In [None]:
# Checking the word count of the content words (tokens left after stopword removal)

len(content_words)

In [None]:
# Recalculating word frequencies using Counter, this time on the content words only

content_word_counts = Counter(content_words)

print(content_word_counts)

In [None]:
# Creating a dataframe to store word frequency data
# orient="index" turns each word into a row label (the dataframe's index);
# the single column "count" holds that word's frequency.

import pandas as pd

df = pd.DataFrame.from_dict(content_word_counts, orient="index", columns=["count"])

print(df)


In [None]:
# Ordering the dataframe rows from the most frequent word to the least frequent

df = df.sort_values("count", ascending=False)

print(df)

In [None]:
# Saving dataframe to CSV
# The words are stored in the dataframe's index, so the index has to be written
# out (labelled "word"); with index=False the file would contain only the counts.
# The file is named after the text the counts were computed from.

df.to_csv("snow_white_word_frequencies.csv", header=True, index=True, index_label="word")

## Exercises:

### Calculate and compare the relative frequency of male and female pronouns in rapunzel.txt



In [None]:
# Reading and tokenizing the text file

# The `with` block closes the file handle once the text has been read.
with open(os.path.join(grimms_folder, "rapunzel.txt"), encoding="utf-8") as rapunzel_file:
    rapunzel = rapunzel_file.read()

tokens = word_tokenize(rapunzel)

In [None]:
# Checking if pronouns are stopwords or not, using "her" as the example
# (if they are, stopwords must NOT be removed before counting pronouns)

message = "It's a stopword" if "her" in stopwords else "It's not a stopword"
print(message)

In [None]:
# Cleaning tokens of punctuation, and making all lowercase

# ASCII punctuation from the string library plus the curly quotation marks
punctuation = [*string.punctuation, "‘", "’", "“", "”"]

# Drop punctuation tokens and lowercase the remaining ones in a single pass
tokens = [token.lower() for token in tokens if token not in punctuation]

In [None]:
# Calculating the number and relative frequency of female pronouns

female_pronouns = ["she", "her", "hers", "herself"]

# Count every token that is one of the pronouns, in a single pass over the tokens.
female_pronoun_count = sum(1 for token in tokens if token in female_pronouns)

# Relative frequency = pronoun count / total number of tokens.
# An empty token list gives 0.0 instead of raising ZeroDivisionError.
relative_female_freq = female_pronoun_count / len(tokens) if tokens else 0.0

print(f"The number of female pronouns is: {female_pronoun_count}")
print(f"The relative frequency of female pronouns is: {relative_female_freq}")


In [None]:
# Calculating the number and relative frequency of male pronouns

male_pronouns = ["he", "him", "his", "himself"]

# Count every token that is one of the pronouns, in a single pass over the tokens.
male_pronoun_count = sum(1 for token in tokens if token in male_pronouns)

# Relative frequency = pronoun count / total number of tokens.
# An empty token list gives 0.0 instead of raising ZeroDivisionError.
relative_male_freq = male_pronoun_count / len(tokens) if tokens else 0.0

print(f"The number of male pronouns is: {male_pronoun_count}")
print(f"The relative frequency of male pronouns is: {relative_male_freq}")