# Reading and Processing Text Files

Introducing:



*   Loading text files
*   Tokenization and Cleaning
*   Word Frequency Analysis



In [32]:
import os

snow_white = open("/content/drive/MyDrive/python_bootcamp/sample_data/snow-white_and_rose-red.txt", encoding="utf-8").read()

In [None]:
print(snow_white)

## Cleaning and Pre-processing ##

In [None]:
#Removing bibliographic text

snow_white_cleaned = [snow_white.split("*****")[0]][0]

In [None]:
print(snow_white_cleaned)

In [None]:
# Tokenization

import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize



In [None]:
# Extract tokens
tokens = word_tokenize(snow_white_cleaned)
print(tokens)

In [None]:
len(tokens)

In [None]:
#Remove punctuation

import string

punctuation = list(string.punctuation)

punctuation.append("‘")
punctuation.append("’")
punctuation.append("“")
punctuation.append("”")
print(punctuation)


In [None]:
for token in tokens:
  if token in punctuation:
    tokens.remove(token)


In [None]:
len(tokens)

In [None]:
print(tokens)

In [None]:
# Manually removing remaining curly quotes from the tokens list (alternative way to remove punctuation)

tokens = [token for token in tokens if token.isalpha()]


In [None]:
#Make all lowercase

tokens = [token.lower() for token in tokens]
print(tokens)

In [None]:
# Calculate word count

word_count = len(tokens)
print(word_count)

In [None]:
#Calculate most frequent words

from collections import Counter

word_counts = Counter(tokens)
print(word_counts)

In [None]:
#Remove stopwords

from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords = stopwords.words("english")
print(stopwords)



In [None]:
content_words = [token for token in tokens if token not in stopwords]
print(content_words)

In [None]:
len(content_words)

In [None]:
content_word_counts = Counter(content_words)
print(content_word_counts)

In [None]:
import pandas as pd

df = pd.DataFrame.from_dict(content_word_counts, orient="index",
                            columns=["count"])
df.sort_values("count", ascending=False, inplace=True)
print(df)


# **Exercises!**





## Exercise: Text Files

Create a new text file called “python.txt” that contains the text “I am almost finished my first python class!”)

In [None]:
file = open("python.txt", mode="w", encoding="utf-8")
file.write("I am almost finished my first python class!")


43

## Exercise: Word Frequencies

Write the code to count the approximate number of words in the file austen_pride.txt

In [None]:
# Exercise: Word Frequencies -- Solution

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

file = open("/content/drive/MyDrive/python_bootcamp/sample_data/austen_pride.txt", mode="r", encoding="utf-8")
text = file.read()
tokens = word_tokenize(text)
tokens = [token.lower() for token in tokens if token.isalpha()]
print(len(tokens))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


120201


## Exercise: Relative Frequencies

Calculate and compare the relative frequency of male pronouns to the relative frequency of female pronouns in rapunzel.txt

In [None]:
# Tokenizing the text file

rapunzel = open("/content/drive/MyDrive/python_bootcamp/sample_data/rapunzel.txt", encoding="utf-8").read()

tokens = word_tokenize(rapunzel)

In [22]:
# Checking if pronouns are stopwords or not

from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords = stopwords.words("english")
print(stopwords)

if "her" in stopwords:
  print("It's a stopword")
else:
  print("It's not a stopword")

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
#stop_words = set(stopwords.words('english'))

female_pronouns = ["she", "her", "hers", "herself"]
male_pronouns = ["he", "him", "his", "himself"]

pronouns = female_pronouns + male_pronouns

# Example: remove "not" from the stopword list
for word in pronouns:
    stopwords.remove(word)


In [28]:
# Cleaning tokens of punctuation, and making all lowercase

import string

punctuation = list(string.punctuation)

punctuation.append("‘")
punctuation.append("’")

token = [token.lower for token in tokens if token not in punctuation]

In [29]:
# Actually calculating the relative frequencies

female_pronoun_count = 0

for x in female_pronouns:
  for y in tokens:
    if x == y:
      female_pronoun_count += 1

male_pronoun_count = 0

for x in male_pronouns:
  for y in tokens:
     if x == y:
      male_pronoun_count += 1

relative_female_freq = female_pronoun_count / len(tokens)
relative_male_freq = male_pronoun_count / len(tokens)

print("The relative frequency of female pronouns is: " + str(relative_female_freq))
print("The relative frequency of male pronouns is: " + str(relative_male_freq))

The relative frequency of female pronouns is: 0.036275695284159616
The relative frequency of male pronouns is: 0.03083434099153567


In [30]:
# Alternative using Counter

from collections import Counter

token_counts = Counter(tokens)

female_pronoun_count = 0
for pronoun in female_pronouns:
    female_pronoun_count += token_counts[pronoun]

male_pronoun_count = 0
for pronoun in male_pronouns:
    male_pronoun_count += token_counts[pronoun]

relative_female_freq = female_pronoun_count / len(tokens)
relative_male_freq = male_pronoun_count / len(tokens)

print("The relative frequency of female pronouns is:", relative_female_freq)
print("The relative frequency of male pronouns is:", relative_male_freq)


The relative frequency of female pronouns is: 0.036275695284159616
The relative frequency of male pronouns is: 0.03083434099153567
