## Set up

In [1]:
# imports
import requests
import pandas as pd
import spacy

In [2]:
# set up nlp pipline
nlp = spacy.load("en_core_web_sm")
nlp.disable_pipes('ner', 'parser')

['ner', 'parser']

###Andersen's Fairy Tales

In [54]:
# use requests library to load text
response = requests.get('https://www.gutenberg.org/cache/epub/1597/pg1597.txt')
text = response.text

In [55]:
text[:250]

"\ufeffThe Project Gutenberg eBook of Andersen's Fairy Tales\r\n    \r\nThis ebook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away o"

In [56]:
# find start index
start = text.find('Many years ago, there was an Emperor')

In [57]:
# find end index
end = text.find('*** END OF THE PROJECT')

In [58]:
# trim text
aft = text[start:end]

In [59]:
# split into paragraphs
aft_paras = aft.split('\r\n\r\n')

In [60]:
type(aft_paras)

list

In [61]:
# creating empty lists for author and title will be handy for building our dataframe
author = []
title = []

In [62]:
# this little for-loop will poplate the lists we just created
for para in aft_paras:
    author.append('Andersen')
    title.append('Andersen Fairy Tales')

In [63]:
len(author) == len(title)

True

In [64]:
aft_df = pd.DataFrame(list(zip(author, title, aft_paras)), columns=['author', 'title', 'text'])

In [65]:
# Sanity check
aft_df.head()

Unnamed: 0,author,title,text
0,Andersen,Andersen Fairy Tales,"Many years ago, there was an Emperor, who was ..."
1,Andersen,Andersen Fairy Tales,Time passed merrily in the large town which wa...
2,Andersen,Andersen Fairy Tales,"“These must, indeed, be splendid clothes!” tho..."
3,Andersen,Andersen Fairy Tales,"So the two pretended weavers set up two looms,..."
4,Andersen,Andersen Fairy Tales,“I should like to know how the weavers are get...


In [66]:
# extract lemmas
def process_text(text):
    """Remove new line characters and lemmatize text. Returns string of lemmas"""
    text = text.replace('\n', ' ')
    doc = nlp(text)
    tokens = [token for token in doc]
    no_stops = [token for token in tokens if not token.is_stop]
    no_punct = [token for token in no_stops if token.is_alpha]
    lemmas = [token.lemma_ for token in no_punct]
    lemmas_lower = [lemma.lower() for lemma in lemmas]
    lemmas_string = ' '.join(lemmas_lower)
    return lemmas_string

In [67]:
# apply process_text to text column
aft_df['lemmas'] = aft_df['text'].apply(process_text)
process_text

<function __main__.process_text(text)>

In [68]:
# sanity check
aft_df.head()

Unnamed: 0,author,title,text,lemmas
0,Andersen,Andersen Fairy Tales,"Many years ago, there was an Emperor, who was ...",year ago emperor excessively fond new clothe s...
1,Andersen,Andersen Fairy Tales,Time passed merrily in the large town which wa...,time pass merrily large town capital stranger ...
2,Andersen,Andersen Fairy Tales,"“These must, indeed, be splendid clothes!” tho...",splendid clothe think emperor suit find man re...
3,Andersen,Andersen Fairy Tales,"So the two pretended weavers set up two looms,...",pretend weaver set loom affect work busily rea...
4,Andersen,Andersen Fairy Tales,“I should like to know how the weavers are get...,like know weaver get cloth say emperor little ...


In [69]:
# filter out strings shorter than 25 characters
length_filter = aft_df['lemmas'].str.len() > 25

In [70]:
filter_df = aft_df[length_filter]

In [71]:
filter_df.head()

Unnamed: 0,author,title,text,lemmas
0,Andersen,Andersen Fairy Tales,"Many years ago, there was an Emperor, who was ...",year ago emperor excessively fond new clothe s...
1,Andersen,Andersen Fairy Tales,Time passed merrily in the large town which wa...,time pass merrily large town capital stranger ...
2,Andersen,Andersen Fairy Tales,"“These must, indeed, be splendid clothes!” tho...",splendid clothe think emperor suit find man re...
3,Andersen,Andersen Fairy Tales,"So the two pretended weavers set up two looms,...",pretend weaver set loom affect work busily rea...
4,Andersen,Andersen Fairy Tales,“I should like to know how the weavers are get...,like know weaver get cloth say emperor little ...


In [72]:
# remove \n and \r characters from the text
def remove_new_lines(text):
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    return text

In [73]:
# apply above function, you can ignore the warning.
filter_df['text'] = filter_df['text'].apply(remove_new_lines)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df['text'] = filter_df['text'].apply(remove_new_lines)


In [74]:
# save our work
filter_df.to_csv('andersens_fairy_tales.csv', index=False)