### Import Libraries and Functions

In [1]:
# basic libraries
import pandas as pd

In [2]:
# nlp libraries
import spacy
# Load the "en_core_web_sm" spaCy pipeline once; the resulting `nlp` callable
# is applied to the story text in the character-identification cell below.
# NOTE(review): presumably this model package has to be installed separately
# before spacy.load can find it — confirm the environment setup steps.
nlp = spacy.load("en_core_web_sm")

### Defining Files and DataFrames

In [4]:
# opening text file
# Read the whole story into one string. The encoding is stated explicitly:
# without it open() falls back to the platform's locale encoding, so the same
# file could decode differently (or fail to decode) on another machine.
# NOTE(review): assumes story.txt is stored as UTF-8 — confirm.
with open("story.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [5]:
# create dataframes
# Both frames start empty and only fix the column layout; rows are added by
# the identification cells further down.
character_columns = ["cid", "name", "frequency", "features"]
line_columns = ["pid", "ndid", "name", "dialogue", "narration"]

df_characters = pd.DataFrame(columns=character_columns)
df_lines = pd.DataFrame(columns=line_columns)

# Character Identification

In [6]:
# finding personal entities from text
# Run the spaCy pipeline over the full story and keep the text of every
# entity labelled PERSON.
entities = nlp(text).ents
person_mentions = [entity.text for entity in entities if entity.label_ == 'PERSON']

# De-duplicate while preserving first-appearance order. The previous
# list(set(...)) returned names in an order that can change between kernel
# runs (string hashing is randomised per process), so the cid later assigned
# to each character was not reproducible.
personal_entities = list(dict.fromkeys(person_mentions))

In [7]:
# adding persons and counts to df_characters
# Build one record per detected person and create the frame in a single call.
# This replaces the row-by-row use of the private DataFrame._append, which
# copied the whole frame on every iteration and duplicated rows whenever the
# cell was re-run.
# NOTE: frequency is a plain substring count, so a name that is contained in
# another word or name is counted there as well.
character_records = [
    {'cid': cid, 'name': person, 'frequency': text.count(person), 'features': None}
    for cid, person in enumerate(personal_entities)
]
df_characters = pd.DataFrame(character_records, columns=df_characters.columns)

# Kept so that cid_num still holds the next unused character id afterwards.
cid_num = len(character_records)

In [8]:
# characters
# Preview of at most the first 10 rows of the character table
# (columns: cid, name, frequency, features).
df_characters.head(10)

Unnamed: 0,cid,name,frequency,features
0,0,Lily,12,
1,1,Thompson,8,


# Line Identification

In [9]:
# converting text to paragraphs
# Each newline-separated line of the story is treated as one paragraph.
paragraphs = text.split("\n")
# Keep only lines with visible content. A line holding just spaces or tabs is
# not equal to '' and previously slipped through the filter, producing a
# whitespace-only "paragraph" with its own pid.
non_empty_paragraphs = [paragraph for paragraph in paragraphs if paragraph.strip()]

In [10]:
# function to identify narrations and dialogues
def identify_narrations_and_dialogues(paragraph):
    """
    Split a paragraph into its narration and dialogue segments.

    The paragraph is split on the straight double-quote character ("). In the
    resulting pieces, even positions lie outside quotes (narration) and odd
    positions lie inside quotes (dialogue). The position is taken from the
    unfiltered split, so a paragraph that opens with a quote, or that contains
    two quoted passages back to back, is still classified correctly. Empty
    pieces are dropped and do not consume an id.

    :param paragraph: string of paragraph in a story
    :return: list of tuples in (id, name_of_speaker, dialogue, narration) format;
        id numbers the kept segments from 0, name_of_speaker is always None
        (it is not resolved here), and exactly one of dialogue / narration is
        set in each tuple. An empty or quotes-only paragraph yields [].
    """
    narrations_and_dialogues = []
    for position, division in enumerate(paragraph.split('"')):
        if division == '':
            # Artefact of a quote at the very start/end or of adjacent quotes.
            continue
        segment_id = len(narrations_and_dialogues)
        if position % 2 == 1:
            # Odd position: the text sat between an opening and closing quote.
            narrations_and_dialogues.append((segment_id, None, division, None))
        else:
            narrations_and_dialogues.append((segment_id, None, None, division))
    return narrations_and_dialogues


In [11]:
# identifying lines (narrations or dialogues) from each paragraph
# Every paragraph gets a pid (its position among the non-empty paragraphs);
# each segment returned for it is prefixed with that pid to form one row.
# All rows are collected first and the frame is created in a single call.
# This replaces the row-by-row use of the private DataFrame._append, which
# copied the whole frame on every iteration and duplicated rows whenever the
# cell was re-run.
line_records = [
    (pid,) + nad
    for pid, paragraph in enumerate(non_empty_paragraphs)
    for nad in identify_narrations_and_dialogues(paragraph)
]
df_lines = pd.DataFrame(line_records, columns=df_lines.columns)

# Kept so that pid_num still holds the next unused paragraph id afterwards.
pid_num = len(non_empty_paragraphs)

In [12]:
# lines
# Preview of at most the first 10 rows of the line table
# (columns: pid, ndid, name, dialogue, narration).
df_lines.head(10)

Unnamed: 0,pid,ndid,name,dialogue,narration
0,0,0,,,"Once upon a time, in a small, quiet village, t..."
1,1,0,,,"One sunny afternoon, as Lily was by the river,..."
2,2,0,,,"Unbeknownst to Lily, a kind stranger had been ..."
3,2,1,,You have a heart as beautiful as that butterfl...,
4,3,0,,,"Lily blushed, not used to receiving compliment..."
5,3,1,,I've been searching for someone just like you....,
6,4,0,,,Lily's eyes sparkled with excitement. She had ...
7,4,1,,"I'd love to help,",
8,4,2,,,she replied.
9,5,0,,,"From that day on, Lily spent her afternoons ca..."
