In [None]:
##https://www.cliffsnotes.com/literature/h/hamlet/character-map

--------------------
## **Context**
--------------------

*Hamlet* is a classic tragedy written by William Shakespeare.
In this case study, **we will analyze the co-occurrence network of the characters in Shakespeare's Hamlet** 

The dataset is publicly available from Project Gutenberg.

**Note: Here, two characters are considered to co-occur if their names appear within 15 words of one another in the text.**

-------------------------
## **Objectives:**
-------------------------
- Load all the raw text of literature and perform descriptive analysis
- Run Network Analysis Algorithms on individual books (and combined)
- Summarize the structure of the play and encode them into act, playline, and character tables.
- Visualize each character's importance based on the number of lines in which the character appears in each act.


In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Count the raw lines of the play.
# The context manager closes the file as soon as the count is done; the
# original bare open() left the handle open for the life of the kernel.
with open("hamlet.txt") as fhand:
    count = sum(1 for _ in fhand)
print("There are "+str(count)+" lines in Hamelet")

There are 4459 lines in Hamelet


# Summary

#### ACT

In [4]:
# Print every line that begins (case-insensitively, at column 0) with "act"
# and report how many were found.
#
# The original called line.lstrip() and threw the result away; strings are
# immutable, so that call did nothing. It is removed rather than turned into
# `line = line.lstrip()` so the match stays anchored at column 0 and the
# reported count does not change.
count=0
with open("hamlet.txt") as fhand:  # closed automatically when the loop ends
    for line in fhand:
        if line.lower().startswith("act"):
            count+=1
            print(line)
print("There are "+str(count)+" acts in Hamelet")

ACT I. Scene I.

Act II. Scene I.

ACT III. Scene I.

ACT IV. Scene I.

ACT V. Scene I.

There are 5 acts in Hamelet


#### SCENES

In [5]:
# Print every line that begins (case-insensitively, at column 0) with "scene"
# and report how many were found.
#
# The original called line.lstrip() and threw the result away; strings are
# immutable, so that call did nothing. It is removed rather than turned into
# `line = line.lstrip()` so the match stays anchored at column 0 and the
# reported count does not change.
count=0
with open("hamlet.txt") as fhand:  # closed automatically when the loop ends
    for line in fhand:
        if line.lower().startswith("scene"):
            count+=1
            print(line)
print("There are "+str(count)+" scenes in Hamelet")

SCENE.- Elsinore.

Scene II.

Scene III.

Scene IV.

Scene V.

Scene II.

Scene II.

Scene III.

Scene IV.

Scene II.

Scene III.

Scene IV.

Scene V.

Scene VI.

Scene VII.

Scene II.

There are 16 scenes in Hamelet


In [6]:
# Load the play into a one-column DataFrame, one row per raw text line.
#
# Trailing newlines are kept on purpose: the original's line.rstrip()
# discarded its result, so nothing was ever stripped. The no-op is dropped
# here so the stored text stays exactly as before.
with open("hamlet.txt") as fhand:  # closed automatically after reading
    lines = list(fhand)
df=pd.DataFrame({"player_lines":lines})


# Characters

In [7]:
# Preview rows 9 through 36 (positional slice) of the raw text.
df.iloc[9:37]

Unnamed: 0,player_lines
9,"Claudius, King of Denmark.\n"
10,"Marcellus, Officer.\n"
11,"Hamlet, son to the former, and nephew to the..."
12,"Polonius, Lord Chamberlain.\n"
13,"Horatio, friend to Hamlet.\n"
14,"Laertes, son to Polonius.\n"
15,"Voltemand, courtier.\n"
16,"Cornelius, courtier.\n"
17,"Rosencrantz, courtier.\n"
18,"Guildenstern, courtier.\n"


#### regular expression

In [8]:
import re

# Rows 9-36 of the raw text are the ones parsed into the character table.
# The slice is already a Series, so no extra pd.Series() wrapper is needed.
character_list=df["player_lines"][9:37]

# Split each entry into the runs of letters/whitespace that are terminated
# by a comma or a period.
# Fixes relative to the original pattern "([a-zA-z\s]+)[.,]":
#   * raw string, so "\s" is a regex class and not an invalid string escape;
#   * "A-Z" instead of "A-z" -- the "A-z" range also matches [ \ ] ^ _ and `.
persona_pattern=re.compile(r"([a-zA-Z\s]+)[.,]")
personas=[persona_pattern.findall(char) for char in character_list]


In [9]:
# One row per parsed entry. The entries have differing numbers of fields, so
# the resulting columns are labelled 0, 1, 2 and shorter rows are left empty
# in the trailing columns.
characters_df=pd.DataFrame(personas)
characters_df

Unnamed: 0,0,1,2
0,Claudius,King of Denmark,
1,Marcellus,Officer,
2,Hamlet,son to the former,and nephew to the present king
3,Polonius,Lord Chamberlain,
4,Horatio,friend to Hamlet,
5,Laertes,son to Polonius,
6,Voltemand,courtier,
7,Cornelius,courtier,
8,Rosencrantz,courtier,
9,Guildenstern,courtier,


In [10]:
# Give the positional columns produced by the regex split readable names.
column_labels={0:"Player",1:"Identity_1",2:"Identity_2"}
characters_df=characters_df.rename(columns=column_labels)

In [11]:
# Hand-written alias for each row of characters_df, in row order
# (presumably the abbreviated speaker labels used in the play text -- TODO
# confirm). Rows given an empty alias are removed in a later cell.
alias=[
    "King.","Mar.","Ham.","Pol.","Hor.","Laer.","Volt.","Cor.","Ros.","Guil.",
    "Osr.","Gent.","Priest.","Mar.","Ber.","Fran.","Rey.","","Clowns","Fort.",
    "Capt.","Ambassador.","","Queen.","Oph.","","Ghost.","",
]

characters_df=characters_df.assign(Alias=alias)

In [12]:
# Show the character table with the newly added Alias column.
characters_df

Unnamed: 0,Player,Identity_1,Identity_2,Alias
0,Claudius,King of Denmark,,King.
1,Marcellus,Officer,,Mar.
2,Hamlet,son to the former,and nephew to the present king,Ham.
3,Polonius,Lord Chamberlain,,Pol.
4,Horatio,friend to Hamlet,,Hor.
5,Laertes,son to Polonius,,Laer.
6,Voltemand,courtier,,Volt.
7,Cornelius,courtier,,Cor.
8,Rosencrantz,courtier,,Ros.
9,Guildenstern,courtier,,Guil.


In [13]:
# Remove the rows that were given an empty alias (row labels 17, 22, 25 and
# 27 in the alias list). Filtering on the alias instead of dropping
# hard-coded labels selects the same rows but is idempotent: re-running the
# cell no longer raises KeyError for labels that are already gone.
# An empty alias must not survive, because `"" in line` is true for every
# line and would inflate the per-player line counts computed later.
characters_df=characters_df.loc[characters_df["Alias"]!=""]


#### character table

In [14]:
# Show the character table after the rows without an alias were dropped.
characters_df

Unnamed: 0,Player,Identity_1,Identity_2,Alias
0,Claudius,King of Denmark,,King.
1,Marcellus,Officer,,Mar.
2,Hamlet,son to the former,and nephew to the present king,Ham.
3,Polonius,Lord Chamberlain,,Pol.
4,Horatio,friend to Hamlet,,Hor.
5,Laertes,son to Polonius,,Laer.
6,Voltemand,courtier,,Volt.
7,Cornelius,courtier,,Cor.
8,Rosencrantz,courtier,,Ros.
9,Guildenstern,courtier,,Guil.


In [15]:
from pathlib import Path

# Write the character table to CSV, creating the target directory (and any
# missing parents) first; an already existing directory is not an error.
filepath=Path('folder/character_identity_df.csv')
output_dir=filepath.parent
output_dir.mkdir(parents=True, exist_ok=True)
characters_df.to_csv(filepath)

#### Character line

In [None]:
# Alias/Player lookup table used for the line counts.
# .copy() makes this an independent frame: a later cell adds a "lines"
# column to it, and assigning into a bare column slice of characters_df
# triggers pandas' SettingWithCopyWarning.
character_line_df=characters_df[["Alias","Player"]].copy()

In [None]:
# Show the Alias/Player lookup table.
character_line_df

In [None]:
from pathlib import Path

# Write the Alias/Player table to CSV, creating the target directory (and
# any missing parents) first; an already existing directory is not an error.
filepath=Path('folder/characters_df.csv')
output_dir=filepath.parent
output_dir.mkdir(parents=True, exist_ok=True)
character_line_df.to_csv(filepath)

#### search line

In [None]:
## calculate each player's lines 

In [None]:
# For every alias, count the raw text lines that contain it.
# NOTE(review): `alias_text in line` is a substring test, so a line is
# counted wherever in the line the alias text occurs.
text_lines=df["player_lines"].tolist()  # materialise once, not once per alias
count_line=[
    sum(1 for line in text_lines if alias_text in line)
    for alias_text in characters_df["Alias"]
]

# Attach the counts through assign() and by row label:
#   * assign() returns a new frame, so no value is written into what may be
#     a slice of characters_df (the original in-place assignment triggers
#     SettingWithCopyWarning);
#   * indexing the counts by characters_df.index keeps every count on its
#     own player even if this cell is re-run after character_line_df has
#     been sorted (a plain list would be attached by position).
# The comprehension variable also no longer overwrites the `alias` list
# defined earlier in the notebook.
character_line_df=character_line_df.assign(
    lines=pd.Series(count_line,index=characters_df.index)
)

In [None]:
# Order the players from the highest line count to the lowest.
character_line_df=character_line_df.sort_values("lines",ascending=False)

In [None]:
# Show the players with their line counts, highest count first.
character_line_df

In [None]:
# Horizontal bar chart: one bar per player, bar length = line count.
fig,ax=plt.subplots(figsize=(10,10))
sns.barplot(x='lines',y='Player',data=character_line_df,ax=ax)
ax.set(xlabel='Number of lines', ylabel='Player Name')
plt.show()

# ACT I.~V.

#### divide the book into 5 acts

In [None]:
# Print every line that begins (case-insensitively, at column 0) with "act"
# and report how many were found.
#
# The original called line.lstrip() and threw the result away; strings are
# immutable, so that call did nothing. It is removed rather than turned into
# `line = line.lstrip()` so the match stays anchored at column 0 and the
# reported count does not change.
count=0
with open("hamlet.txt") as fhand:  # closed automatically when the loop ends
    for line in fhand:
        if line.lower().startswith("act"):
            count+=1
            print(line)
print("There are "+str(count)+" acts in Hamelet")

In [None]:
#find each index of the act lines and split them 

In [None]:
# Locate the act headings: collect (row position, text) for every line that
# begins with "act" (case-insensitive), then print each pair.
act_heading_rows=[
    (row_number,text)
    for row_number,text in enumerate(df["player_lines"])
    if text.lower().startswith("act")
]
for row_number,text in act_heading_rows:
    print(row_number)
    print(text)
        

In [None]:
# Inspect the single row at position 46.
df.iloc[46:47]

In [None]:
# Act heading prefixes, spelled as they appear in the text.
acts=["ACT I.","Act II.","ACT III.","ACT IV.","ACT V."]

# Slice the play into its five acts by row position.
# NOTE(review): the boundaries are hard-coded; presumably they are the row
# numbers printed by the act-heading search above -- re-check them if
# hamlet.txt changes.
act_1=df["player_lines"][46:1050]
act_2=df["player_lines"][1050:1838]
act_3=df["player_lines"][1838:2862]
act_4=df["player_lines"][2862:3691]
act_5=df["player_lines"][3691:]

# Number of raw lines in each act. The original listed len(act_1) twice and
# never reported act_2.
[len(act_1),len(act_2),len(act_3),len(act_4),len(act_5)]

In [None]:
# The five per-act line Series collected in act order.
act_nums=[act_1,act_2,act_3,act_4,act_5]

In [None]:
# NOTE: this binds acts_df to the same object as df (no copy is made). The
# name is reassigned to the concatenated per-act frames in a later cell.
acts_df=df


In [None]:
# Build one frame per act (its lines plus a constant "Act No." label), then
# stack the five frames in act order.
act1_df,act2_df,act3_df,act4_df,act5_df=[
    pd.DataFrame({"player_lines":act_lines, "Act No.":act_label})
    for act_lines,act_label in zip(
        (act_1,act_2,act_3,act_4,act_5),
        ("ACT I.","ACT II.","ACT III.","ACT IV.","ACT V."),
    )
]
acts_df=pd.concat([act1_df,act2_df,act3_df,act4_df,act5_df])

In [None]:
# Show the combined table of lines labelled with their act.
acts_df

#### Number of lines spoken per player in each act

In [None]:
def countline(act=None, aliases=None):
    """Count, for each alias, how many lines of ``act`` contain it.

    Parameters
    ----------
    act : iterable of str, optional
        The lines to search. When omitted, the notebook-level ``act_1`` is
        used, looked up at call time. (The original bound ``act_1`` as the
        default when the function was defined, so the default went stale if
        ``act_1`` was rebuilt afterwards.)
    aliases : iterable of str, optional
        The alias strings to look for. When omitted, the notebook-level
        ``characters_df["Alias"]`` column is used, as in the original.

    Returns
    -------
    list of int
        One count per alias, in the order the aliases were given.

    Notes
    -----
    Matching is a plain substring test (``alias in line``), so a line is
    counted wherever in the line the alias text occurs.
    """
    if act is None:
        act = act_1
    if aliases is None:
        aliases = characters_df["Alias"]

    # Materialise once so the lines can be scanned again for every alias,
    # even when a one-shot iterator is passed in.
    lines = list(act)
    return [sum(1 for line in lines if alias in line) for alias in aliases]


In [None]:
# Per-act alias counts, one inner list per act, in act order.
# Collected into a single list so all five results are displayed: written as
# five separate statements, only the last call's value was shown and the
# other four were computed and discarded.
[countline(act=act_lines) for act_lines in (act_1,act_2,act_3,act_4,act_5)]

In [None]:
# Per-player counts for each act: one row per player, one column per act.
# The column built from act_4 was labelled "Act VI." in the original; it is
# the fourth act, so the label is corrected to "Act IV.".
line_act_df=pd.DataFrame({"Player":characters_df["Player"],
            "Alias":characters_df["Alias"],
              "Act I.":countline(act=act_1),
              "Act II.":countline(act=act_2),
              "Act III.":countline(act=act_3),
              "Act IV.":countline(act=act_4),
              "Act V":countline(act=act_5)})
line_act_df

In [None]:
# The columns from position 2 onward, i.e. everything after "Player" and "Alias".
line_act_df.columns[2:]

In [None]:
# One horizontal bar chart per act column, stacked vertically.
acts=line_act_df.columns[2:]  # every column after "Player" and "Alias"

# Size the grid from the number of act columns instead of a hard-coded 5.
# squeeze=False keeps `axes` two-dimensional even for a single column.
fig,axes=plt.subplots(len(acts),1,figsize=(10,20),squeeze=False)

for ax,act in zip(axes[:,0],acts):
    sns.barplot(x=act,y='Player',data=line_act_df,ax=ax)
    ax.set(xlabel='Number of lines', ylabel='Player Name', title=act)

# Lay out once, after every label and title exists. The original called
# tight_layout() inside the loop, before the current panel's title was set.
fig.tight_layout()
plt.show()

#### Save Tables

In [None]:
#from pathlib import Path  
#filepath = Path('folder/line_act_df.csv')  
#filepath.parent.mkdir(parents=True, exist_ok=True)  
#line_act_df.to_csv(filepath) 