In [48]:
import os
import csv
import subprocess
import re
import random
import numpy as np
from numpy import linalg as LA
import pandas as pd

In [38]:
def read_characters():
    tuples = []
    characters = {''}
    with open('will_play_text.csv') as f:
        csv_reader = csv.reader(f, delimiter=';')
        for row in csv_reader:
          charactor = row[4]
          characters.add(charactor)
          line = row[5]
          line_tokens = re.sub(r'[^a-zA-Z0-9\s]', ' ', line).split()
          line_tokens = [token.lower() for token in line_tokens]
          tuples.append((charactor, line_tokens))
    return tuples,characters

In [3]:
ctuples,charac = read_characters()

charac.discard('')
l_cha = list(charac)

In [4]:
len(l_cha)

934

In [5]:
def read_in_shakespeare():
  '''Reads in the Shakespeare dataset processes it into a list of tuples.
     Also reads in the vocab and play name lists from files.

  Each tuple consists of
  tuple[0]: The name of the play
  tuple[1] A line from the play as a list of tokenized words.

  Returns:
    tuples: A list of tuples in the above format.
    document_names: A list of the plays present in the corpus.
    vocab: A list of all tokens in the vocabulary.
  '''

  tuples = []

  with open('will_play_text.csv') as f:
    csv_reader = csv.reader(f, delimiter=';')
    for row in csv_reader:
      play_name = row[1]
      line = row[5]
      line_tokens = re.sub(r'[^a-zA-Z0-9\s]', ' ', line).split()
      line_tokens = [token.lower() for token in line_tokens]

      tuples.append((play_name, line_tokens))

  with open('vocab.txt') as f:
    vocab =  [line.strip() for line in f]

  with open('play_names.txt') as f:
    document_names =  [line.strip() for line in f]

  return tuples, document_names, vocab

def get_row_vector(matrix, row_id):
  return matrix[row_id, :]

def get_column_vector(matrix, col_id):
  return matrix[:, col_id]

In [6]:
def create_term_character_matrix(line_tuples, characters, vocab):
    ''' Returns a numpy array containing the term character matrix for the input lines.
    '''
    vocab_to_id = dict(zip(vocab, range(0, len(vocab))))
    cha_to_id = dict(zip(characters,range(0,len(characters))))
    tdm = np.zeros(shape=(len(characters),len(vocab)))
    for line in line_tuples:
        cha = line[0]
        x_axis = cha_to_id.get(cha)
        if x_axis is None:
            continue
        for i in range(0,len(line[1])):
          y_axis = vocab_to_id.get(line[1][i])
          tdm[x_axis,y_axis] += 1
            
    return tdm

In [44]:
def compute_cosine_similarity(vector1, vector2):
  '''Computes the cosine similarity of the two input vectors.

  Inputs:
    vector1: A nx1 numpy array
    vector2: A nx1 numpy array

  Returns:
    A scalar similarity value.
  '''
  cs = np.dot(vector1,vector2)/(LA.norm(vector1)*LA.norm(vector2))
  # YOUR CODE HERE
  return cs

def compute_jaccard_similarity(vector1, vector2):
  '''Computes the cosine similarity of the two input vectors.

  Inputs:
    vector1: A nx1 numpy array
    vector2: A nx1 numpy array

  Returns:
    A scalar similarity value.
  '''
  
  # YOUR CODE HERE
  num = np.minimum(vector1,vector2)
  dom = np.maximum(vector1,vector2)
  js = np.sum(num)/np.sum(dom)

  return js
def compute_dice_similarity(vector1, vector2):
  '''Computes the cosine similarity of the two input vectors.

  Inputs:
    vector1: A nx1 numpy array
    vector2: A nx1 numpy array

  Returns:
    A scalar similarity value.
  '''

  # YOUR CODE HERE
  upper = np.minimum(vector1,vector2)
  upper_sum = np.sum(upper) * 2
  dom = np.sum(vector1 + vector2)
  return upper_sum/dom
def rank_cha(target_play_index, term_document_matrix, similarity_fn):
  ''' Ranks the similarity of all of the plays to the target play.

  Inputs:
    target_play_index: The integer index of the play we want to compare all others against.
    term_document_matrix: The term-document matrix as a mxn numpy array.
    similarity_fn: Function that should be used to compared vectors for two
      documents. Either compute_dice_similarity, compute_jaccard_similarity, or
      compute_cosine_similarity.

  Returns:
    A length-n list of integer indices corresponding to play names,
    ordered by decreasing similarity to the play indexed by target_play_index
  '''
  
  # YOUR CODE HERE
  nums = term_document_matrix.shape[0]
  target = term_document_matrix[target_play_index,:]
  docs_ranking = {}
  result = []
  for i in range(nums):
          if i!= target_play_index:
              similarity_doc = similarity_fn(term_document_matrix[i,:], target)
              docs_ranking[i] = similarity_doc
  sort_ranking = sorted(docs_ranking.items(),key=lambda item: item[1],reverse=True)
  for k,v in sort_ranking:
      result.append(k)
  return result

In [68]:

print('Computing term character matrix...')
tch_matrix = create_term_character_matrix(ctuples,l_cha,vocab)

Computing term character matrix...


In [69]:
len(tch_matrix)

934

In [71]:
len(sum_for_row)

934

In [72]:
sum_for_row = np.sum(tch_matrix,axis=1)
mask = []
for i in range(len(l_cha)):
    if sum_for_row[i]< 1100:
        mask.append(i)
masked_tc = np.delete(tch_matrix,mask,0)
masked_ch = np.delete(l_cha,mask)

In [102]:
len(masked)

934

In [110]:
most1 = []
least1 = []
most2 = []
least2 = []
most3 = []
least3 = []
for idx in range(len(masked_ch)):
    ranks1 = rank_cha(idx,masked_tc,compute_cosine_similarity)
    most1.append(masked_ch[ranks1[0]])
    least1.append(masked_ch[ranks1[len(ranks1)-1]])
    ranks2 = rank_cha(idx,masked_tc,compute_dice_similarity)
    most2.append(masked_ch[ranks2[0]])
    least2.append(masked_ch[ranks2[len(ranks2)-1]])
    ranks3 = rank_cha(idx,masked_tc,compute_jaccard_similarity)
    most3.append(masked_ch[ranks3[0]])
    least3.append(masked_ch[ranks3[len(ranks3)-1]])

In [111]:
df = pd.DataFrame({'Character':masked_ch[0:len(masked_ch)],'Most_cosine':most1,'Least_cosine':least1,'M_dice':most2,'L_dice':least2,'M_jaccard':most3,'L_jaccard':least3})

## Result

The result of the similarity between characters. Each column after first column is its corresponding Most or least similarity based on different method.

In [113]:
df

Unnamed: 0,Character,Most_cosine,Least_cosine,M_dice,L_dice,M_jaccard,L_jaccard
0,HENRY BOLINGBROKE,YORK,SIR ANDREW,KING JOHN,GLOUCESTER,KING JOHN,GLOUCESTER
1,CYMBELINE,KING,SLENDER,QUEEN,GLOUCESTER,QUEEN,GLOUCESTER
2,SIR HUGH EVANS,FLUELLEN,AEGEON,MISTRESS QUICKLY,GLOUCESTER,MISTRESS QUICKLY,GLOUCESTER
3,BERTRAM,HELENA,Chorus,COUNTESS,GLOUCESTER,COUNTESS,GLOUCESTER
4,HOLOFERNES,First Gentleman,SLENDER,COSTARD,GLOUCESTER,COSTARD,GLOUCESTER
5,DUCHESS OF YORK,QUEEN MARGARET,MOTH,QUEEN ELIZABETH,GLOUCESTER,QUEEN ELIZABETH,GLOUCESTER
6,PROTEUS,VALENTINE,CANTERBURY,VALENTINE,GLOUCESTER,VALENTINE,GLOUCESTER
7,LADY MACBETH,MACBETH,SLENDER,CYMBELINE,GLOUCESTER,CYMBELINE,GLOUCESTER
8,QUEEN KATHARINE,CARDINAL WOLSEY,First Citizen,BERTRAM,GLOUCESTER,BERTRAM,GLOUCESTER
9,APEMANTUS,TIMON,MOTH,Fool,GLOUCESTER,Fool,GLOUCESTER
