## Setup df

In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

import os

os.chdir("/content/drive/MyDrive/practicum")
!ls

df = pd.read_csv('charlists.csv', delimiter = "~").drop(columns = 'Unnamed: 0')
df["characters"] = df["characters"].map(lambda x: x.split("_"))
#df["NER_chars"] = df["NER_chars"].map(lambda x: x.split("_"))

In [None]:
#filter charlist down

df_chars = df
df_chars["word_len"] = df_chars['full_text'].map(lambda x: len(x.split(" ")))
df_chars["letter_len"] = df_chars['full_text'].map(lambda x: len(x))
df_chars["num_chars"] = df_chars["characters"].map(lambda x: len(x))

df_chars = df_chars.loc[df_chars["num_chars"] >2]
df_chars = df_chars.loc[df_chars["num_chars"] <50]
df_chars = df_chars.loc[df_chars["word_len"] >1000]
df_chars = df_chars.loc[df_chars["letter_len"] <1000000]

df_chars = df_chars.drop(columns = ["word_len","letter_len","num_chars"])

In [None]:
list(set(df_chars.iloc[0].characters))

['Meyer Wolfsheim',
 'George Wilson',
 'Jay Gatsby',
 'Tom Buchanan',
 'Nick Carraway',
 'Daisy Buchanan',
 'Jordan Baker',
 'Myrtle Wilson']

## NER Methods

In [None]:
# get 100 highest occuring people
def sort_persons(df_book):
  count_persons = df_book[df_book['label']== 'PERSON']
  count_persons = count_persons.groupby(['text']).count()
  count_persons = count_persons.sort_values(by=['label'], ascending=False)
  count_persons = count_persons.index.values.tolist()[:100]
  return count_persons

### NER spacy

In [None]:
import nltk
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
def find_chars_spacy(text):
  doc = nlp(text)
  df_book = pd.DataFrame({
    'text': [x.text for x in doc.ents],
    'label': [x.label_ for x in doc.ents]})
  
  return sort_persons(df_book)

In [None]:
df_chars["NER_chars"] = df_chars["full_text"].map(find_chars_spacy)

In [None]:
df_chars

Unnamed: 0,title,full_text,characters,NER_chars
0,The Great Gatsby,\t\t\t The Great Gatsby \t\t\t\t by \t\t\t F. ...,"[Nick Carraway, Jay Gatsby, Daisy Buchanan, To...","[Gatsby, Daisy, Tom, Jordan, Wilson, Baker, Wo..."
1,Hamlet,Project Gutenberg's Etext of Shakespeare's The...,"[Horatio, Polonius, Laertes, Gertrude, Marcell...","[Rosin, Laer, Pol, thinke, Guild, Guildenstern..."
2,The Odyssey,cover The Odyssey by Homer Translated by Alexa...,"[Odysseus, Penelope (wife of Odysseus), Helen ...","[ye, Thy, Jove, Minerva, Ulysses, Euryclea, Sh..."
3,Madame Bovary,Madame Bovary By Gustave Flaubert Translated f...,"[Emma Bovary, Charles Bovary, Monsieur Homais,...","[Charles, Emma, Yonville, Bertaux, Rouault, Bo..."
4,Wuthering Heights,Wuthering Heights by Emily Brontë CHAPTER I 18...,"[Heathcliff, Catherine Earnshaw, Edgar Linton,...","[Catherine, Linton, Heathcliff, Cathy, Joseph,..."
...,...,...,...,...
96,The Pioneers,"THE PIONEERS, BY R.M. BALLANTYNE. PREFACE. Sir...","[Meriwether Lewis, William Clark, Thomas Jeffe...","[Reuben, Lawrence, Swiftarrow, Reuben Guff, Al..."
97,The Song of Roland,The Song of Roland Translated by C. K. [Charle...,"[Roland, Ganelon, Oliver, King Marsile, Blanca...","[Charles, Oliver, Rollanz, Guenelun, Baligant,..."
98,She,She by H. Rider Haggard First Published 1886. ...,"[Laurel Mack, Ellie Mack, Paul Mack, Hanna Mac...","[Leo, Job, ye, Kallikrates, Baboon, Kôr, Queen..."
99,The Wonderful Adventures of Nils,[Transcriber's note: The inconsistent orthogra...,"[Dunfin, Clement Larsson, Smirre Fox, Morten G...","[Karr, Smirre, Jarro, Mats, Clement, Grayskin,..."


In [None]:
df_save = df_chars.copy()
df_save["characters"] = df_save["characters"].map(lambda x: "_".join(x))
df_save["NER_chars"] = df_save["NER_chars"].map(lambda x: "_".join(x))

In [None]:
df_save.to_csv('new_booklist.csv', sep ='~') 

!cp new_booklist.csv "/content/drive/MyDrive/practiCUM/charlists.csv"

### NER nltk


In [None]:
import nltk
from nltk import word_tokenize,pos_tag

In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
def find_chars_nltk(text):
  nltk_list = []
  for sent in nltk.sent_tokenize(text):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
      if hasattr(chunk,'label'):
        lab =chunk.label()
        txt = ''.join(c[0] for c in chunk)
        nltk_list.append({"label":lab, "text": txt})

  df_book = pd.DataFrame(nltk_list)
  
  return sort_persons(df_book)

In [None]:
df_chars["NER_chars"] = df_chars["full_text"].map(find_chars_nltk)

## NER flair

In [None]:
!pip install flair
from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single
tagger = SequenceTagger.load('ner-ontonotes')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 8.0 MB/s 
[?25hCollecting janome
  Downloading Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.3 MB/s 
Collecting bpemb>=0.3.2
  Downloading bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting conllu>=4.0
  Downloading conllu-4.5.1-py2.py3-none-any.whl (16 kB)
Collecting segtok>=1.5.7
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting transformers>=4.0.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 25.5 MB/s 
[?25hCollecting deprecated>=1.2.4
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting pptree
  Downloading pptree-3.1.tar.gz (3.0 kB)
Collecting hyperopt>=0.2.7
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[



Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

2022-07-26 21:11:27,805 loading file /root/.flair/models/ner-english-ontonotes/f46dcd14689a594a7dd2a8c9c001a34fd55b02fded2528410913c7e88dbe43d4.1207747bf5ae24291205b6f3e7417c8bedd5c32cacfb5a439f3eff38afda66f7
2022-07-26 21:11:35,015 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


In [None]:
def find_chars_flair(text):
  tagger = SequenceTagger.load('ner-ontonotes')
  sentence = [Sentence(sent, use_tokenizer = True) for sent in split_single(text)]
  tagger.predict(sentence)
  flair_list = []

  for sent in sentence:
    for entity in sent.get_spans('ner'):
        entity = str(entity)
        entity = entity.split('"')
        txt= entity[1]

        entity = entity[2].split(' ')
        lab=entity[2]
        flair_list.append({"label":lab, "text": txt})


  df_book = pd.DataFrame(flair_list)
  
  return sort_persons(df_book)

In [None]:
df_chars["NER_chars"] = df_chars["full_text"].map(find_chars_flair)



2022-07-26 21:11:52,701 loading file /root/.flair/models/ner-english-ontonotes/f46dcd14689a594a7dd2a8c9c001a34fd55b02fded2528410913c7e88dbe43d4.1207747bf5ae24291205b6f3e7417c8bedd5c32cacfb5a439f3eff38afda66f7
2022-07-26 21:12:00,229 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY




2022-07-26 21:24:06,532 loading file /root/.flair/models/ner-english-ontonotes/f46dcd14689a594a7dd2a8c9c001a34fd55b02fded2528410913c7e88dbe43d4.1207747bf5ae24291205b6f3e7417c8bedd5c32cacfb5a439f3eff38afda66f7
2022-07-26 21:24:19,246 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY




2022-07-26 21:31:27,838 loading file /root/.flair/models/ner-english-ontonotes/f46dcd14689a594a7dd2a8c9c001a34fd55b02fded2528410913c7e88dbe43d4.1207747bf5ae24291205b6f3e7417c8bedd5c32cacfb5a439f3eff38afda66f7
2022-07-26 21:31:37,242 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY




2022-07-26 21:58:47,239 loading file /root/.flair/models/ner-english-ontonotes/f46dcd14689a594a7dd2a8c9c001a34fd55b02fded2528410913c7e88dbe43d4.1207747bf5ae24291205b6f3e7417c8bedd5c32cacfb5a439f3eff38afda66f7
2022-07-26 21:58:57,799 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


KeyboardInterrupt: ignored

## Evaluation

In [None]:
# Trim list
def trimList(NER_list, Sample_list):
  max_len = len(Sample_list)*2
  if(max_len>len(NER_list)):
    return NER_list
  else:
    return NER_list[:max_len]

# Any Names
def anyNames(NER_list, Sample_list):
  NER_list = " ".join(NER_list).split(" ")
  NER_list = trimList(NER_list, Sample_list)
  NER_list = [x.lower() for x in NER_list]
  Sample_list = [x.lower() for x in Sample_list]

  found_list=[]
  for character in NER_list:
    found_spot= -1
    for i in range(len(Sample_list)):
      if(character in Sample_list[i].split(" ")):
        found_spot=i
        break
    found_list.append(found_spot)
  return found_list


# All Names
def allNames(NER_list, Sample_list):
  found_list=[]
  Sample_list = " ".join(Sample_list).split(" ")
  NER_list = " ".join(NER_list).split(" ")
  NER_list = trimList(NER_list, Sample_list)
  NER_list = [x.lower() for x in NER_list]
  Sample_list = [x.lower() for x in Sample_list]

  for character in NER_list:
    try:
      found_list.append(Sample_list.index(character))
    except ValueError:
      found_list.append(-1)
  return found_list


# Strict Names
def strictNames(NER_list, Sample_list):
  found_list=[]
  NER_list = trimList(NER_list, Sample_list)
  NER_list = [x.lower() for x in NER_list]
  Sample_list = [x.lower() for x in Sample_list]

  for character in NER_list:
    try:
      found_list.append(Sample_list.index(character))
    except ValueError:
      found_list.append(-1)
  return found_list

# Evaluate
def evalNames(found_set, correct_set):
  # Trim to last correct value
  max_len = 0
  for i in reversed(range(len(found_set))):
    if(found_set[i] != -1):
      max_len = i
      break

  found_set = found_set[:max_len+1]


  #find total FN
  FN = len(correct_set)
  correct_set = range(FN)

  for correct in correct_set:
    if(correct in found_set):
      FN= FN-1

  #find FP
  FP = found_set.count(-1)

  #find TP
  TP = len(found_set) - FP

  #calculate F1
  precision = TP / (TP+FP)
  recall = TP / (TP+FN)
  if(precision+recall!=0):
    fmeasure = 2*precision*recall/(precision+recall)
  else:
    fmeasure = 0

  return {"precision":precision, "recall":recall, "fmeasure":fmeasure}




In [None]:
df_chars["Any_NER"] = df_chars.apply(lambda x: evalNames(anyNames(x.NER_chars, x.characters), x.characters), axis=1)

In [None]:
df_chars["All_NER"] = df_chars.apply(lambda x: evalNames(allNames(x.NER_chars, x.characters), x.characters), axis=1)

In [None]:
df_chars["Strict_NER"] = df_chars.apply(lambda x: evalNames(strictNames(x.NER_chars, x.characters), x.characters), axis=1)

In [None]:
df_chars["All_NER"]

0      {'precision': 0.5625, 'recall': 0.947368421052...
1      {'precision': 0.11538461538461539, 'recall': 0...
2      {'precision': 0.20833333333333334, 'recall': 0...
3      {'precision': 0.36363636363636365, 'recall': 0...
4      {'precision': 0.625, 'recall': 0.9259259259259...
                             ...                        
96      {'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}
97     {'precision': 0.23076923076923078, 'recall': 0...
98      {'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}
99     {'precision': 0.42857142857142855, 'recall': 0...
100     {'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}
Name: All_NER, Length: 101, dtype: object

In [None]:
def unpack_fmeasure(column_name):
  df_chars[column_name+"_precision"] = df_chars[column_name].map(lambda x: x["precision"]) 
  df_chars[column_name+"_recall"] = df_chars[column_name].map(lambda x: x["recall"]) 
  df_chars[column_name+"_fmeasure"] = df_chars[column_name].map(lambda x: x["fmeasure"]) 

In [None]:
unpack_fmeasure("Any_NER")
unpack_fmeasure("All_NER")
unpack_fmeasure("Strict_NER")

In [None]:
df_chars["characters"][28]

['Jonathan Harker',
 'Lucy Westenra',
 'Abraham Van Helsing',
 'John Seward',
 'Quincey Morris',
 'Arthur Holmwood',
 'R.M. Renfield',
 'Mina Harker',
 'Dracula']

In [None]:
df_chars["NER_chars"][28]


['Lucy',
 'Van Helsing',
 'Jonathan',
 'Harker',
 'Arthur',
 'Seward',
 'Quincey',
 'Morris',
 'Renfield',
 'Quincey Morris',
 'Mina',
 'Hawkins',
 'Dracula',
 'Czarina Catherine',
 'Westenra',
 "Mina Harker's",
 'Madam Mina',
 'Lucy Westenra',
 "Van Helsing's",
 "Jonathan Harker's",
 'Jack',
 'ye',
 'Arthur Holmwood',
 'Jonathan Harker',
 'Count Dracula',
 'Peter Hawkins',
 'Holmwood',
 'John Seward',
 'Mem',
 'Telegram',
 'Swales',
 'Jonathan Harker’s',
 'Mina Harker',
 'Mina Murray',
 'Bersicker',
 "Lucy Westenra's",
 'Sister Agatha',
 'Turk',
 "Mina Murray's",
 'Kukri',
 'Bilder',
 'Sam',
 'Billington',
 'Sacred Wafer',
 'Yorkshire',
 'Jack Seward',
 'Wafer',
 'Vincent',
 'Bloxam',
 'Szekelys',
 'Thomas Snelling',
 'Skinsky',
 'Mina Murray’s',
 'Mitchell',
 'God',
 'Quincey P. Morris',
 'Art',
 'Buda-Pesth',
 'Abraham Van Helsing',
 'DRACULA',
 'Demeter',
 'Geordie',
 'Carter',
 'M. D.',
 "Robin Hood's",
 'Letter',
 'Keeper',
 'Miss Westenra',
 'St. Joseph',
 'Galatz',
 'John Paxto

In [None]:
df_chars.describe()

Unnamed: 0,num_chars,word_len,letter_len,Any_NER_precision,Any_NER_recall,Any_NER_fmeasure,All_NER_precision,All_NER_recall,All_NER_fmeasure,Strict_NER_precision,Strict_NER_recall,Strict_NER_fmeasure
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,10.227723,81316.089109,448410.841584,0.50417,0.554465,0.512519,0.373791,0.599071,0.444136,0.162475,0.193928,0.165029
std,7.065241,46527.929794,256850.237361,0.297685,0.310217,0.284801,0.229809,0.311224,0.24482,0.200044,0.202538,0.17527
min,3.0,1020.0,5937.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,40015.0,215561.0,0.285714,0.333333,0.333333,0.225806,0.4,0.285714,0.0,0.0,0.0
50%,8.0,78917.0,426682.0,0.533333,0.657895,0.55814,0.375,0.708333,0.5,0.125,0.176471,0.153846
75%,14.0,115663.0,649013.0,0.709677,0.8,0.75,0.535211,0.833333,0.64,0.230769,0.315789,0.277778
max,42.0,184478.0,996938.0,1.0,1.0,1.0,1.0,1.0,0.909091,1.0,1.0,0.909091


## Saving Data

In [None]:
df_save

Unnamed: 0,title,full_text,characters,NER_chars,Any_NER,All_NER,Strict_NER,Any_NER_precision,Any_NER_recall,Any_NER_fmeasure,All_NER_precision,All_NER_recall,All_NER_fmeasure,Strict_NER_precision,Strict_NER_recall,Strict_NER_fmeasure
0,The Great Gatsby,\t\t\t The Great Gatsby \t\t\t\t by \t\t\t F. ...,Nick Carraway_Jay Gatsby_Daisy Buchanan_Tom Bu...,Gatsby_Daisy_Tom_Jordan_Wilson_Baker_Wolfshiem...,"{'precision': 0.75, 'recall': 0.8, 'fmeasure':...","{'precision': 0.5625, 'recall': 0.947368421052...","{'precision': 0.23076923076923078, 'recall': 0...",0.750000,0.800000,0.774194,0.562500,0.947368,0.705882,0.230769,0.375000,0.285714
1,Hamlet,Project Gutenberg's Etext of Shakespeare's The...,Horatio_Polonius_Laertes_Gertrude_Marcellus_Ro...,Rosin_Laer_Pol_thinke_Guild_Guildensterne_Heau...,"{'precision': 0.11538461538461539, 'recall': 0...","{'precision': 0.11538461538461539, 'recall': 0...","{'precision': 0.04, 'recall': 0.0625, 'fmeasur...",0.115385,0.176471,0.139535,0.115385,0.166667,0.136364,0.040000,0.062500,0.048780
2,The Odyssey,cover The Odyssey by Homer Translated by Alexa...,Odysseus_Penelope (wife of Odysseus)_Helen of ...,ye_Thy_Jove_Minerva_Ulysses_Euryclea_Shall_Tro...,"{'precision': 0.25, 'recall': 0.44444444444444...","{'precision': 0.20833333333333334, 'recall': 0...","{'precision': 0.125, 'recall': 0.25, 'fmeasure...",0.250000,0.444444,0.320000,0.208333,0.454545,0.285714,0.125000,0.250000,0.166667
3,Madame Bovary,Madame Bovary By Gustave Flaubert Translated f...,Emma Bovary_Charles Bovary_Monsieur Homais_Ber...,Charles_Emma_Yonville_Bertaux_Rouault_Bovary_L...,"{'precision': 0.5, 'recall': 0.571428571428571...","{'precision': 0.36363636363636365, 'recall': 0...","{'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}",0.500000,0.571429,0.533333,0.363636,0.888889,0.516129,0.000000,0.000000,0.000000
4,Wuthering Heights,Wuthering Heights by Emily Brontë CHAPTER I 18...,Heathcliff_Catherine Earnshaw_Edgar Linton_Isa...,Catherine_Linton_Heathcliff_Cathy_Joseph_Edgar...,"{'precision': 0.75, 'recall': 0.88235294117647...","{'precision': 0.625, 'recall': 0.9259259259259...","{'precision': 0.16666666666666666, 'recall': 0...",0.750000,0.882353,0.810811,0.625000,0.925926,0.746269,0.166667,0.300000,0.214286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,The Pioneers,"THE PIONEERS, BY R.M. BALLANTYNE. PREFACE. Sir...",Meriwether Lewis_William Clark_Thomas Jefferson,Reuben_Lawrence_Swiftarrow_Reuben Guff_Alexand...,"{'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}","{'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}","{'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
97,The Song of Roland,The Song of Roland Translated by C. K. [Charle...,Roland_Ganelon_Oliver_King Marsile_Blancandrin...,Charles_Oliver_Rollanz_Guenelun_Baligant_Tierr...,"{'precision': 0.2857142857142857, 'recall': 0....","{'precision': 0.23076923076923078, 'recall': 0...","{'precision': 0.2857142857142857, 'recall': 0....",0.285714,0.333333,0.307692,0.230769,0.500000,0.315789,0.285714,0.333333,0.307692
98,She,She by H. Rider Haggard First Published 1886. ...,Laurel Mack_Ellie Mack_Paul Mack_Hanna Mack_Fl...,Leo_Job_ye_Kallikrates_Baboon_Kôr_Queen_Billal...,"{'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}","{'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}","{'precision': 0.0, 'recall': 0.0, 'fmeasure': 0}",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
99,The Wonderful Adventures of Nils,[Transcriber's note: The inconsistent orthogra...,Dunfin_Clement Larsson_Smirre Fox_Morten Goose...,Karr_Smirre_Jarro_Mats_Clement_Grayskin_Smirre...,"{'precision': 0.46153846153846156, 'recall': 0...","{'precision': 0.42857142857142855, 'recall': 0...","{'precision': 0.38095238095238093, 'recall': 0...",0.461538,0.705882,0.558140,0.428571,0.833333,0.566038,0.380952,0.615385,0.470588


In [None]:
df_save = df_chars.copy()
df_save = df_save.drop(columns = ["Any_NER","All_NER","Strict_NER"])
df_save["characters"] = df_save["characters"].map(lambda x: "_".join(x))
df_save["NER_chars"] = df_save["NER_chars"].map(lambda x: "_".join(x))

In [None]:
df_save.to_csv('new_booklist.csv', sep ='~') 

!cp new_booklist.csv "/content/drive/MyDrive/practicum/nltk_lists.csv"

In [None]:
df_chars.describe()

Unnamed: 0,Any_NER_precision,Any_NER_recall,Any_NER_fmeasure,All_NER_precision,All_NER_recall,All_NER_fmeasure,Strict_NER_precision,Strict_NER_recall,Strict_NER_fmeasure
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,0.50417,0.554465,0.512519,0.373791,0.599071,0.444136,0.162475,0.193928,0.165029
std,0.297685,0.310217,0.284801,0.229809,0.311224,0.24482,0.200044,0.202538,0.17527
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.285714,0.333333,0.333333,0.225806,0.4,0.285714,0.0,0.0,0.0
50%,0.533333,0.657895,0.55814,0.375,0.708333,0.5,0.125,0.176471,0.153846
75%,0.709677,0.8,0.75,0.535211,0.833333,0.64,0.230769,0.315789,0.277778
max,1.0,1.0,1.0,1.0,1.0,0.909091,1.0,1.0,0.909091
