# Looted Benin Art Work Distribution

Scrape <a href="https://digitalbenin.org/">the Benin site</a> to create a dataframe that contains the following scraped information about each institution:

- Museum name
- Country
- Number of disputed items

Export as a ```disputed-benin-artwork.csv```

In [69]:
##Import libraries

import requests 
import pandas as pd
from bs4 import BeautifulSoup
import re
import glob
import spacy

In [6]:
##Import url

url = "https://ww2.nycourts.gov/courts/1jd/supctmanh/judicial_assignments.shtml"
response = requests.get(url)

In [7]:
##cheking if we connected
response.status_code

200

In [8]:
##Checking type

type(response)

requests.models.Response

In [9]:
#Transforming type to strings and check type

type(response.text)

str

In [10]:
##Calling response

response.text

'<!DOCTYPE html>\n<html lang="en" dir="ltr">\n  <head>\n    <meta charset="utf-8" /><script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"aa5723b152",applicationID:"65783686"};;/*! For license information please see nr-loader-rum-1.246.1.min.js.LICENSE.txt */\n(()=>{"use strict";var e,t,n={234:(e,t,n)=>{n.d(t,{P_:()=>h,Mt:()=>m,C5:()=>s,DL:()=>w,OP:()=>j,lF:()=>S,Yu:()=>_,Dg:()=>v,CX:()=>c,GE:()=>A,sU:()=>T});var r=n(8632),i=n(9567);const a={beacon:r.ce.beacon,errorBeacon:r.ce.errorBeacon,licenseKey:void 0,applicationID:void 0,sa:void 0,queueTime:void 0,applicationTime:void 0,ttGuid:void 0,user:void 0,account:void 0,product:void 0,extra:void 0,jsAttributes:{},userAttributes:void 0,atts:void 0,transactionName:void 0,tNamePlain:void 0},o={};function s(e){if(!e)throw new Error("All info objects require an agent identifier!");if(!o[e])throw new Error("Info for ".concat(e," was never set"))

In [11]:
##Transform content

response.content

b'<!DOCTYPE html>\n<html lang="en" dir="ltr">\n  <head>\n    <meta charset="utf-8" /><script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"aa5723b152",applicationID:"65783686"};;/*! For license information please see nr-loader-rum-1.246.1.min.js.LICENSE.txt */\n(()=>{"use strict";var e,t,n={234:(e,t,n)=>{n.d(t,{P_:()=>h,Mt:()=>m,C5:()=>s,DL:()=>w,OP:()=>j,lF:()=>S,Yu:()=>_,Dg:()=>v,CX:()=>c,GE:()=>A,sU:()=>T});var r=n(8632),i=n(9567);const a={beacon:r.ce.beacon,errorBeacon:r.ce.errorBeacon,licenseKey:void 0,applicationID:void 0,sa:void 0,queueTime:void 0,applicationTime:void 0,ttGuid:void 0,user:void 0,account:void 0,product:void 0,extra:void 0,jsAttributes:{},userAttributes:void 0,atts:void 0,transactionName:void 0,tNamePlain:void 0},o={};function s(e){if(!e)throw new Error("All info objects require an agent identifier!");if(!o[e])throw new Error("Info for ".concat(e," was never set")

In [12]:
##Cheking type once again
type(response.content)

bytes

In [13]:
##creating soup

soup = BeautifulSoup(response.text, "html.parser")
soup

<!DOCTYPE html>

<html dir="ltr" lang="en">
<head>
<meta charset="utf-8"/><script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"aa5723b152",applicationID:"65783686"};;/*! For license information please see nr-loader-rum-1.246.1.min.js.LICENSE.txt */
(()=>{"use strict";var e,t,n={234:(e,t,n)=>{n.d(t,{P_:()=>h,Mt:()=>m,C5:()=>s,DL:()=>w,OP:()=>j,lF:()=>S,Yu:()=>_,Dg:()=>v,CX:()=>c,GE:()=>A,sU:()=>T});var r=n(8632),i=n(9567);const a={beacon:r.ce.beacon,errorBeacon:r.ce.errorBeacon,licenseKey:void 0,applicationID:void 0,sa:void 0,queueTime:void 0,applicationTime:void 0,ttGuid:void 0,user:void 0,account:void 0,product:void 0,extra:void 0,jsAttributes:{},userAttributes:void 0,atts:void 0,transactionName:void 0,tNamePlain:void 0},o={};function s(e){if(!e)throw new Error("All info objects require an agent identifier!");if(!o[e])throw new Error("Info for ".concat(e," was never set"));return o[e

In [14]:
##Cleaning data with prettify and printing

print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"aa5723b152",applicationID:"65783686"};;/*! For license information please see nr-loader-rum-1.246.1.min.js.LICENSE.txt */
(()=>{"use strict";var e,t,n={234:(e,t,n)=>{n.d(t,{P_:()=>h,Mt:()=>m,C5:()=>s,DL:()=>w,OP:()=>j,lF:()=>S,Yu:()=>_,Dg:()=>v,CX:()=>c,GE:()=>A,sU:()=>T});var r=n(8632),i=n(9567);const a={beacon:r.ce.beacon,errorBeacon:r.ce.errorBeacon,licenseKey:void 0,applicationID:void 0,sa:void 0,queueTime:void 0,applicationTime:void 0,ttGuid:void 0,user:void 0,account:void 0,product:void 0,extra:void 0,jsAttributes:{},userAttributes:void 0,atts:void 0,transactionName:void 0,tNamePlain:void 0},o={};function s(e){if(!e)throw new Error("All info objects require an agent identifier!");if(!o[e])throw new Error("Info for ".concat(e," was never set"));r

In [15]:
##Cheking type
type(soup)

bs4.BeautifulSoup

In [40]:
##Filtering out first element
judges = soup.find_all("h2", class_ = "content_header")
judges

[<h2 class="content_header content_header_steelblue" id="A"><strong>HON. DEBORAH A. KAPLAN</strong></h2>,
 <h2 class="content_header content_header_steelblue"><strong>HON. ADAM SILVERA</strong></h2>,
 <h2 class="content_header content_header_steelblue">HON. SUZANNE ADAMS</h2>,
 <h2 class="content_header content_header_steelblue" id="B">HON. NANCY M. BANNON</h2>,
 <h2 class="content_header content_header_steelblue">HON. ARLENE BLUTH</h2>,
 <h2 class="content_header content_header_steelblue">HON. ANDREW BORROK</h2>,
 <h2 class="content_header content_header_steelblue" id="C">HON. MARGARET A. CHAN</h2>,
 <h2 class="content_header content_header_steelblue">HON. ARIEL D. CHESLER</h2>,
 <h2 class="content_header content_header_steelblue">HON. JAMES G. CLYNES</h2>,
 <h2 class="content_header content_header_steelblue">HON. DAVID COHEN</h2>,
 <h2 class="content_header content_header_steelblue">HON. JOEL M. COHEN</h2>,
 <h2 class="content_header content_header_steelblue">HON. MELISSA CRANE</h2>,

In [41]:
##Checking type of element
type(judges)

bs4.element.ResultSet

In [42]:
##Checking lenght of element
len(judges)

55

In [43]:
##Getting text
judges_name = []
for judge in judges:
    print(judge.get_text())
    print("*********")

HON. DEBORAH A. KAPLAN
*********
HON. ADAM SILVERA
*********
HON. SUZANNE ADAMS
*********
HON. NANCY M. BANNON
*********
HON. ARLENE BLUTH
*********
HON. ANDREW BORROK
*********
HON. MARGARET A. CHAN
*********
HON. ARIEL D. CHESLER
*********
HON. JAMES G. CLYNES
*********
HON. DAVID COHEN
*********
HON. JOEL M. COHEN
*********
HON. MELISSA CRANE
*********
HON. JAMES D’AUGUSTE
*********
HON. TANDRA DAWSON
*********
HON. DENISE M. DOMINGUEZ
*********
HON. ERIKA M. EDWARDS
*********
HON. ARTHUR F. ENGORON
*********
HON. LYLE E. FRANK
*********
HON. PAUL ALLAN GOETZ
*********
HON. SHLOMO S. HAGLER
*********
HON. LISA HEADLEY
*********
HON. DOUGLAS E. HOFFMAN
*********
HON. DEBRA JAMES
*********
HON. TA-TANISHA D. JAMES
*********
HON. FRANCIS A. KAHN III
*********
HON. MICHAEL L. KATZ
*********
HON. JOHN J. KELLEY
*********
HON. JUDY H. KIM
*********
HON. KATHY J. KING
*********
HON. LYNN R. KOTLER
*********
HON. SABRINA B. KRAUS
*********
HON. RICHARD G. LATIN
*********
HON. GERALD LEBOVIT

In [44]:
##Storaging text

judges_name_fl = []
for judge in judges:
    judges_name_fl.append(judge.get_text())
judges_name_fl

['HON. DEBORAH A. KAPLAN',
 'HON. ADAM SILVERA',
 'HON. SUZANNE ADAMS',
 'HON. NANCY M. BANNON',
 'HON. ARLENE BLUTH',
 'HON. ANDREW BORROK',
 'HON. MARGARET A. CHAN',
 'HON. ARIEL D. CHESLER',
 'HON. JAMES G. CLYNES',
 'HON. DAVID COHEN',
 'HON. JOEL M. COHEN',
 'HON. MELISSA CRANE',
 'HON. JAMES D’AUGUSTE',
 'HON. TANDRA DAWSON',
 'HON. DENISE M. DOMINGUEZ',
 'HON. ERIKA M. EDWARDS',
 'HON. ARTHUR F. ENGORON',
 'HON. LYLE E. FRANK',
 'HON. PAUL ALLAN GOETZ',
 'HON. SHLOMO S. HAGLER',
 'HON. LISA HEADLEY',
 'HON. DOUGLAS E. HOFFMAN',
 'HON. DEBRA JAMES',
 'HON. TA-TANISHA D. JAMES',
 'HON. FRANCIS A. KAHN III',
 'HON. MICHAEL L. KATZ',
 'HON. JOHN J. KELLEY',
 'HON. JUDY H. KIM',
 'HON. KATHY J. KING',
 'HON. LYNN R. KOTLER',
 'HON. SABRINA B. KRAUS',
 'HON. RICHARD G. LATIN',
 'HON. GERALD LEBOVITS',
 'HON. ANDREA MASLEY',
 'HON. JUDITH McMAHON',
 'HON. EMILY MORALES-MINERVA',
 'HON. NICHOLAS MOYNE',
 'HON. FRANK P. NERVO',
 'HON. LOUIS L. NOCK',
 'HON. BARRY OSTRAGER',
 'HON. PHAEDR

In [59]:
# Find and extract all text from the HTML
all_text = soup.get_text()



In [60]:
# Print or save the extracted text
print(all_text)















Judicial Assignments & Locations | NYCOURTS.GOV





      Skip to main content
    






Main Menu >


Home


The Courts


E-Courts


Representing Yourself


The Law


Jurors


Judges


Legal Profession


Topics A to Z




NYCOURTS.GOV









SEARCH NYCourts.gov







NYCOURTS.GOV

New York State Unified Court System




1st JD - Supreme Court, Civil Branch, NY County









 



 











HOME


ACCESSIBILITY (ADA) 


Statewide Info


Local ADA Info




ABOUT THE COURT


General Overview


History


Offices & Functions


Locations & Directions


Juror Information


Help Center


Courthouse Technology


Attorney IDs


Court Terms/Holidays


Court Tours


Foreclosure Auctions




CASE PROCESSING


 Commencement of Cases


RJIs & Assignments


Motions & Applications


Conferences & Case Management


Guardianship / Fiduciary Cases


Matrimonial (Divorce) Litigation


Trials


Subpoenaed Records/Interpreters/Withdrawal


Court Records


Court Reporters




COURT RES

In [70]:
conda install -c conda-forge spacy-model-en_core_web_sm

Retrieving notices: ...working... done
done
Solving environment: done


  current version: 23.1.0
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.10.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [71]:
import en_core_web_sm

In [72]:
## build nlp pipeline (a function will tokenize, parse and ner for us)
nlp = en_core_web_sm.load()

In [73]:
doc = nlp(all_text)

In [74]:
for item in doc:
    print(item)
    print("*******")
















*******
Judicial
*******
Assignments
*******
&
*******
Locations
*******
|
*******
NYCOURTS.GOV
*******






      
*******
Skip
*******
to
*******
main
*******
content
*******

    







*******
Main
*******
Menu
*******
>
*******




*******
Home
*******




*******
The
*******
Courts
*******




*******
E
*******
-
*******
Courts
*******




*******
Representing
*******
Yourself
*******




*******
The
*******
Law
*******




*******
Jurors
*******




*******
Judges
*******




*******
Legal
*******
Profession
*******




*******
Topics
*******
A
*******
to
*******
Z
*******






*******
NYCOURTS.GOV
*******











*******
SEARCH
*******
NYCourts.gov
*******









*******
NYCOURTS.GOV
*******



*******
New
*******
York
*******
State
*******
Unified
*******
Court
*******
System
*******






*******
1st
*******
JD
*******
-
*******
Supreme
*******
Court
*******
,
*******
Civil
*******
Branch
*******
,
*******
NY
*******
County
*******










 



 






In [91]:

## function to find entities
def show_entities(my_text):
  '''
  my_text must be a spacy doc tokenized object; already run through nlp pipeline

  '''
  each_token = "Token"
  entity_type = "Entity"
  entity_def = "Entity Defined"
  print(f"{each_token:{30}}{entity_type:{15}}{entity_def}")
  if my_text.ents:
      for word in my_text.ents:
          print(f"{word.text:{30}} {word.label_:{15}} {str(spacy.explain(word.label_))}")
  else:
      print("There are no entities in this text")


In [93]:
print(my_text)

NameError: name 'my_text' is not defined

In [77]:
type(museum_name_fl)

list

In [54]:
##Filtering out second element

charge = soup.find_all("p")
charge


[<p><a href="https://www.nycourts.gov/google/sitewide.shtml" style="text-decoration:underline;">SEARCH NYCourts.gov</a></p>,
 <p> </p>,
 <p><a href="https://www.nycourts.gov/legacypdfs/courts/1jd/supctmanh/Rules/Uniform_Rules.pdf" rel="noopener" target="_blank" title="Uniform Rules of the Justices"><strong>Uniform Rules of the Justices</strong></a></p>,
 <p>Select from below to view judicial assignments ordered by last name.</p>,
 <p><a href="#A">A</a> <a href="#B">B</a> <a href="#C">C</a> <a href="#D">D</a> <a href="#E">E</a> <a href="#F">F</a> <a href="#G">G</a> <a href="#H">H</a> <a href="#I">I</a> <a href="#J">J</a> <a href="#K">K</a> <a href="#L">L</a> <a href="#M">M</a> <a href="#N">N</a> <a href="#O">O</a> <a href="#P">P</a> <a href="#Q">Q</a> <a href="#R">R</a> <a href="#S">S</a> <a href="#T">T</a> <a href="#U">U</a> <a href="#V">V</a> <a href="#W">W</a> <a href="#X">X</a> <a href="#Y">Y</a> <a href="#Z">Z</a></p>,
 <p> </p>,
 <p><strong>Deputy Chief Administrative Judge for Co

In [79]:
len(countries)

131

In [80]:
##Getting text

country_name = []
for country in countries:
    print(country.get_text())
    print("*********")

United Kingdom
*********
Germany
*********
United States
*********
United Kingdom
*********
Nigeria
*********
Germany
*********
Austria
*********
United States
*********
Germany
*********
United States
*********
United Kingdom
*********
Netherlands
*********
Germany
*********
Nigeria
*********
United Kingdom
*********
United Kingdom
*********
United Kingdom
*********
Germany
*********
United States
*********
Germany
*********
Sweden
*********
United States
*********
Ireland
*********
United States
*********
United States
*********
France
*********
United Kingdom
*********
Germany
*********
Russia
*********
Germany
*********
United States
*********
United Kingdom
*********
Norway
*********
Switzerland
*********
United Kingdom
*********
Germany
*********
Switzerland
*********
United States
*********
New Zealand
*********
United States
*********
Switzerland
*********
United Kingdom
*********
United Kingdom
*********
United States
*********
United Kingdom
*********
Germany
*********
Israel

In [81]:
##Appending elements

country_name_fl = []
for country in countries:
    country_name_fl.append(country.get_text())
country_name_fl

['United Kingdom',
 'Germany',
 'United States',
 'United Kingdom',
 'Nigeria',
 'Germany',
 'Austria',
 'United States',
 'Germany',
 'United States',
 'United Kingdom',
 'Netherlands',
 'Germany',
 'Nigeria',
 'United Kingdom',
 'United Kingdom',
 'United Kingdom',
 'Germany',
 'United States',
 'Germany',
 'Sweden',
 'United States',
 'Ireland',
 'United States',
 'United States',
 'France',
 'United Kingdom',
 'Germany',
 'Russia',
 'Germany',
 'United States',
 'United Kingdom',
 'Norway',
 'Switzerland',
 'United Kingdom',
 'Germany',
 'Switzerland',
 'United States',
 'New Zealand',
 'United States',
 'Switzerland',
 'United Kingdom',
 'United Kingdom',
 'United States',
 'United Kingdom',
 'Germany',
 'Israel',
 'Australia',
 'Germany',
 'United Kingdom',
 'United States',
 'United States',
 'United Kingdom',
 'Switzerland',
 'United States',
 'United States',
 'Switzerland',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'Denmark',
 

In [84]:
type(country_name_fl)

list

In [97]:
##Filtering our third element

items = soup.find_all("div", class_ = "d-inline object_count")
items

[<div class="d-inline object_count" count_default="944">944</div>,
 <div class="d-inline object_count" count_default="518">518</div>,
 <div class="d-inline object_count" count_default="393">393</div>,
 <div class="d-inline object_count" count_default="350">350</div>,
 <div class="d-inline object_count" count_default="285">285</div>,
 <div class="d-inline object_count" count_default="283">283</div>,
 <div class="d-inline object_count" count_default="202">202</div>,
 <div class="d-inline object_count" count_default="188">188</div>,
 <div class="d-inline object_count" count_default="179">179</div>,
 <div class="d-inline object_count" count_default="154">154</div>,
 <div class="d-inline object_count" count_default="148">148</div>,
 <div class="d-inline object_count" count_default="122">122</div>,
 <div class="d-inline object_count" count_default="92">92</div>,
 <div class="d-inline object_count" count_default="81">81</div>,
 <div class="d-inline object_count" count_default="74">74</div>,
 

In [98]:
##Getting text
disputed_items = []
for item in items:
    print(item.get_text())
    print("*********")

944
*********
518
*********
393
*********
350
*********
285
*********
283
*********
202
*********
188
*********
179
*********
154
*********
148
*********
122
*********
92
*********
81
*********
74
*********
72
*********
71
*********
69
*********
64
*********
55
*********
53
*********
48
*********
46
*********
43
*********
37
*********
35
*********
32
*********
32
*********
28
*********
24
*********
23
*********
23
*********
23
*********
20
*********
18
*********
18
*********
17
*********
16
*********
15
*********
15
*********
14
*********
14
*********
13
*********
12
*********
10
*********
10
*********
9
*********
9
*********
9
*********
8
*********
8
*********
8
*********
8
*********
8
*********
7
*********
7
*********
7
*********
7
*********
7
*********
6
*********
6
*********
5
*********
5
*********
5
*********
5
*********
5
*********
4
*********
4
*********
4
*********
4
*********
4
*********
4
*********
4
*********
3
*********
3
*********
3
*********
3
*********
3
*********
3
****

In [99]:
##appending text
disputed_items_fl = []
for item in items:
    disputed_items_fl.append(item.get_text())
disputed_items_fl

['944',
 '518',
 '393',
 '350',
 '285',
 '283',
 '202',
 '188',
 '179',
 '154',
 '148',
 '122',
 '92',
 '81',
 '74',
 '72',
 '71',
 '69',
 '64',
 '55',
 '53',
 '48',
 '46',
 '43',
 '37',
 '35',
 '32',
 '32',
 '28',
 '24',
 '23',
 '23',
 '23',
 '20',
 '18',
 '18',
 '17',
 '16',
 '15',
 '15',
 '14',
 '14',
 '13',
 '12',
 '10',
 '10',
 '9',
 '9',
 '9',
 '8',
 '8',
 '8',
 '8',
 '8',
 '7',
 '7',
 '7',
 '7',
 '7',
 '6',
 '6',
 '5',
 '5',
 '5',
 '5',
 '5',
 '4',
 '4',
 '4',
 '4',
 '4',
 '4',
 '4',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1']

In [100]:
len(disputed_items_fl)

131

In [102]:
##Creating list of items

museums_list = []
for item in zip (museum_name_fl, country_name_fl, disputed_items_fl):
    museums_list.append(item)

museums_list

[('British Museum', 'United Kingdom', '944'),
 ('Ethnologisches Museum, Staatliche Museen zu Berlin', 'Germany', '518'),
 ('Field Museum', 'United States', '393'),
 ('Museum of Archaeology and Anthropology, University of Cambridge',
  'United Kingdom',
  '350'),
 ('National Museum, Benin', 'Nigeria', '285'),
 ('Staatliche Ethnographische Sammlungen Sachsen und Staatliche Kunstsammlungen Dresden',
  'Germany',
  '283'),
 ('Weltmuseum Wien', 'Austria', '202'),
 ('University of Pennsylvania Museum of Archaeology and Anthropology (Penn Museum)',
  'United States',
  '188'),
 ('MARKK Museum am Rothenbaum Kulturen und Künste der Welt', 'Germany', '179'),
 ('Metropolitan Museum of Art', 'United States', '154'),
 ('Pitt Rivers Museum', 'United Kingdom', '148'),
 ('Nationaal Museum van Wereldculturen and Wereldmuseum',
  'Netherlands',
  '122'),
 ('Rautenstrauch-Joest-Museum', 'Germany', '92'),
 ('National Museum, Lagos', 'Nigeria', '81'),
 ('National Museums Scotland', 'United Kingdom', '74'),

In [103]:
##Transforming dataframe and putting name of the columns

df = pd.DataFrame(museums_list)
df.columns = ("Museum name", "Country", "Number of disputed items")
df

Unnamed: 0,Museum name,Country,Number of disputed items
0,British Museum,United Kingdom,944
1,"Ethnologisches Museum, Staatliche Museen zu Be...",Germany,518
2,Field Museum,United States,393
3,"Museum of Archaeology and Anthropology, Univer...",United Kingdom,350
4,"National Museum, Benin",Nigeria,285
...,...,...,...
126,"Allen Memorial Art Museum, Oberlin College",United States,1
127,Newark Museum of Art,United States,1
128,LACMA The Los Angeles County Museum of Art,United States,1
129,Hood Museum of Art,United States,1


In [105]:
##Exporting to csv
df.to_csv("disputed-benin-artwork.csv", index = False, encoding = "UTF-8")