In [1]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

In [2]:
wiki_url = 'https://en.wikipedia.org/wiki/Genome'
wiki_data = urlopen(wiki_url)
wiki_html = wiki_data.read()
wiki_data.close

page_soup = soup(wiki_html, 'html.parser')

In [4]:
print(page_soup.h1)

<h1 class="firstHeading" id="firstHeading">Genome</h1>


In [23]:
genome_table = page_soup.findAll('table', {'class': 'wikitable sortable'})


In [6]:
genome_table = genome_table[0]
headers = genome_table.findAll('th', {})
print(headers)

[<th>Organism type
</th>, <th>Organism
</th>, <th colspan="2">Genome size <br/>(<a href="/wiki/Base_pair" title="Base pair">base pairs</a>)
</th>, <th>Approx. no. of genes
</th>, <th class="unsortable">Note
</th>]


In [12]:
header_titles = []
for header in headers:
    header_titles.append(header.text[:-1])
    print(header_titles)

['Organism type']
['Organism type', 'Organism']
['Organism type', 'Organism', 'Genome size (base pairs)']
['Organism type', 'Organism', 'Genome size (base pairs)', 'Approx. no. of genes']
['Organism type', 'Organism', 'Genome size (base pairs)', 'Approx. no. of genes', 'Note']


In [22]:
all_rows = genome_table.findAll('tr', {})
data = all_rows[1:]

first_row = data[0]
first_row_data = first_row.findAll('td', {})

data_texts = []
for data_text in first_row_data:
    data_texts.append(data_text.text[:-1])

print(data_texts)



['Virus\nViru', 'Porcine circovirus type 1\nPorcine circovirus type ', '1,759\n1,75', '1.8\xa0kB\n1.8\xa0k', '', 'Smallest viruses replicating autonomously in eukaryotic cells.[49]\nSmallest viruses replicating autonomously in eukaryotic cells.[49']


In [24]:
table_rows = []

for row in data:
    table_row = []
    row_data = row.findAll('td', {})
    for data_point in row_data:
        table_row.append(data_point.text[:-1])
    table_rows.append(table_row)
        
print(table_rows)

[['Virus\nViru', 'Porcine circovirus type 1\nPorcine circovirus type ', '1,759\n1,75', '1.8\xa0kB\n1.8\xa0k', '', 'Smallest viruses replicating autonomously in eukaryotic cells.[49]\nSmallest viruses replicating autonomously in eukaryotic cells.[49'], ['Virus', 'Bacteriophage MS2', '3,569', '3.5\xa0kB', '', 'First sequenced RNA-genome[50]'], ['Virus', 'SV40', '5,224', '5.2\xa0kB', '', '[51]'], ['Virus', 'Phage Φ-X174', '5,386', '5.4\xa0kB', '', 'First sequenced DNA-genome[52]'], ['Virus', 'HIV', '9,749', '9.7\xa0kB', '', '[53]'], ['Virus', 'Phage λ', '48,502', '48.5\xa0kB', '', 'Often used as a vector for the cloning of recombinant DNA.\n[54]\n[55]\n[56]\n'], ['Virus', 'Megavirus', '1,259,197', '1.3\xa0MB', '', 'Until 2013 the largest known viral genome.[57]'], ['Virus', 'Pandoravirus salinus', '2,470,000', '2.47\xa0MB', '', 'Largest known viral genome.[58]'], ['Eukaryotic organelle', 'Human mitochondrion', '16,569', '16.6\xa0kB', '', '[59]'], ['Bacterium', 'Nasuia deltocephalinicola (

In [35]:
filename = 'genome_table.csv'
f = open(filename, 'w', encoding="utf-8")
header_string = ''
for title in header_titles:
    header_string += title + ','
header_string = header_string[:-1]
header_string += '\n'
f.write(header_string)
for row in table_rows:
    row_string = ''
    for column in row:
        column_string = column.replace(',', '')  #comma-less version saved to column_string
        row_string += column_string + ','        #updated to refer to column_string
    row_string = row_string[:-1]
    row_string += '\n'
    
    f.write(row_string)
    
f.close()

In [48]:
filename = 'Genome - Wikipedia.html'
f = open(filename, encoding="utf8")
new_soup = soup(f, 'html.parser')
print(new_soup.h1)   #scrape data from a local file , file no longer included deleted for uploading to github

<h1 class="firstHeading" id="firstHeading">Genome</h1>


In [56]:
references_list_raw = page_soup.findAll('ol', {'class': 'references'})
references_list = references_list_raw[0].findAll('li', {})
all_references = []

for list_item in references_list:
    references = []
    for reference in list_item.findAll('a', {}):
        references.append(reference['href'])
    all_references.append(references)
    
print(all_references)




[['#cite_ref-Roth_p._1-0', '//www.ncbi.nlm.nih.gov/pmc/articles/PMC6579593', '/wiki/Doi_(identifier)', 'https://doi.org/10.5195%2Fjmla.2019.604', '/wiki/ISSN_(identifier)', '//www.worldcat.org/issn/1558-9439', '/wiki/PMC_(identifier)', '//www.ncbi.nlm.nih.gov/pmc/articles/PMC6579593', '/wiki/PMID_(identifier)', '//pubmed.ncbi.nlm.nih.gov/31258451'], ['#cite_ref-2', '/wiki/Bibcode_(identifier)', 'https://ui.adsabs.harvard.edu/abs/2009NYASA1178..186B', '/wiki/Doi_(identifier)', 'https://doi.org/10.1111%2Fj.1749-6632.2009.05004.x', '/wiki/PMID_(identifier)', '//pubmed.ncbi.nlm.nih.gov/19845638', '/wiki/S2CID_(identifier)', 'https://api.semanticscholar.org/CorpusID:8279434'], ['#cite_ref-3', '/wiki/Matt_Ridley', 'https://web.archive.org/web/20181024231945/http://bioinformaticsinstitute.ru/sites/default/files/genome_the_autobiography_of_a_species_in_23_chapters_-_matt_ridley.pdf', '/wiki/ISBN_(identifier)', '/wiki/Special:BookSources/978-0-06-019497-0', 'http://bioinformaticsinstitute.ru/si