In [20]:
# Step 0: Import libraries

import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_webpage(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded body of the response (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is never mistaken for real content.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing back the error page's HTML,
    # which callers would otherwise parse and save as if it were the archive.
    response.raise_for_status()
    return response.text

# Step 1: Create a function for getting the URLs together with their link titles

#lowercase and underscores are the normal convention for naming functions in Python. 
#Camelcasing like you had is more normal in JavaScript or for classes.

def get_content(url='', keyword='', filename='', url_head="https://humanist.kdl.kcl.ac.uk"):
    """Collect the links on a page whose visible text contains ``keyword``.

    Args:
        url: Page to download and scan for ``<a>`` tags.
        keyword: Substring to look for in each link's text. The comparison is
            case-insensitive. The default empty string matches every link.
        filename: Path of a JSON file the result is written to. When empty,
            nothing is written.
        url_head: Prefix prepended to each link's ``href`` to build the
            stored URL.

    Returns:
        A dict mapping the lower-cased link text to ``url_head + href``.
        Links that share the same text overwrite each other; the last one on
        the page wins.
    """
    content = scrape_webpage(url)
    soup = BeautifulSoup(content, "html.parser")

    # Link text is lower-cased below, so lower-case the keyword as well;
    # otherwise a keyword containing capitals could never match.
    keyword = keyword.lower()

    result = {}
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            # Anchors without an href have no target, and concatenating
            # None to url_head would raise TypeError.
            continue
        text = link.get_text().lower()
        if keyword in text:
            result[text] = url_head + href

    # Save the mapping as JSON, but only when a file name was given
    # (open('') would raise FileNotFoundError).
    if filename:
        with open(filename, 'w') as file:
            json.dump(result, file, indent=2)

    return result

In [21]:
# Landing page: every link whose text mentions "volume".
res = get_content(
    filename="main_page.json",
    keyword="volume",
    url="https://humanist.kdl.kcl.ac.uk/",
)

# Volume 1: its index lists bare file names, so the directory URL is
# supplied as the prefix for each href.
res_v1 = get_content(
    filename="1st_volume.json",
    keyword="txt",
    url=res['volume 1 5/87-5/88'],
    url_head="https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/",
)

# Volume 33: links whose text mentions "humanist", using the default prefix.
res_v33 = get_content(
    filename="33rd_volume.json",
    keyword="humanist",
    url=res['volume 33'],
)

In [26]:
# Download every file listed for volume 1 and save its text locally.
# Create the output folder first so this cell also runs on a fresh checkout;
# open() does not create missing directories.
os.makedirs('humanist_files', exist_ok=True)

print("\nPrinting the 1st volume:\n")
for key, value in res_v1.items():
    print(key, ':', value)
    text_file = scrape_webpage(value)
    # Use bs4 to extract the text, because the returned text_file has tags in it.
    soup_text = BeautifulSoup(text_file, "html.parser").text
    # The keys of res_v1 are used directly as file names.
    with open('humanist_files/' + key, 'w', encoding='utf-8-sig') as file:
        file.write(soup_text)


Printing the 1st volume:

8705.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8705.1324.txt
8706.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8706.1324.txt
8707.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8707.1324.txt
8708.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8708.1324.txt
8709.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8709.1324.txt
8710.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8710.1324.txt
8711.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8711.1324.txt
8712.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8712.1324.txt
8802.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8802.1324.txt
8803.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8803.1324.txt
8804.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Virginia/v01/8804.1324.txt
biog01.1324.txt : https://humanist.kdl.kcl.ac.uk/Archives/Vi

In [27]:
# Download every message listed for volume 33 and save its text locally.
# Create the output folder first so this cell also runs on a fresh checkout;
# open() does not create missing directories.
os.makedirs('humanist_files', exist_ok=True)

print("\nPrinting the 33rd volume:\n")
for key, value in res_v33.items():
    print(key, ':', value)
    text_file = scrape_webpage(value)
    # Use bs4 to strip the HTML markup and keep only the page text.
    soup_text = BeautifulSoup(text_file, "html.parser").text
    # Append ".txt" so the files are saved with a text extension.
    # NOTE(review): the keys are the link texts, which contain ':'. That
    # character is not allowed in Windows file names, so sanitise the key if
    # this must run there.
    with open('humanist_files/' + key + ".txt", 'w', encoding='utf-8-sig') as file:
        file.write(soup_text)


Printing the 33rd volume:

may 7, 2019, 6:16 a.m. humanist 33.1 : https://humanist.kdl.kcl.ac.uk/volume/33/1
may 7, 2019, 6:21 a.m. humanist 33.2 : https://humanist.kdl.kcl.ac.uk/volume/33/2
may 8, 2019, 6:35 a.m. humanist 33.3 : https://humanist.kdl.kcl.ac.uk/volume/33/3
may 8, 2019, 6:37 a.m. humanist 33.4 : https://humanist.kdl.kcl.ac.uk/volume/33/4
may 8, 2019, 6:41 a.m. humanist 33.5 : https://humanist.kdl.kcl.ac.uk/volume/33/5
may 9, 2019, 6:07 a.m. humanist 33.6 : https://humanist.kdl.kcl.ac.uk/volume/33/6
may 9, 2019, 6:10 a.m. humanist 33.7 : https://humanist.kdl.kcl.ac.uk/volume/33/7
may 9, 2019, 6:12 a.m. humanist 33.8 : https://humanist.kdl.kcl.ac.uk/volume/33/8
may 10, 2019, 5:53 a.m. humanist 33.9 : https://humanist.kdl.kcl.ac.uk/volume/33/9
may 10, 2019, 5:54 a.m. humanist 33.10 : https://humanist.kdl.kcl.ac.uk/volume/33/10
may 10, 2019, 5:57 a.m. humanist 33.11 : https://humanist.kdl.kcl.ac.uk/volume/33/11
may 10, 2019, 6 a.m. humanist 33.12 : https://humanist.kdl.kcl.

In [28]:
# Create the output folder first so this cell also runs on a fresh checkout;
# open() does not create missing directories.
os.makedirs('humanist_files', exist_ok=True)

data = []
Converted = get_content(
    url="https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/",
    keyword="humanist",  # Use the keyword to pick out the volume links
    url_head="https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/",
    filename="Converted_volume.json")  # Links to the txt files are saved in this JSON file

for key, value in Converted.items():
    print(key, ':', value)
    text_file = scrape_webpage(value)
    soup_text = BeautifulSoup(text_file, "html.parser").text
    with open('humanist_files/' + key, 'w', encoding='utf-8-sig') as file:
        file.write(soup_text)
    # The second-to-last dot-separated piece of the URL is the year range,
    # e.g. ".../humanist.1999-2000.txt" -> "1999-2000".
    date = value.split(".")[-2]
    # Store one record per volume. Newlines become spaces rather than being
    # deleted, so the last word of a line is not fused with the first word of
    # the next one.
    data.append({"Date": date, "URL": value, "Text": soup_text.replace('\n', " ")})

# Build the frame once from the list of records.
humanist_vols = pd.DataFrame(data)
print(humanist_vols)
humanist_vols.to_csv('web_scraped_humanist_listserv.csv')

humanist.1987-1988.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1987-1988.txt
humanist.1988-1989.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1988-1989.txt
humanist.1989-1990.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1989-1990.txt
humanist.1990-1991.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1990-1991.txt
humanist.1991-1992.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1991-1992.txt
humanist.1992-1993.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1992-1993.txt
humanist.1993-1994.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1993-1994.txt
humanist.1994-1995.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1994-1995.txt
humanist.1995-1996.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1995-1996.txt
humanist.1996-1997.txt : https://humanist.kdl.kcl.ac.uk/Archives/Converte