In [1]:
# Import libraries necessary for Webscraping and working with CSV Files
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Define the URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature"

# Send a GET request to fetch the contents of the page.
# The timeout (in seconds) keeps this cell from hanging indefinitely when the
# server does not answer.
# NOTE(review): if the request is rejected, check whether the server expects a
# descriptive User-Agent header and pass one via `headers=` — TODO confirm.
response = requests.get(url, timeout=30)

# Stop here with an HTTPError on a 4xx/5xx reply instead of silently parsing
# an error page in the cells below
response.raise_for_status()

# Parse the page content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

In [3]:
# Use BeautifulSoup to find the first table with the 'wikitable' class
table = soup.find('table', class_='wikitable')

# soup.find() returns None when nothing matches; report that here rather than
# letting a later cell fail with an AttributeError on None
if table is None:
    raise ValueError("No table with class 'wikitable' was found on the page.")

In [4]:
# Empty list that will collect one record per table row
data = list()

In [5]:
# Walk the table row by row (the first <tr> is the header) and collect one
# record per row: [year, laureate, country, language, citation, genres].
#
# A year shared by several laureates is written as one year cell carrying a
# rowspan attribute; the following (rowspan - 1) rows omit that cell, so their
# remaining columns sit one position further left. Those continuation rows are
# tracked with a counter rather than read ahead with find_next_sibling(): they
# are also part of the find_all('tr') sequence, so reading ahead made the outer
# loop visit them a second time and treat their first cell as a year.

# Start from an empty list so that re-running this cell does not append
# the same rows again
data = []


def _cell_text(cells, index):
    """Return the stripped text of cells[index], or "N/A" if the row is too short."""
    return cells[index].get_text(strip=True) if len(cells) > index else "N/A"


year = None            # year label of the group currently being read
continuation_rows = 0  # rows still covered by the current year cell's rowspan

for row in table.find_all('tr')[1:]:  # [1:] skips the header row
    # Extract all cells (columns) in the current row
    cells = row.find_all(['th', 'td'])

    if continuation_rows > 0:
        # Continuation row of a multi-laureate year: there is no year cell,
        # so the laureate is at index 1 and `year` is carried over
        continuation_rows -= 1
        first = 1
    else:
        # Skip rows with no data
        if not cells:
            continue

        # Extract the year from the first cell (some years are links)
        year_cell = cells[0]
        year_link = year_cell.find('a')
        year = year_link.text.strip() if year_link else year_cell.text.strip()

        # Remember how many of the following rows share this year cell
        continuation_rows = int(year_cell.get('rowspan', 1)) - 1
        first = 2  # the laureate is at index 2 in rows that carry a year cell

    # Laureate, country, language, citation and genres occupy five
    # consecutive cells starting at `first`; missing cells become "N/A"
    laureate, country, language, citation, genres = (
        _cell_text(cells, first + i) for i in range(5)
    )

    # Append the extracted data to the list
    data.append([year, laureate, country, language, citation, genres])

In [6]:
# Build a DataFrame from the collected rows, one named column per field
df = pd.DataFrame(
    data=data,
    columns=[
        "Year",
        "Laureate",
        "Country",
        "Language",
        "Citation",
        "Genres",
    ],
)

In [7]:
# Name of the output CSV file
csv_file = "nobel_laureates_literature.csv"

# Write the DataFrame out as UTF-8 CSV, leaving out the index column
df.to_csv(path_or_buf=csv_file, encoding='utf-8', index=False)

# Confirm that the file has been written
print(f"Data has been successfully saved to {csv_file}.")

Data has been successfully saved to nobel_laureates_literature.csv.
