In [1]:
import selenium

In [5]:
import pandas as pd
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import os
import logging

In [None]:
pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.7.1-cp311-cp311-macosx_10_12_x86_64.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.0.7-cp311-cp311-macosx_10_9_x86_64.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.39.4-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting kiwisolver>=1.0.1 (from matplotlib)
  Downloading kiwisolver-1.4.4-cp311-cp311-macosx_10_9_x86_64.whl (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

#### This whole project is about extracting the relationships between the characters in the witcher book series using NLP. After that, network analysis is applied to transform 1000's of pages into a interactive network graph, to help us better understand the dynamics between the characters

#### In the witcherwiki website, there is a list of all the books and the each book page contains the list of characters that appear in the book.Our task is to extract all the character names in the books using selenium.

### <font color = blue> Why selenium instead of BeautifulSoup or Scrapy?

### <font color = teal> Selenium

#### All of them individually can be used for web scrapping purposes or together which would optimise the process. 
#### Selenium is not actually a web scraping library, it is a browser automation library. It is used to auutomate tasks that you may do as humans on a website like
#### opening a browser, moving a mouse, clicking a button etc., So selenium is great for scrapping dynamic web pages that use javascript to store content. Examples websites which have "See More" or "Next" buttons,  if have to scroll down to see more data , input some data into a form to be able to see more data.
#### Selenium can interact with all of these elements and help to get the data out.
#### Notice that the URL might not change upon those actions, so other libraries that only get data through URL's can struggle in this case.
#### The downsides of Selenium is the datasize it can handle is more limited than Scrapy.

### <font color = teal > Scrapy

#### Scrapy is not even a library, it is a complete web scrapping framework. So it can be used to create a complete spider that can crawl through the entire website in a systematic way. It is fast, powerful and can handle large data. For serious and big web scrapping projects it is better to use Scrapy. To handle dynamic web pages, we can insert Selenium in the parse of the Scrapy spider to automate the clicking, scrolling etc., so that we have best of both worlds.

#### However, Scrapy is not beginner friendly and is more suited for advanced users.

### <font color = teal > BeautifulSoup

#### BeautifulSoup on the other hand, is a parsing library. There are two steps for scrapping data, first is getting the data from the website and then to parse it and save the output. BeautifulSoup only does the second bit, to actually pull the data from a website, you will need "requests" or other libraries.

#### BeautifulSoup can automatically detect the structures in the html and xml documents and find what you need. It is beginner friendly and a perfect library for small projects.

#### The downsides are that it is a bit slow, has few dependencies.

### <font color = teal > Conclusion

#### This web scrapping project is quite small and simple, so I chose Selenium.

### Create Driver

In [25]:
driver = webdriver.Chrome(ChromeDriverManager().install())

  driver = webdriver.Chrome(ChromeDriverManager().install())


In [26]:
page_url = "https://witcher.fandom.com/wiki/Category:Characters_in_the_stories"
driver.get(page_url)

In [27]:
# Find books
book_categories = driver.find_elements(by=By.CLASS_NAME, value='category-page__member-link')

In [28]:
books = []
for category in book_categories:
    book_url = category.get_attribute('href')
    book_name = category.text
    books.append({'book_name': book_name, "url": book_url})

In [29]:
books

[{'book_name': 'Category:Baptism of Fire characters',
  'url': 'https://witcher.fandom.com/wiki/Category:Baptism_of_Fire_characters'},
 {'book_name': 'Category:Blood of Elves characters',
  'url': 'https://witcher.fandom.com/wiki/Category:Blood_of_Elves_characters'},
 {'book_name': "Godamba Thaess'en",
  'url': 'https://witcher.fandom.com/wiki/Godamba_Thaess%27en'},
 {'book_name': 'Category:Season of Storms characters',
  'url': 'https://witcher.fandom.com/wiki/Category:Season_of_Storms_characters'},
 {'book_name': 'Category:Something Ends, Something Begins characters',
  'url': 'https://witcher.fandom.com/wiki/Category:Something_Ends,_Something_Begins_characters'},
 {'book_name': 'Category:Sword of Destiny characters',
  'url': 'https://witcher.fandom.com/wiki/Category:Sword_of_Destiny_characters'},
 {'book_name': 'Category:Szpony i kły characters',
  'url': 'https://witcher.fandom.com/wiki/Category:Szpony_i_k%C5%82y_characters'},
 {'book_name': 'Category:Tales from the world of The W

In [None]:
character_list = []

for book in books:
    # go to book page
    driver.get(book['url'])
    
    character_elems = driver.find_elements(by=By.CLASS_NAME, value = 'category-page__member-link')
    
    for elem in character_elems:
        character_list.append({'book': book['book_name'],'character': elem.text})