In [32]:
import os
import re
import time

import shutil
import requests

import numpy as np
import pandas as pd
from tqdm import tqdm

from bs4 import BeautifulSoup as bs

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.core.os_manager import ChromeType
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromiumService

In [33]:
# Start from a clean slate: wipe any previous 'data' tree, then recreate the
# nested folder that the per-season summary files are written into.

stale_output_present = os.path.isdir('data')
if stale_output_present:
    shutil.rmtree('data')
os.makedirs('data/summaries')

In [34]:
# selenium 4
# ChromeService is the alias used when building drivers further down
# (call_webdriver); the other two names repeat imports from the first cell.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# Launches a Chrome session at notebook scope.
# NOTE(review): no later cell visible here reads this `driver` (get_summmary
# builds its own local one) and nothing quits it - confirm whether this
# session is needed; if it is not, remove it or call driver.quit().
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))



In [35]:
# Fetch the season category page and parse its HTML skeleton into a bs4 object

BASE_URL = 'https://breakingbad.fandom.com'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here instead of letting an HTTP error page be
# parsed as if it were the category listing.
r = requests.get(BASE_URL + '/wiki/Category:Seasons_(Better_Call_Saul)', timeout=30)
r.raise_for_status()

soup = bs(r.content, 'lxml')

In [36]:
# Build (title, absolute URL) pairs for every season page in the category listing

# The last members-wrapper on the page is the one that is used.
all_items_S = soup.find_all('div', class_ = 'category-page__members-wrapper')[-1]
season_list = all_items_S.find_all('li', class_ = 'category-page__member')

# Resolve each entry's anchor once, then read both attributes off it.
season_anchors = (member.find('a') for member in season_list)
season_link = [
    (anchor.get('title'), BASE_URL + anchor.get('href'))
    for anchor in season_anchors
]

for title, url in season_link:
    print(f'{title} :: {url}')

Season 1 (Better Call Saul) :: https://breakingbad.fandom.com/wiki/Season_1_(Better_Call_Saul)
Season 2 (Better Call Saul) :: https://breakingbad.fandom.com/wiki/Season_2_(Better_Call_Saul)
Season 3 (Better Call Saul) :: https://breakingbad.fandom.com/wiki/Season_3_(Better_Call_Saul)
Season 4 (Better Call Saul) :: https://breakingbad.fandom.com/wiki/Season_4_(Better_Call_Saul)
Season 5 (Better Call Saul) :: https://breakingbad.fandom.com/wiki/Season_5_(Better_Call_Saul)
Season 6 (Better Call Saul) :: https://breakingbad.fandom.com/wiki/Season_6_(Better_Call_Saul)


In [37]:
# https://pypi.org/project/webdriver-manager/

def call_webdriver():
    """Start and return a new Chrome WebDriver session.

    The value returned by ``ChromeDriverManager().install()`` is handed to a
    ``ChromeService``, which in turn is passed to ``webdriver.Chrome``.
    Every call creates a separate session.
    """
    driver_location = ChromeDriverManager().install()
    chrome_service = ChromeService(driver_location)
    return webdriver.Chrome(service=chrome_service)

In [38]:
def get_summmary(episode_link, season_name):
    """Append one episode's summary paragraphs to its season's text file.

    Opens ``episode_link`` in a fresh Chrome session, collects every ``<p>``
    that follows the first ``<h3>`` under ``#mw-content-text`` and whose ARIA
    role is ``'paragraph'``, and appends each one (newline-terminated) to
    ``data/summaries/<season_name>.txt``.

    Parameters
    ----------
    episode_link : str
        URL of the episode page to load.
    season_name : str
        File stem of the summary file to append to.

    Returns
    -------
    None
    """
    driver = call_webdriver()
    try:
        driver.minimize_window()
        driver.get(episode_link)
        time.sleep(5)  # fixed pause after navigation before the DOM is queried
        all_para = driver.find_elements(
            By.XPATH, '//*[@id="mw-content-text"]/div/h3[1]//following::p'
        )
        # Read the text while the session is still alive; elements cannot be
        # queried once the driver has been shut down.
        paragraphs = [para.text for para in all_para if para.aria_role == 'paragraph']
    finally:
        # quit() ends the whole session (browser and driver process), and the
        # finally clause guarantees that happens even when the page load or
        # an element lookup raises.
        driver.quit()

    # Explicit UTF-8 so non-ASCII characters in the scraped text do not depend
    # on the platform's default encoding.
    with open(f'data/summaries/{season_name}.txt', 'a', encoding='utf-8') as file:
        for text in paragraphs:
            file.write(text + '\n')

In [39]:
def get_starring(episode_link):
    """Return the text of every cast list item on an episode page.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
    first ``div`` with class ``tabber wds-tabber`` is located, and the text of
    every ``<li>`` inside its ``wds-tab__content`` panels is collected in
    document order.

    Parameters
    ----------
    episode_link : str
        URL of the episode page.

    Returns
    -------
    list of str
        One entry per list item; empty when the page has no such tabber block.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Bounded wait plus an explicit status check, so a stalled or failed
    # request surfaces here rather than as a confusing parse failure.
    r = requests.get(episode_link, timeout=30)
    r.raise_for_status()
    soup = bs(r.content, 'lxml')

    starring_block = soup.find('div', class_ = 'tabber wds-tabber')
    if starring_block is None:
        # No cast tabber on this page: report no entries instead of raising
        # AttributeError on None.
        return []

    charactres_class = starring_block.find_all('div', class_ = 'wds-tab__content')

    starrings = [ch.text for char in charactres_class for ch in char.select('li')]

    return starrings

In [41]:
# Walk every season page, collect its episode URLs, then scrape each episode's
# summary (written to disk by get_summmary) and cast list (kept in memory).
episode_links, characters = {}, {}

for season_title, link in tqdm(season_link):

    # Distinct names so the category-page `r` / `soup` from the earlier cell
    # are not overwritten and that cell's follow-up stays re-runnable.
    season_response = requests.get(link, timeout=30)
    season_response.raise_for_status()
    season_soup = bs(season_response.content, 'lxml')

    episodes = [
        BASE_URL + thumb.find('a').get('href')
        for thumb in season_soup.select('#gallery-0 div.thumb')
    ]

    # 'Season 1 (Better Call Saul)' -> 'Season_1'
    season_key = season_title.split(' (')[0].replace(' ', '_')
    episode_links[season_key] = episodes

    # Plain list extension keeps the entries as ordinary strings and also
    # works for a season with no episodes (np.concatenate raises on an
    # empty sequence).
    season_characters = []
    for episode in episodes:
        get_summmary(episode, season_key)
        season_characters.extend(get_starring(episode))

    characters[season_key] = season_characters

episode_total = sum(len(links) for links in episode_links.values())
print(f'Number of episodes recoreded   :: {episode_total}')

100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [20:49<00:00, 208.24s/it]

Number of episodes recoreded   :: 63





In [42]:
# Keep the season -> episode-URL mapping on disk (as the dict's text form) for later use

links_as_text = str(episode_links)
with open('data/bcs_season_nd_episode_links.txt', 'w') as links_file:
    links_file.write(links_as_text)


In [45]:
# Flatten the per-season cast lists into two parallel lists: `chars` holds
# every credit and `seas` holds the matching season key, repeated once per
# credit. Iterating over `characters` itself (insertion order) covers however
# many seasons were scraped instead of a hard-coded set of six keys.
seas, chars = [], []

for season_key, season_characters in characters.items():
    seas.extend([season_key] * len(season_characters))
    chars.extend(season_characters)

In [46]:
# One row per scraped credit, labelled with its season; saved raw before any cleaning.
character_df1 = pd.DataFrame(
    data={
        'Season': seas,
        'Characters': chars,
    }
)
raw_csv_path = 'data/character_df1.csv'
character_df1.to_csv(raw_csv_path, index=False)
character_df1.head()

Unnamed: 0,Season,Characters
0,Season_1,Bob Odenkirk as Jimmy McGill
1,Season_1,Jonathan Banks as Mike Ehrmantraut
2,Season_1,Rhea Seehorn as Kim Wexler
3,Season_1,Patrick Fabian as Howard Hamlin
4,Season_1,Michael Mando as Nacho Varga (credit only)


In [47]:
# Heading plus a 40-dash rule, then the per-season row counts in first-seen order.
print('Number of Characters recorded per season', '-' * 40, sep='\n')
character_df1['Season'].value_counts(sort=False)

Number of Characters recorded per season
----------------------------------------


Season
Season_1    228
Season_2    281
Season_3    282
Season_4    336
Season_5    304
Season_6    389
Name: count, dtype: int64

In [48]:
# Cleaning character_df : reduce each scraped credit to the text after its last ' as '

def _credit_to_character(credit):
    """Normalise odd whitespace in one credit and keep only what follows the last ' as '."""
    # Non-breaking spaces become plain spaces - https://stackoverflow.com/a/11566398
    text = credit.replace('\xa0', ' ').strip()
    text = text.replace('\t', ' ').strip()
    # rpartition leaves the text untouched when ' as ' does not occur.
    text = text.rpartition(' as ')[2].strip()
    # A credit that begins with "as " has no ' as ' separator; drop that prefix here.
    return re.sub(r'^as\s', ' ', text).strip()

character_df1.Characters = character_df1.Characters.apply(_credit_to_character)

In [49]:
# Spot-check a few cleaned rows. The column is selected by label and the row
# by position; using an integer key such as `[1]` on a label-indexed row
# Series relies on positional fallback, which pandas 2.x deprecates.
for idx in [19, 20, 32, 1230]:
    print(character_df1['Characters'].iloc[idx])

Prosecutor
Defendant #1
Jimmy McGill
Judge Matlof


In [50]:
# Processing character names

def _normalise_character_name(name):
    """Strip numbering, parenthesised notes and surplus whitespace from one name."""
    # '#' together with any digits right after it, e.g. 'Defendant #1' -> 'Defendant'
    text = re.sub(r'#\d*', ' ', name).strip()
    # Greedy match: everything from the first '(' to the last ')' is removed.
    text = re.sub(r'\(.*\)', ' ', text).strip()
    text = re.sub(r'CarWash Patron', 'Car Wash Patron', text).strip()
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text).strip()

character_df1.Characters = character_df1.Characters.apply(_normalise_character_name)

In [51]:
# Row count versus number of distinct names after cleaning.
total_rows = len(character_df1.Characters)
distinct_names = character_df1.Characters.nunique()

print(f'Total number of characters\t\t:: {total_rows}')
print(f'Total number of unique characters\t:: {distinct_names}')

Total number of characters		:: 1820
Total number of unique characters	:: 737


In [52]:
# Preview the first rows of the frame after the cleaning cells have run.
character_df1.head()

Unnamed: 0,Season,Characters
0,Season_1,Jimmy McGill
1,Season_1,Mike Ehrmantraut
2,Season_1,Kim Wexler
3,Season_1,Howard Hamlin
4,Season_1,Nacho Varga


In [53]:
# Write the cleaned names to their own CSV (without the index column), then show the last rows.
cleaned_csv_path = 'data/character_df1_cleaned.csv'
character_df1.to_csv(cleaned_csv_path, index=False)
character_df1.tail()

Unnamed: 0,Season,Characters
1815,Season_6,Face Bandage Inmate
1816,Season_6,Squinty Inmate
1817,Season_6,Bakery Inmate
1818,Season_6,Schlubby Guy
1819,Season_6,Hot Chick Cop
