## Importing Libraries

In [1]:
# Web scraping
import requests
from bs4 import BeautifulSoup

# Data processing
import numpy as np 
import pandas as pd 

In [2]:
# Stretch the notebook's cell container to 90% of the browser width
from IPython.display import HTML, display

display(HTML("<style>.container { width:90% !important; }</style>"))

## Extracting Data on Stargate SG-1 from IMDB

In [3]:
# Request headers carrying a browser-like user agent; IMDb blocks the
# default one that python sends
user_agent = {"user-agent": "Mozilla/5.0"}

In [4]:
# Base address of the Stargate SG-1 episode listing on IMDb;
# append a season number to get the page for that season
season_URL = "https://www.imdb.com/title/tt0118480/episodes/?season="

In [5]:
# Links to every episode page, collected season by season.
# Re-created on each run so re-executing this cell never duplicates links.
episode_links = []

# Without a timeout, requests waits indefinitely on a stalled connection
timeout_seconds = 30

# Visit the listing page of seasons 1 through 10 and collect every episode link
for season_number in range(1, 11):
    response = requests.get(
        season_URL + str(season_number),
        headers=user_agent,
        timeout=timeout_seconds,
    )
    # Stop on an HTTP error (4xx/5xx) instead of silently parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The last matching anchor on each season page is discarded.
    # NOTE(review): presumably that anchor is not an episode link — confirm against the live page
    episode_list = soup.find_all('a', class_='ipc-title-link-wrapper')[:-1]

    for episode in episode_list:
        # Read the href attribute directly rather than slicing the tag's
        # serialized HTML, which depends on the order attributes are written in.
        # Then drop the query string and any trailing slash so that a path can
        # be appended to the link later.
        episode_path = episode['href'].split("?")[0].rstrip("/")
        episode_links.append("https://imdb.com" + episode_path)

# An empty result means the selector matched nothing; fail here rather than
# letting later cells build an empty table
assert episode_links, "No episode links found; the IMDb page markup may have changed"

In [None]:
# Empty list which will collect one tuple of data per episode.
# Re-created on each run so re-executing this cell never duplicates rows.
data = []
episode_total = 1

# Without a timeout, requests waits indefinitely on a stalled connection
timeout_seconds = 30


def _required_text(page, tag, css_class, url):
    """Return the text of the first `tag` element with class `css_class` in `page`.

    Parameters:
        page: parsed BeautifulSoup document to search.
        tag: name of the HTML tag to look for.
        css_class: value passed to BeautifulSoup's `class_` filter.
        url: address the page was fetched from; used only in the error message.

    Raises:
        ValueError: if no matching element exists, naming the selector and URL
            so a markup change is easy to diagnose.
    """
    element = page.find(tag, class_=css_class)
    if element is None:
        raise ValueError(
            f"No <{tag}> with class '{css_class}' found on {url}; "
            "the page markup may have changed"
        )
    return element.text


for episode_link in episode_links:
    response = requests.get(episode_link, headers=user_agent, timeout=timeout_seconds)
    # Stop on an HTTP error (4xx/5xx) instead of silently parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The season/episode label is split on "." into its two parts
    # NOTE(review): the class names below look auto-generated and may change
    # whenever IMDb redeploys — confirm they still match before a full run
    season_number, episode_number = _required_text(
        soup, 'div', 'sc-3f4e3993-0 fYpskP', episode_link
    ).split(".")
    episode_title = _required_text(soup, 'span', 'hero__primary-text', episode_link)
    episode_rating = _required_text(soup, 'span', 'sc-eb51e184-1 ljxVSS', episode_link)
    # Kept as the text shown on the page; it is not converted to a number here
    num_episode_ratings = _required_text(soup, 'div', 'sc-eb51e184-3 kgbSIj', episode_link)

    # The plot summaries live on a separate page under the episode's address
    summaries_link = episode_link.split("?")[0] + "/plotsummary/?ref_=tt_stry_pl"
    response = requests.get(summaries_link, headers=user_agent, timeout=timeout_seconds)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Keep the text before the first "—" of each block, then every second block
    # NOTE(review): presumably "—" introduces the author credit and the skipped
    # blocks are duplicates — confirm against the live page
    summaries = [
        summary.text.split("—")[0]
        for summary in soup.find_all('div', class_='ipc-html-content-inner-div')
    ][::2]

    data.append((
        episode_total,
        season_number,
        episode_number,
        episode_title,
        float(episode_rating),
        num_episode_ratings,
        summaries,
    ))
    episode_total += 1

In [None]:
# Turn the collected episode tuples into a table, one labelled column per tuple field
IMDB_df = pd.DataFrame(
    data,
    columns=[
        'Episode_Total',
        'Season_Num',
        'Episode_Num',
        'Title',
        'Rating',
        'Num_Ratings',
        'Summaries',
    ],
)
IMDB_df

In [None]:
# Write the table to an Excel workbook (without the index column) so that
# further data can be entered by hand
IMDB_df.to_excel(excel_writer='Stargate_IMDB_Info.xlsx', index=False)