## Importing Libraries

In [1]:
# Web scraping
import requests
from bs4 import BeautifulSoup

# Data processing
import numpy as np 
import pandas as pd 

In [2]:
# Inject a CSS override that sets the notebook's .container width to 90%, making cells wider
from IPython.display import display, HTML
wide_cells_css = "<style>.container { width:90% !important; }</style>"
display(HTML(wide_cells_css))

## Extracting Data on Stargate SG-1 from IMDB

In [3]:
# IMDB blocks the default python user agent, so every request is sent with this header instead
user_agent = {
    'user-agent': 'Mozilla/5.0',
}

In [4]:
# Base URL of the Stargate SG-1 episode listing on IMDB; append a season number
# to get the page for that season
season_URL = (
    'https://www.imdb.com/title/tt0118480'
    '/episodes/?season='
)

In [5]:
# Collect the URL of every episode page by walking each season's episode list on IMDB
episode_links = []

# Season pages are requested for seasons 1 through NUM_SEASONS inclusive
NUM_SEASONS = 10

for season_number in range(1, NUM_SEASONS + 1):
    # The timeout stops a stalled connection from hanging the notebook forever, and
    # raise_for_status fails loudly if IMDB rejects the request instead of silently
    # producing a season with no links
    response = requests.get(season_URL + str(season_number), headers=user_agent, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Every anchor with this class except the last one on the page is kept
    # NOTE(review): presumably the last match is not an episode link -- confirm against the live page
    episode_list = soup.find_all('a', class_='ipc-title-link-wrapper')[:-1]

    for episode in episode_list:
        # Read the href attribute directly instead of splitting the tag's HTML on quotes,
        # which only works while href happens to be the second attribute of the tag.
        # The query string is dropped and any trailing slash removed to get a clean path.
        episode_path = episode['href'].split("?")[0].rstrip("/")
        episode_links.append("https://imdb.com" + episode_path)

In [6]:
# Scrape every episode page (plus its plot-summary page) into one tuple per episode
data = []
# Running episode count across all seasons, starting at 1
episode_total = 1

for episode_link in episode_links:
    # The timeout stops a stalled connection from hanging the notebook forever, and
    # raise_for_status reports a rejected request directly instead of letting it surface
    # below as an AttributeError on a missing element
    response = requests.get(episode_link, headers=user_agent, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The season/episode label is one "S1.E1"-style string; split it into its two halves
    # NOTE(review): the 'sc-...' class names look auto-generated and may change between
    # IMDB deployments -- confirm they still match before a re-run
    season_number, episode_number = soup.find('div', class_='sc-3f4e3993-0 fYpskP').text.split(".")
    episode_title = soup.find('span', class_='hero__primary-text').text
    episode_rating = soup.find('span', class_='sc-eb51e184-1 ljxVSS').text
    # Kept as the displayed text (e.g. "3.8K"), not converted to a number
    num_episode_ratings = soup.find('div', class_='sc-eb51e184-3 kgbSIj').text

    # Fetch the plot-summary page for the same episode; without the status check a
    # rejected request here would silently store an empty summaries list
    summaries_link = episode_link.split("?")[0] + "/plotsummary/?ref_=tt_stry_pl"
    response = requests.get(summaries_link, headers=user_agent, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Keep the text before the first em dash of each block, and only every second block
    # NOTE(review): presumably the em dash introduces the author credit and alternate
    # blocks are not wanted -- confirm against the live page
    summaries = [summary.text.split("—")[0] for summary in soup.find_all('div', class_='ipc-html-content-inner-div')][::2]

    data.append((
        episode_total,
        season_number,
        episode_number,
        episode_title,
        float(episode_rating),
        num_episode_ratings,
        summaries,
    ))
    episode_total += 1

In [7]:
# Turn the scraped tuples into a table; column names follow the order in which each tuple was filled
column_names = [
    'Episode_Total',
    'Season_Num',
    'Episode_Num',
    'Title',
    'Rating',
    'Num_Ratings',
    'Summaries',
]
IMDB_df = pd.DataFrame(data=data, columns=column_names)
IMDB_df

Unnamed: 0,Episode_Total,Season_Num,Episode_Num,Title,Rating,Num_Ratings,Summaries
0,1,S1,E1,Children of the Gods,8.1,3.8K,[Colonel Jack O'Neill is brought out of retire...
1,2,S1,E2,The Enemy Within,7.6,1.9K,"[The team, now designated SG-1 are planning th..."
2,3,S1,E3,Emancipation,5.9,2K,[While exploring a world populated by Mongol d...
3,4,S1,E4,The Broca Divide,7.0,1.7K,[The Stargate base is put in deadly peril when...
4,5,S1,E5,The First Commandment,6.6,1.7K,[The SG-1 team is sent after the SG-9 team tha...
...,...,...,...,...,...,...,...
208,209,S10,E16,Bad Guys,8.0,1.1K,"[When SG-1, minus Carter, investigates a previ..."
209,210,S10,E17,Talion,7.2,942,[Teal'c and Bra'tac are among the numerous vic...
210,211,S10,E18,Family Ties,6.6,1K,[Stargate Command gets a video message from Va...
211,212,S10,E19,Dominion,8.1,995,"[Vala gambles, wins a cargo ship, but is found..."


In [8]:
# Save the table as an Excel workbook (row index omitted) so it can be edited by hand
excel_path = 'Stargate_IMDB_Info.xlsx'
IMDB_df.to_excel(excel_path, index=False)