In [11]:
import time

import pandas as pd
import requests

from tools import DB  # Ensure tools.py is in the same folder as this notebook


We now define a helper function to categorize genres. It gives priority to "Anime" and compares genre names case-insensitively, so it works regardless of capitalization.

In [12]:
def determine_genre_tag(genres):
    """
    Collapse a show's genre list into a single tag.

    Args:
        genres: Sequence of genre name strings; may be empty or None.

    Returns:
        'Anime' when any entry equals 'anime' ignoring case; otherwise the
        first entry exactly as given; 'Unknown' when `genres` is empty or None.
    """
    if genres:
        # Normalise every entry once so the membership test ignores case.
        lowered = {name.lower() for name in genres}
        if "anime" in lowered:
            return "Anime"
        return genres[0]
    return "Unknown"


We now set up the number of pages we want to fetch from TVmaze,
initialize our database, and prepare a dictionary to store unique shows.

In [13]:
# Upper bound on how many TVmaze pages to request (a page holds roughly 250 shows)
NUM_PAGES = 100

# Open the project database; its data is saved under the 'data' directory
db = DB("data")

# Shows collected so far, kept in a dictionary so each show is stored only once
all_shows = dict()


Now we loop through the TVmaze API pages and collect data.
We check for duplicates and safely extract nested fields like network and country.

In [14]:
# Seconds to wait on a single request; requests has no default timeout, so
# without this a stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30
# How many times to retry one page after an HTTP 429 (rate limited) response
MAX_RATE_LIMIT_RETRIES = 5
# Seconds to pause before retrying a rate-limited request
RATE_LIMIT_BACKOFF = 5

# Loop through the defined number of pages
for page in range(NUM_PAGES):
    print(f"Fetching page {page} of popular shows...")

    # Request shows for the current page. A 429 means we were rate limited:
    # back off and retry instead of giving up and saving a truncated dataset.
    # Connection errors and timeouts are not caught and will propagate.
    for attempt in range(MAX_RATE_LIMIT_RETRIES + 1):
        response = requests.get(
            "https://api.tvmaze.com/shows",
            params={"page": page},
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code != 429 or attempt == MAX_RATE_LIMIT_RETRIES:
            break
        print(f"Rate limited on page {page}; retrying in {RATE_LIMIT_BACKOFF} seconds...")
        time.sleep(RATE_LIMIT_BACKOFF)

    # A 404 on the show index is the normal end-of-list signal, not a failure
    if response.status_code == 404:
        print(f"Reached the end of the TVmaze show index at page {page}.")
        break

    # Stop loop if the request fails for any other reason
    if response.status_code != 200:
        print(f"⚠️ Failed to fetch page {page}. Status code: {response.status_code}")
        break

    # Process each show in the response
    for show in response.json():
        show_id = show['id']
        if show_id in all_shows:
            continue  # Skip if we've already seen this show

        # `or {}` / `or []` cover both a missing key and an explicit JSON null,
        # so every nested lookup below can use .get() without raising.
        genres = show.get('genres') or []
        network_info = show.get('network') or {}
        country_info = network_info.get('country') or {}
        rating_info = show.get('rating') or {}
        image_info = show.get('image') or {}

        all_shows[show_id] = {
            'id': show_id,
            'name': show.get('name'),
            'network': network_info.get('name'),
            'genre_tag': determine_genre_tag(genres),
            'showType': show.get('type'),
            'country': country_info.get('name'),
            'language': show.get('language'),
            'premiered': show.get('premiered'),
            'runtime': show.get('runtime'),
            'rating': rating_info.get('average'),
            # Normalise an empty image URL to None, as before
            'image': image_info.get('medium') or None,
            'summary': show.get('summary')
        }


Fetching page 0 of popular shows...
Fetching page 1 of popular shows...
Fetching page 2 of popular shows...
Fetching page 3 of popular shows...
Fetching page 4 of popular shows...
Fetching page 5 of popular shows...
Fetching page 6 of popular shows...
Fetching page 7 of popular shows...
Fetching page 8 of popular shows...
Fetching page 9 of popular shows...
Fetching page 10 of popular shows...
Fetching page 11 of popular shows...
Fetching page 12 of popular shows...
Fetching page 13 of popular shows...
Fetching page 14 of popular shows...
Fetching page 15 of popular shows...
Fetching page 16 of popular shows...
Fetching page 17 of popular shows...
Fetching page 18 of popular shows...
Fetching page 19 of popular shows...
Fetching page 20 of popular shows...
Fetching page 21 of popular shows...
Fetching page 22 of popular shows...
Fetching page 23 of popular shows...
Fetching page 24 of popular shows...
Fetching page 25 of popular shows...
Fetching page 26 of popular shows...
Fetching pa

Finally, we convert the collected data into a DataFrame and save it into the database.

In [15]:
# Build one DataFrame row per collected show (the dictionary keys are dropped;
# each record already carries its own 'id' field)
show_records = list(all_shows.values())
df = pd.DataFrame(show_records)

# Hand the DataFrame to the database helper, targeting the "shows" table
db.load_from_dataframe(df, "shows")

# Report how many shows were loaded
row_count = len(df)
print(f"✅ Loaded {row_count} shows from TVmaze with all columns.")


✅ Loaded 23259 shows from TVmaze with all columns.
