# Star Wars Data Analysis

## Section 1: Setup imports and dataframes

### Setup imports and variables

In [1]:
# Uncomment the following line to install aiohttp, which is needed for the async calls.
# asyncio is part of the Python standard library and does not need to be installed.

#%pip install aiohttp

In [2]:
# Import necessary libraries for dataframes, HTTP requests, JSON, and charts

import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import asyncio
import aiohttp

%matplotlib inline

In [3]:
# Set the URLs for the APIs for each category SWAPI provides

# Every category endpoint lives directly under the same API root
SWAPI_BASE_URL = "https://swapi.dev/api"

films_url = f"{SWAPI_BASE_URL}/films/"
people_url = f"{SWAPI_BASE_URL}/people/"
planets_url = f"{SWAPI_BASE_URL}/planets/"
species_url = f"{SWAPI_BASE_URL}/species/"
starships_url = f"{SWAPI_BASE_URL}/starships/"
vehicles_url = f"{SWAPI_BASE_URL}/vehicles/"

### Gather Initial Data from API

In [4]:
# Retrieve the number of result pages each category exposes through the API

def retrieve_pages(url, page_size=10):
    """Return how many result pages the list endpoint at ``url`` reports.

    The endpoint is requested once and the ``"count"`` field of its JSON
    payload is divided by ``page_size``, rounding up.

    Parameters
    ----------
    url : str
        URL of the list endpoint to query.
    page_size : int, optional
        Number of records assumed per page (default 10).

    Returns
    -------
    int
        Number of pages needed to hold ``count`` records. When the payload
        carries no ``"count"`` field a single page is assumed.
    """
    response = requests.get(url)
    data = response.json()

    # Fall back to exactly one page worth of records when "count" is absent,
    # instead of hiding every possible error behind a bare ``except``.
    total_records = page_size
    if isinstance(data, dict):
        total_records = data.get("count", page_size)

    # Ceiling division: a partially filled last page still counts as a page
    full_pages, leftover = divmod(total_records, page_size)
    return full_pages + (1 if leftover > 0 else 0)

In [None]:
# Get the number of pages for each category

(
    films_pages,
    people_pages,
    planets_pages,
    species_pages,
    starships_pages,
    vehicles_pages,
) = [
    retrieve_pages(category_url)
    for category_url in (films_url, people_url, planets_url, species_url, starships_url, vehicles_url)
]

# Report the page count of every category, in the same order as the requests
for category, page_count in (
    ("films", films_pages),
    ("people", people_pages),
    ("planets", planets_pages),
    ("species", species_pages),
    ("starships", starships_pages),
    ("vehicles", vehicles_pages),
):
    print(f"The number of pages of {category} is {page_count}")

In [6]:
# Function to fetch data from a URL asynchronously, reducing response time by about 40% from synchronously

async def fetch(url, session=None):
    """Request ``url`` and return its decoded JSON body.

    Parameters
    ----------
    url : str
        Address to request.
    session : aiohttp.ClientSession, optional
        Session to send the request through. When omitted, a temporary
        session is opened for this single request and closed afterwards.

    Returns
    -------
    The value produced by ``response.json()``.
    """
    if session is None:
        # Stand-alone use: own the session for the lifetime of this one call
        async with aiohttp.ClientSession() as temporary_session:
            return await fetch(url, temporary_session)

    async with session.get(url) as response:
        return await response.json()


# Define function to call url based on number of pages and append JSON from results property
async def assemble_json(url, pages):
    """Download pages ``1..pages`` of ``url`` and concatenate their ``"results"`` lists.

    Parameters
    ----------
    url : str
        List endpoint; ``?page=<n>`` is appended for each page.
    pages : int
        Number of pages to request.

    Returns
    -------
    list
        All entries of every page's ``"results"`` list, in page order.
    """
    total_json = []
    print(f"Retrieving {pages} pages from {url}")
    # Share one session across all page requests instead of opening a new
    # session (and connection pool) for every single page.
    async with aiohttp.ClientSession() as session:
        for page in range(1, pages + 1):
            composed_url = f"{url}?page={page}"
            req_json = await fetch(composed_url, session)
            total_json.extend(req_json["results"])
    print(f"Found {len(total_json)} records at {url}")
    return total_json

In [None]:
# Call assemble_json with each SWAPI URL and number of pages

# Run the async tasks
results = await asyncio.gather(
    assemble_json(films_url, films_pages),
    assemble_json(people_url, people_pages),
    assemble_json(planets_url, planets_pages),
    assemble_json(species_url, species_pages),
    assemble_json(starships_url, starships_pages),
    assemble_json(vehicles_url, vehicles_pages),
)

film_data, people_data, planets_data, species_data, starships_data, vehicles_data = results


In [None]:
# Create dataframes for each category
# Need to explode the arrays in films_df to get the data in a usable format
films_df = pd.DataFrame(film_data)
people_df = pd.DataFrame(people_data)
planets_df = pd.DataFrame(planets_data)
species_df = pd.DataFrame(species_data)
starships_df = pd.DataFrame(starships_data)
vehicles_df = pd.DataFrame(vehicles_data)

# Tag every dataframe with the name of the category it holds
category_frames = {
    'films': films_df,
    'people': people_df,
    'planets': planets_df,
    'species': species_df,
    'starships': starships_df,
    'vehicles': vehicles_df,
}
for category_name, category_frame in category_frames.items():
    category_frame.attrs['data'] = category_name

# Display the first 5 rows of each dataframe
for category_frame in category_frames.values():
    display(category_frame.head())

### Data Cleaning and Preparation

In [None]:
# Clean up the gender column: keep "Male" / "Female" and change all other entries to "Non-Binary".
# The comparison is case-insensitive so that re-running this cell leaves the already
# capitalised "Male" / "Female" values untouched instead of turning them into "Non-Binary".
people_df['gender'] = (
    people_df['gender']
    .astype(str)
    .str.lower()
    .map({"male": "Male", "female": "Female"})  # anything else becomes NaN here ...
    .fillna("Non-Binary")                       # ... and is labelled "Non-Binary"
)
display(people_df.head())

In [None]:
# Convert the mass and height columns from object to int type.
numeric_columns = ["mass", "height"]

# Work on string copies so the thousands separators can be stripped
without_commas = people_df[numeric_columns].astype(str).apply(
    lambda column: column.str.replace(",", "")  # Remove commas (the jabba exception)
)

# Values that are not numbers become NaN, are then replaced by 0 and finally cast to int
people_df[numeric_columns] = (
    without_commas.apply(pd.to_numeric, errors="coerce").fillna(0).astype(int)
)
people_df.head()

In [None]:
# Some basic statistics on the mass and height columns
# (original note on the height minimum: index 27: Arvel Crynyd.)
for column in ("mass", "height"):
    column_values = people_df[column]
    print(f"The min of {column} column is {column_values.min()}")
    print(f"The mean of {column} column is {column_values.mean().round(2)}")
    print(f"The max of {column} column is {column_values.max()}")

In [None]:
# Function that categorizes character mass into ranges: '0-50', '51-100', 'Over 100'

def categorize_mass(mass):
    """Return the mass-range label for ``mass``.

    Parameters
    ----------
    mass : int or float
        Mass value to classify.

    Returns
    -------
    str
        ``"0-50"`` for values up to and including 50, ``"51-100"`` for values
        above 50 up to and including 100, ``"Over 100"`` otherwise.
    """
    if mass <= 50:
        return "0-50"
    # No lower bound here: a fractional value such as 50.5 must land in this
    # bucket rather than falling through to "Over 100".
    if mass <= 100:
        return "51-100"
    return "Over 100"

# Label every character with the mass range its mass falls into
people_df["mass range"] = [categorize_mass(mass) for mass in people_df["mass"]]

# Show the DataFrame including the new column
display(people_df)


In [None]:
# Clean up species data that is stored as arrays of URLs (some of them empty)

DROID_SPECIES_URL = "https://swapi.dev/api/species/2/"
HUMAN_SPECIES_URL = "https://swapi.dev/api/species/1/"


def _single_species_url(species):
    """Reduce one raw ``species`` cell to a single URL string.

    A string is returned unchanged (so the cell can be re-run), an empty
    array becomes the URL for the species "Human", and any other array
    yields its first URL.
    """
    if isinstance(species, str):
        return species
    return species[0] if len(species) > 0 else HUMAN_SPECIES_URL


# Species should not be an array, so keep exactly one URL per person.
# Taking the first element (instead of explode + assignment) cannot create
# duplicate index labels when a person lists more than one species.
people_df["species"] = people_df["species"].apply(_single_species_url)

# Replace the species entry for "R4-P17" with the URL for the species "Droid".
# Assigning the plain string works for any number of matching rows.
people_df.loc[people_df["name"] == "R4-P17", "species"] = DROID_SPECIES_URL

people_df.head()

In [None]:
# Clean up the classification column to convert reptilian to reptile and mammals to mammal.
# "sentient" is mapped to "reptile" as well.
# NOTE(review): presumably a manual correction for a specific species - confirm.
species_df["classification"] = species_df["classification"].replace(
    {"reptilian": "reptile", "sentient": "reptile", "mammals": "mammal"}
)

# Only the last expression of a cell is rendered automatically, so the counts
# have to be displayed explicitly to be visible.
display(species_df["classification"].value_counts())

species_df.head()

### Functions to Get Information for a Character Name

In [None]:
# Get all information from the other dataframes based on the person's homeworld, species, starships, and vehicles. 
# These columns are arrays of URLs that need to be exploded to get the data in a usable format.

# Get homeworld for a specific person from the planets_df
def get_homeworld(homeworld_url):
    """Return the ``name`` of the first row in ``planets_df`` whose ``url`` equals ``homeworld_url``.

    Raises ``IndexError`` when no row matches.
    """
    matching_names = planets_df.loc[planets_df["url"] == homeworld_url, "name"]
    return matching_names.values[0]

# Get species for a specific person from the species_df
def get_species(species_url):
    """Return the ``name`` of the first row in ``species_df`` whose ``url`` equals ``species_url``.

    Raises ``IndexError`` when no row matches.
    """
    matching_names = species_df.loc[species_df["url"] == species_url, "name"]
    return matching_names.values[0]

# Get list of starships for a specific person from the starships_df
def get_starships(starships_urls):
    """Return the starship names for ``starships_urls``, in the same order.

    Each URL is looked up in the ``url`` column of ``starships_df``; a URL
    without a matching row raises ``IndexError``.
    """
    return [
        starships_df.loc[starships_df["url"] == starship_url, "name"].values[0]
        for starship_url in starships_urls
    ]

# Get list of vehicles for a specific person from the vehicles_df
def get_vehicles(vehicles_urls):
    """Return the vehicle names for ``vehicles_urls``, in the same order.

    Each URL is looked up in the ``url`` column of ``vehicles_df``; a URL
    without a matching row raises ``IndexError``.
    """
    return [
        vehicles_df.loc[vehicles_df["url"] == vehicle_url, "name"].values[0]
        for vehicle_url in vehicles_urls
    ]

# Return information about a person based on what information is being requested
def get_person_info(person_name, information):
    """Look up one piece of information about the person called ``person_name``.

    Parameters
    ----------
    person_name : str
        Value to match against the ``name`` column of ``people_df``.
    information : str
        One of ``"homeworld"``, ``"species"``, ``"starships"`` or ``"vehicles"``.

    Returns
    -------
    The result of the matching ``get_*`` helper, or the string
    ``"Invalid information requested."`` for any other ``information`` value.
    """
    person_df = people_df[people_df["name"] == person_name]

    # Each supported request reads the people_df column of the same name
    # and hands its first value to the matching lookup helper.
    resolvers = {
        "homeworld": get_homeworld,
        "species": get_species,
        "starships": get_starships,
        "vehicles": get_vehicles,
    }
    resolver = resolvers.get(information)
    if resolver is None:
        return "Invalid information requested."
    return resolver(person_df[information].values[0])
    
# TEST
# Apply the functions to the people_df to get the homeworld, species, starships, and vehicles for a specific person
# This person's name is stored in the person variable
person = "Chewbacca"
homeworld, starships, vehicles, species = (
    get_person_info(person, requested)
    for requested in ("homeworld", "starships", "vehicles", "species")
)

for sentence in (
    f"{person} is from {homeworld}.",
    f"{person} has piloted the following starships: {starships}",
    f"{person} has driven the following vehicles: {vehicles}",
    f"{person} is a {species}.",
):
    print(sentence)



## Section 2: Filter Character List, Get Random 3 Characters, Ask User to Pick One Character

In [None]:
import ipywidgets as widgets
from IPython.display import display
# Ensure dataframes exist before running
if not all(frame_name in globals() for frame_name in ('people_df', 'species_df')):
    raise ValueError("Ensure 'people_df' and 'species_df' are loaded before running this script.")

# Map species URLs to names for filtering
species_mapping = {
    species_url_key: species_name
    for species_url_key, species_name in zip(species_df['url'], species_df['name'])
}
people_df['species_name'] = people_df['species'].map(species_mapping)

# Dropdown option functions
def get_gender_options():
    """Return the placeholder followed by the sorted distinct ``gender`` values of ``people_df``."""
    genders = people_df['gender'].dropna().astype(str)
    return ["Select An Option", *sorted(set(genders))]

def get_species_options():
    """Return the placeholder followed by the sorted distinct ``name`` values of ``species_df``."""
    species_names = species_df['name'].dropna().astype(str)
    return ["Select An Option", *sorted(set(species_names))]

def get_mass_ranges():
    """Return the placeholder followed by the three mass-range labels."""
    mass_labels = ("0-50", "51-100", "Over 100")
    return ["Select An Option", *mass_labels]

# Create Dropdowns
# Every filter dropdown starts on the placeholder entry and is enabled
dropdown_defaults = {'value': "Select An Option", 'disabled': False}

gender_dropdown = widgets.Dropdown(
    options=get_gender_options(), description='Gender:', **dropdown_defaults
)
species_dropdown = widgets.Dropdown(
    options=get_species_options(), description='Species:', **dropdown_defaults
)
mass_dropdown = widgets.Dropdown(
    options=get_mass_ranges(), description='Mass Range:', **dropdown_defaults
)

# Function to find matching characters
def select_random_characters(_):
    """Button callback: offer up to three random characters matching the chosen filters.

    Reads the gender, species and mass-range dropdowns, filters ``people_df``
    by every filter that is not left on the placeholder, samples at most
    three matching rows and loads their names into ``character_dropdown``.
    Status and error messages are written to ``output``.

    Parameters
    ----------
    _ : object
        The widget that triggered the callback; unused.
    """
    placeholder = "Select An Option"
    selected_gender = gender_dropdown.value
    selected_species = species_dropdown.value
    selected_mass_range = mass_dropdown.value

    if all(choice == placeholder for choice in (selected_gender, selected_species, selected_mass_range)):
        output.value = "Please select at least one option from the dropdowns above."
        return

    # Without a name column there is nothing that could be offered to the user
    if 'name' not in people_df.columns:
        output.value = "⚠️ No valid character names found in the dataset. Ensure 'name' exists in people_df."
        return

    filter_conditions = []
    if selected_gender != placeholder:
        filter_conditions.append(people_df['gender'].str.lower() == selected_gender.lower())
    if selected_species != placeholder:
        filter_conditions.append(people_df['species_name'].str.lower() == selected_species.lower())
    if selected_mass_range != placeholder:
        if selected_mass_range == "Over 100":
            filter_conditions.append(people_df['mass'] > 100)
        else:
            try:
                mass_low, mass_high = map(int, selected_mass_range.split('-'))
            except ValueError:
                # Stop here so the message is not overwritten further down
                output.value = f"Invalid mass range: {selected_mass_range}"
                return
            filter_conditions.append(people_df['mass'].between(mass_low, mass_high))

    # At least one condition exists at this point: the all-placeholder case returned above
    filtered_people = people_df[np.logical_and.reduce(filter_conditions)]

    if filtered_people.empty:
        # Drop characters left over from a previous search
        character_dropdown.options = [placeholder]
        output.value = "❌ No matching characters found."
        return

    # Sample once; fewer than three matches are all returned
    selected_characters = filtered_people.sample(n=min(3, len(filtered_people)), replace=False)
    character_dropdown.options = [placeholder] + sorted(selected_characters['name'].tolist())
    character_dropdown.value = placeholder
    output.value = "🗡️ Choose a character from the dropdown below!"
# Buttons and output fields
# The keyword is ``button_style``; the previous spelling ``button_stype`` is
# not a Button trait, so the primary style was never applied.
select_button = widgets.Button(description="Find 3 Random Characters", button_style='primary')
select_button.on_click(select_random_characters)

reset_button = widgets.Button(description="Reset", button_style='warning')

def reset_selection(_):
    """Button callback: put the filter dropdowns, character dropdown and text fields back to a clean state.

    Parameters
    ----------
    _ : object
        The widget that triggered the callback; unused.
    """
    placeholder = "Select An Option"
    gender_dropdown.value = placeholder
    species_dropdown.value = placeholder
    mass_dropdown.value = placeholder
    character_dropdown.options = [placeholder]
    character_dropdown.value = placeholder
    # The character dropdown was just emptied, so prompt for a new search
    # rather than asking the user to choose a character from it.
    output.value = "Select options and press the button!"
    final_output.value = ""

reset_button.on_click(reset_selection)

# Static instruction shown above the filter dropdowns
instruction_text = widgets.HTML(
    value='<b>Please select at least one option from the dropdowns below to generate characters.</b>'
)

# Status area for messages produced by the button callbacks
output_layout = {'width': '100%', 'height': '100px'}
output = widgets.Textarea(value="Select options and press the button!", layout=output_layout)

# Starts with only the placeholder; the search callback fills in the characters
character_choices = ["Select An Option"]
character_dropdown = widgets.Dropdown(
    options=character_choices,
    value=character_choices[0],
    description="Choose:",
    disabled=False,
)

def final_character_selection(change):
    """Observer callback: show the newly chosen character in ``final_output``.

    ``change`` is the change dictionary passed by the observed widget; its
    ``'new'`` entry holds the selected value. The placeholder entry is ignored.
    """
    selected_character = change['new']
    if selected_character == "Select An Option":
        return
    final_output.value = f"🎉 You have chosen: {selected_character}!"

# React whenever the selected value of the character dropdown changes
character_dropdown.observe(final_character_selection, names='value')
final_output = widgets.Textarea(
    value="Your chosen character will appear here!",
    layout={'width': '100%', 'height': '50px'},
)

# Render the whole interface from top to bottom
interface_widgets = [
    instruction_text,
    gender_dropdown,
    species_dropdown,
    mass_dropdown,
    select_button,
    reset_button,
    output,
    character_dropdown,
    final_output,
]
display(*interface_widgets)

## Section 3: Display Selected Visualizations

In [None]:
# Function to plot the graph and highlight a specific character


def plot_character_height(people_df, highlight_character=None):
    """Draw a bar chart of character heights, ordered from shortest to tallest.

    Parameters
    ----------
    people_df : pandas.DataFrame
        Frame with ``name`` and ``height`` columns. Its ``height`` column is
        converted to numeric in place (unparseable values become NaN).
    highlight_character : str, optional
        Name whose bar is drawn in orange and labelled with its height.
    """
    # The conversion is written back to the frame that was passed in
    people_df['height'] = pd.to_numeric(people_df['height'], errors='coerce')

    # Rows without a usable height are left out; the rest is ordered by height
    by_height = people_df.dropna(subset=['height']).sort_values(by='height')
    names = by_height['name'].tolist()

    # Orange for the highlighted character, sky blue for everyone else
    bar_colors = ['orange' if name == highlight_character else 'skyblue' for name in names]

    plt.figure(figsize=(12, 6))
    plt.bar(by_height['name'], by_height['height'], color=bar_colors)
    plt.xticks(rotation=90)  # Vertical labels keep the names readable
    plt.title('Star Wars Characters: Height from Shortest to Tallest', fontsize=16)
    plt.xlabel('Character', fontsize=14)
    plt.ylabel('Height (cm)', fontsize=14)

    # Label the highlighted bar, if that character is on the chart
    if highlight_character and highlight_character in names:
        position = names.index(highlight_character)
        char_height = by_height['height'].values[position]
        plt.text(
            position,
            char_height + 5,  # Slightly above the top of the bar
            f"{highlight_character}: {char_height} cm",
            ha='center', color='orange', fontsize=10
        )

    plt.tight_layout()
    plt.show()

# Example Usage: Highlight Luke Skywalker
plot_character_height(people_df, highlight_character='Luke Skywalker')

In [None]:
def plot_mass_by_character(people_df, highlight_character=None):
    """Draw a bar chart of character mass with the bars ordered by ascending height.

    Parameters
    ----------
    people_df : pandas.DataFrame
        Frame with ``name``, ``mass`` and ``height`` columns. Both measurement
        columns are converted to numeric in place (unparseable values become NaN).
    highlight_character : str, optional
        Name whose bar is drawn in orange and labelled with its mass.
    """
    # The conversions are written back to the frame that was passed in
    for measurement in ('mass', 'height'):
        people_df[measurement] = pd.to_numeric(people_df[measurement], errors='coerce')

    # Keep rows that have both measurements and order them by height
    by_height = (
        people_df
        .dropna(subset=['mass', 'height'])
        .sort_values(by='height', ascending=True)
    )
    names = by_height['name'].tolist()

    # Sky blue for everyone except the highlighted character
    bar_colors = ['skyblue' if name != highlight_character else 'orange' for name in names]

    plt.figure(figsize=(14, 6))
    plt.bar(by_height['name'], by_height['mass'], color=bar_colors)

    # Axis decoration
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('Character', fontsize=14)
    plt.ylabel('Mass (kg)', fontsize=14)
    plt.title('Mass of Star Wars Characters by Height', fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Label the highlighted bar, if that character is on the chart
    if highlight_character in names:
        position = names.index(highlight_character)
        highlighted_mass = by_height['mass'].values[position]
        plt.text(
            position,
            highlighted_mass + 5,
            f"{highlight_character}: {highlighted_mass} kg",
            ha='center', color='orange', fontsize=10
        )

    plt.tight_layout()
    plt.show()

# Example Usage: Highlight "Luke Skywalker"
plot_mass_by_character(people_df, highlight_character='Luke Skywalker')

In [None]:
def plot_surface_water_by_planet(planets_df, highlight_planet=None):
    """Draw a bar chart of surface-water percentage per planet, highest first.

    Parameters
    ----------
    planets_df : pandas.DataFrame
        Frame with ``name``, ``diameter`` and ``surface_water`` columns. Both
        numeric columns are converted in place (unparseable values become NaN).
    highlight_planet : str, optional
        Name whose bar is drawn in orange and labelled with the planet name.
    """
    # The conversions are written back to the frame that was passed in
    for column in ('diameter', 'surface_water'):
        planets_df[column] = pd.to_numeric(planets_df[column], errors='coerce')

    # Planets lacking either value are left out; the rest is ordered by surface water
    by_water = (
        planets_df
        .dropna(subset=['diameter', 'surface_water'])
        .sort_values(by='surface_water', ascending=False)
    )
    planet_names = by_water['name'].tolist()

    # Sky blue for every planet except the highlighted one
    bar_colors = ['skyblue' if name != highlight_planet else 'orange' for name in planet_names]

    plt.figure(figsize=(14, 6))
    bars = plt.bar(by_water['name'], by_water['surface_water'], color=bar_colors)

    # Axis decoration
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('Planet', fontsize=14)
    plt.ylabel('Surface Water (%)', fontsize=14)
    plt.title('Surface Water Percentage on Star Wars Planets', fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Write the percentage just above every bar
    for bar, water_percentage in zip(bars, by_water['surface_water']):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 1,
            f"{water_percentage}%",
            ha='center', fontsize=10, color='black'
        )

    # Name the highlighted planet a little higher up (name only, no percentage)
    if highlight_planet in planet_names:
        highlight_index = planet_names.index(highlight_planet)
        plt.text(
            highlight_index,
            by_water['surface_water'].values[highlight_index] + 3,
            f"{highlight_planet}",
            ha='center', color='red', fontsize=11, fontweight='bold'
        )

    plt.tight_layout()
    plt.show()

# Example Usage: Highlight "Tatooine"
plot_surface_water_by_planet(planets_df, highlight_planet='Tatooine')

In [None]:
def plot_population_density_by_planet(planets_df, highlight_planet=None):
    """Draw a bar chart of population density per planet, highest first.

    A function with the same name is defined again in a later cell of this
    notebook; whichever definition ran last is the one in effect.

    Parameters
    ----------
    planets_df : pandas.DataFrame
        Frame with ``name``, ``diameter`` and ``population`` columns. Both
        numeric columns are converted in place and a ``population_density``
        column is added to the frame.
    highlight_planet : str, optional
        Name whose bar is drawn in orange and labelled with the planet name.
    """
    # The conversions are written back to the frame that was passed in
    for column in ('diameter', 'population'):
        planets_df[column] = pd.to_numeric(planets_df[column], errors='coerce')

    # Density = population divided by the area of a circle with the planet's diameter.
    # NOTE(review): this is the cross-section, not the sphere surface (4 * pi * r^2) - confirm intent.
    radius = planets_df['diameter'] / 2
    circle_area = radius ** 2 * 3.14159
    density = planets_df['population'] / circle_area
    planets_df['population_density'] = density.replace([np.inf, -np.inf], np.nan)  # Infinite values count as missing

    # Planets lacking a diameter or a density are left out; the rest is ordered by density
    by_density = (
        planets_df
        .dropna(subset=['diameter', 'population_density'])
        .sort_values(by='population_density', ascending=False)
    )
    planet_names = by_density['name'].tolist()

    # Sky blue for every planet except the highlighted one
    bar_colors = ['skyblue' if name != highlight_planet else 'orange' for name in planet_names]

    plt.figure(figsize=(14, 6))
    bars = plt.bar(by_density['name'], by_density['population_density'], color=bar_colors)

    # Axis decoration
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('Planet', fontsize=14)
    plt.ylabel('Population Density (per km²)', fontsize=14)
    plt.title('Population Density of Star Wars Planets', fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Write the density just above every bar
    for bar, pop_density in zip(bars, by_density['population_density']):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 1,
            f"{pop_density:.2f}",
            ha='center', fontsize=10, color='black'
        )

    # Name the highlighted planet a little higher up (name only)
    if highlight_planet in planet_names:
        highlight_index = planet_names.index(highlight_planet)
        plt.text(
            highlight_index,
            by_density['population_density'].values[highlight_index] + 3,
            f"{highlight_planet}",
            ha='center', color='red', fontsize=11, fontweight='bold'
        )

    plt.tight_layout()
    plt.show()

# Example Usage: Highlight "Coruscant"
plot_population_density_by_planet(planets_df, highlight_planet='Coruscant')

In [None]:

def plot_population_density_by_planet(planets_df, highlight_planet=None):
    """Draw a log10-scaled bar chart of population density per planet, highest first.

    This definition replaces the linear-scale function of the same name that
    an earlier cell of this notebook defines.

    Parameters
    ----------
    planets_df : pandas.DataFrame
        Frame with ``name``, ``diameter`` and ``population`` columns. Both
        numeric columns are converted in place and a ``population_density``
        column is added to the frame.
    highlight_planet : str, optional
        Name whose bar is drawn in orange and labelled with the planet name.
    """
    # Convert 'diameter' and 'population' columns to numeric; unparseable values become NaN
    planets_df['diameter'] = pd.to_numeric(planets_df['diameter'], errors='coerce')
    planets_df['population'] = pd.to_numeric(planets_df['population'], errors='coerce')

    # Density = population divided by the area of a circle with the planet's diameter.
    # NOTE(review): this is the cross-section, not the sphere surface (4 * pi * r^2) - confirm intent.
    planets_df['population_density'] = planets_df['population'] / ((planets_df['diameter'] / 2) ** 2 * 3.14159)
    planets_df['population_density'] = planets_df['population_density'].replace([np.inf, -np.inf], np.nan)  # Handle infinite values

    # Drop rows with missing or invalid values, then add the log10 column.
    # ``assign`` returns a new frame, so no column is ever set on the result
    # of ``dropna`` (which is what triggers pandas' SettingWithCopyWarning).
    planets_df_cleaned = (
        planets_df
        .dropna(subset=['diameter', 'population_density'])
        .assign(log_population_density=lambda frame: np.log10(frame['population_density'] + 1))  # +1 avoids log(0)
    )

    # Sort planets by transformed density for better visualization
    planets_df_sorted = planets_df_cleaned.sort_values(by='log_population_density', ascending=False)
    planet_names = planets_df_sorted['name'].tolist()

    # Assign colors: highlight one planet differently
    colors = ['orange' if name == highlight_planet else 'skyblue' for name in planet_names]

    # Create a bar chart
    plt.figure(figsize=(14, 6))
    bars = plt.bar(planets_df_sorted['name'], planets_df_sorted['log_population_density'], color=colors)

    # Add labels and title
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('Planet', fontsize=14)
    plt.ylabel('Log10(Population Density)', fontsize=14)
    plt.title('Log-Scaled Population Density of Star Wars Planets', fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate each bar with the actual population density (not log-scaled)
    for bar, pop_density in zip(bars, planets_df_sorted['population_density']):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.1,  # Offset to prevent overlap
            f"{pop_density:.2f}",
            ha='center', fontsize=10, color='black'
        )

    # Highlighted planet annotation
    if highlight_planet in planet_names:
        highlight_index = planet_names.index(highlight_planet)
        plt.text(
            highlight_index,
            planets_df_sorted['log_population_density'].values[highlight_index] + 0.2,
            f"{highlight_planet}",
            ha='center', color='red', fontsize=11, fontweight='bold'
        )

    plt.tight_layout()
    plt.show()

# Example Usage: Highlight "Coruscant"
plot_population_density_by_planet(planets_df, highlight_planet='Coruscant')


In [None]:
def plot_lifespan_by_species(species_df, highlight_species=None):
    """Draw a bar chart of average lifespan per species, longest first.

    A function with the same name is defined again in a later cell of this
    notebook; whichever definition ran last is the one in effect.

    Parameters
    ----------
    species_df : pandas.DataFrame
        Frame with ``name`` and ``average_lifespan`` columns. The lifespan
        column is converted to numeric in place and its missing values are
        replaced by the column median.
    highlight_species : str, optional
        Name whose bar is drawn in orange and labelled with its lifespan.
    """
    # Convert 'average_lifespan' to numeric and fill the gaps with the median.
    # The filled column is assigned back explicitly: ``fillna(inplace=True)``
    # on a column selection is chained assignment, which pandas deprecates and
    # which has no effect on the frame under copy-on-write.
    lifespan = pd.to_numeric(species_df['average_lifespan'], errors='coerce')
    species_df['average_lifespan'] = lifespan.fillna(lifespan.median())

    # Remove negative or zero lifespan values
    species_df_cleaned = species_df[species_df['average_lifespan'] > 0]

    # Sort species by lifespan
    species_df_sorted = species_df_cleaned.sort_values(by='average_lifespan', ascending=False)
    species_names = species_df_sorted['name'].tolist()

    # Report how many species made it onto the chart
    print(f"Total species displayed: {len(species_df_sorted)}")

    # Set color for highlighted species
    colors = ['orange' if name == highlight_species else 'skyblue' for name in species_names]

    # Create a bar chart
    plt.figure(figsize=(20, 8))
    plt.bar(species_df_sorted['name'], species_df_sorted['average_lifespan'], color=colors)
    plt.xticks(rotation=90, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('Species', fontsize=14)
    plt.ylabel('Average Lifespan (years)', fontsize=14)
    plt.title('Average Lifespan of Star Wars Species', fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate highlighted species
    if highlight_species in species_names:
        highlight_index = species_names.index(highlight_species)
        highlight_lifespan = species_df_sorted['average_lifespan'].values[highlight_index]
        plt.text(
            highlight_index,
            highlight_lifespan + 5,
            f"{highlight_species}: {highlight_lifespan} years",
            ha='center', color='orange', fontsize=10
        )

    plt.tight_layout()
    plt.show()

# Example Usage
plot_lifespan_by_species(species_df, highlight_species='Nautolan')

In [None]:
def plot_lifespan_by_species(species_df, highlight_species=None):
    """Draw a bar chart of average lifespan per species, each bar labelled with the average height.

    This definition replaces the function of the same name that an earlier
    cell of this notebook defines.

    Parameters
    ----------
    species_df : pandas.DataFrame
        Frame with ``name``, ``average_lifespan`` and ``average_height``
        columns. Both numeric columns are converted in place (unparseable
        values become NaN).
    highlight_species : str, optional
        Name whose bar is drawn in orange and labelled with the species name.
    """
    # The conversions are written back to the frame that was passed in
    for column in ('average_lifespan', 'average_height'):
        species_df[column] = pd.to_numeric(species_df[column], errors='coerce')

    # Species lacking either value are left out; the rest is ordered by lifespan
    by_lifespan = (
        species_df
        .dropna(subset=['average_lifespan', 'average_height'])
        .sort_values(by='average_lifespan', ascending=False)
    )
    species_names = by_lifespan['name'].tolist()

    # Sky blue for every species except the highlighted one
    bar_colors = ['skyblue' if name != highlight_species else 'orange' for name in species_names]

    plt.figure(figsize=(18, 6))
    bars = plt.bar(by_lifespan['name'], by_lifespan['average_lifespan'], color=bar_colors)

    # Axis decoration
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('Species', fontsize=14)
    plt.ylabel('Average Lifespan (years)', fontsize=14)
    plt.title('Average Lifespan of Star Wars Species (with Heights)', fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Write the average height above every bar
    for bar, height in zip(bars, by_lifespan['average_height']):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 5,  # Above the top of the bar
            f"{height} cm",
            ha='center', fontsize=6, color='black'
        )

    # Name the highlighted species a little higher up (name only)
    if highlight_species in species_names:
        highlight_index = species_names.index(highlight_species)
        plt.text(
            highlight_index,
            by_lifespan['average_lifespan'].values[highlight_index] + 10,
            f"{highlight_species}",
            ha='center', color='red', fontsize=11, fontweight='bold'
        )

    plt.tight_layout()
    plt.show()

# Example Usage: Highlight "Human"
plot_lifespan_by_species(species_df, highlight_species='Human')

In [None]:
def plot_log_cost_by_starship(starships_df, highlight_starship=None):
    """Draw a log10-scaled bar chart of starship cost, most expensive first.

    Every bar is labelled with the actual cost and the maximum atmosphering speed.

    Parameters
    ----------
    starships_df : pandas.DataFrame
        Frame with ``name``, ``cost_in_credits`` and ``max_atmosphering_speed``
        columns. Both numeric columns are converted in place (unparseable
        values become NaN).
    highlight_starship : str, optional
        Name whose bar is drawn in orange and labelled with the starship name.
    """
    # Convert 'cost_in_credits' and 'max_atmosphering_speed' to numeric; unparseable values become NaN
    starships_df['cost_in_credits'] = pd.to_numeric(starships_df['cost_in_credits'], errors='coerce')
    starships_df['max_atmosphering_speed'] = pd.to_numeric(starships_df['max_atmosphering_speed'], errors='coerce')

    # Drop rows with missing or invalid values, then add the log10 column.
    # ``assign`` returns a new frame, so no column is ever set on the result
    # of ``dropna`` (which is what triggers pandas' SettingWithCopyWarning).
    starships_df_cleaned = (
        starships_df
        .dropna(subset=['cost_in_credits', 'max_atmosphering_speed'])
        .assign(log_cost=lambda frame: np.log10(frame['cost_in_credits'] + 1))  # +1 avoids log(0)
    )

    # Sort starships by transformed cost for better visualization
    starships_df_sorted = starships_df_cleaned.sort_values(by='log_cost', ascending=False)
    starship_names = starships_df_sorted['name'].tolist()

    # Assign colors: highlight one starship differently
    colors = ['orange' if name == highlight_starship else 'skyblue' for name in starship_names]

    # Create a bar chart
    plt.figure(figsize=(20, 6))
    bars = plt.bar(starships_df_sorted['name'], starships_df_sorted['log_cost'], color=colors)

    # Add labels and title
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('Starship', fontsize=14)
    plt.ylabel('Log10(Cost in Credits)', fontsize=14)
    plt.title('Log-Scaled Starship Cost in Credits (with Max Speed)', fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate each bar with actual cost (not log-scaled) and speed
    for bar, cost, speed in zip(
        bars,
        starships_df_sorted['cost_in_credits'],
        starships_df_sorted['max_atmosphering_speed'],
    ):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.5,  # Offset to prevent overlap
            f"{cost:,} credits\n{speed} speed",
            ha='center', fontsize=6, color='black'
        )

    # Highlighted starship annotation (only name)
    if highlight_starship in starship_names:
        highlight_index = starship_names.index(highlight_starship)
        plt.text(
            highlight_index,
            starships_df_sorted['log_cost'].values[highlight_index] + 0.2,
            f"{highlight_starship}",
            ha='center', color='red', fontsize=11, fontweight='bold'
        )

    plt.tight_layout()
    plt.show()

# Example Usage: Highlight "Millennium Falcon"
plot_log_cost_by_starship(starships_df, highlight_starship='Millennium Falcon')

In [None]:

def plot_log_passenger_capacity_by_starship(starships_df, highlight_starship=None):
    """
    Plot log10(passengers + 1) for each starship, largest value first.

    The input DataFrame is not modified: numeric conversion, filtering and the
    log column are all applied to a private copy of the three columns used here.

    Parameters:
        starships_df (DataFrame): Must contain 'name', 'passengers' and 'length'.
            Entries of 'passengers' / 'length' that pd.to_numeric cannot parse
            become NaN, and rows with a NaN in either column are left out.
        highlight_starship (str, optional): Name of the starship to colour orange
            and label in red. Ignored when no plotted row has that name.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Copy only the columns we need so the caller's frame keeps its raw values
    # (writing the converted columns back into starships_df would silently change
    # what every later cell sees).
    ships = starships_df[['name', 'passengers', 'length']].copy()
    ships['passengers'] = pd.to_numeric(ships['passengers'], errors='coerce')
    ships['length'] = pd.to_numeric(ships['length'], errors='coerce')

    # Drop rows with missing or invalid values
    ships = ships.dropna(subset=['passengers', 'length'])

    # assign() returns a new frame, so no column is set on a slice of another frame.
    # The +1 keeps log10 defined for a capacity of 0.
    ships = ships.assign(log_passengers=np.log10(ships['passengers'] + 1))

    # Sort starships by transformed passenger capacity for better visualization
    starships_df_sorted = ships.sort_values(by='log_passengers', ascending=False)
    names = starships_df_sorted['name'].tolist()
    log_values = starships_df_sorted['log_passengers'].to_numpy()

    # Assign colors: highlight one starship differently
    colors = ['orange' if name == highlight_starship else 'skyblue' for name in names]

    # Create a bar chart
    plt.figure(figsize=(20, 6))
    bars = plt.bar(names, log_values, color=colors)

    # Add labels and title
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel('Starship', fontsize=14)
    plt.ylabel('Log10(Passenger Capacity)', fontsize=14)
    plt.title('Log-Scaled Passenger Capacity of Star Wars Starships (with Length)', fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate each bar with the actual (not log-scaled) passenger capacity & length
    for bar, passengers, length in zip(
        bars, starships_df_sorted['passengers'], starships_df_sorted['length']
    ):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.1,  # Offset to prevent overlap
            f"{passengers:,} passengers\n{length} m",
            ha='center', fontsize=5, color='black'
        )

    # Highlighted starship annotation (only name).
    # The row position is used as the x coordinate; this relies on matplotlib placing
    # the categorical bars at x = 0, 1, 2, ... in the order the names were passed.
    if highlight_starship in names:
        highlight_index = names.index(highlight_starship)
        plt.text(
            highlight_index,
            log_values[highlight_index] + 0.2,
            f"{highlight_starship}",
            ha='center', color='red', fontsize=11, fontweight='bold'
        )

    plt.tight_layout()
    plt.show()

# Example Usage: Highlight "Star Destroyer"
# The bar is only highlighted if a row named exactly 'Star Destroyer' is still present
# after the function drops rows whose 'passengers' or 'length' is not numeric.
plot_log_passenger_capacity_by_starship(starships_df, highlight_starship='Star Destroyer')


In [None]:
# Calculate the number of films for each character
# Adds a 'film_count' column to people_df in place; plot_film_count reads that column.
# Assumes every 'films' entry is a sized collection, since len() is applied to each — TODO confirm
people_df['film_count'] = people_df['films'].apply(len)

def plot_film_count(people_df, highlight_character=None):
    """
    Bar chart of the number of films per character, most films first.

    Parameters:
        people_df (DataFrame): Must contain a 'name' column and a numeric
            'film_count' column (this function does not compute it).
        highlight_character (str, optional): Character whose bar is coloured
            orange and labelled with its film count. Ignored when no row has
            that name.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Sort by film count for a cleaner chart (returns a new frame; input is untouched)
    people_df_sorted = people_df.sort_values(by='film_count', ascending=False)
    names = people_df_sorted['name']
    film_counts = people_df_sorted['film_count']

    # One integer slot per bar; combined with width=0.6 this leaves a gap between bars
    x_positions = np.arange(len(people_df_sorted))

    # Assign colors: highlight one character differently
    colors = ['orange' if name == highlight_character else 'skyblue' for name in names]

    # Create the bar chart with adjusted width and spacing
    plt.figure(figsize=(18, 6))
    plt.bar(x_positions, film_counts, color=colors, width=0.6)
    plt.xticks(x_positions, names, rotation=45, ha='right')
    plt.title('Number of Films per Character', fontsize=16)
    plt.xlabel('Character', fontsize=14)
    plt.ylabel('Number of Films', fontsize=14)

    # Annotate highlighted character.
    # The bar is located by row position rather than by index label: a label lookup
    # picks the wrong bar whenever the DataFrame index contains repeated labels.
    bar_slot = next(
        (slot for slot, name in enumerate(names) if name == highlight_character),
        None,
    )
    if bar_slot is not None:
        films = film_counts.iloc[bar_slot]
        plt.text(
            x_positions[bar_slot],
            films + 0.2,
            f"{highlight_character}: {films} films",
            ha='center',
            fontsize=10,
            color='orange'
        )

    # Tick font sizes and a light horizontal grid for readability
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    plt.show()

# Example Usage: Highlight "Luke Skywalker"
# Requires the 'film_count' column added to people_df at the top of this cell.
plot_film_count(people_df, highlight_character='Luke Skywalker')

In [None]:
# Calculate the number of films for each planet
# Adds a 'film_count' column to planets_df in place; plot_film_count_by_planet reads that column.
# Assumes every 'films' entry is a sized collection, since len() is applied to each — TODO confirm
planets_df['film_count'] = planets_df['films'].apply(len)

def plot_film_count_by_planet(planets_df, highlight_planet=None):
    """
    Bar chart of the number of films per planet, most films first.

    Parameters:
        planets_df (DataFrame): Must contain a 'name' column and a numeric
            'film_count' column (this function does not compute it).
        highlight_planet (str, optional): Planet whose bar is coloured orange
            and labelled with its film count. Ignored when no row has that name.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Sort by film count for a cleaner chart (returns a new frame; input is untouched)
    planets_df_sorted = planets_df.sort_values(by='film_count', ascending=False)
    names = planets_df_sorted['name']
    film_counts = planets_df_sorted['film_count']

    # One integer slot per bar; combined with width=0.6 this leaves a gap between bars
    x_positions = np.arange(len(planets_df_sorted))

    # Assign colors: highlight one planet differently
    colors = ['orange' if name == highlight_planet else 'skyblue' for name in names]

    # Create the bar chart with adjusted width and spacing
    plt.figure(figsize=(18, 6))
    plt.bar(x_positions, film_counts, color=colors, width=0.6)
    plt.xticks(x_positions, names, rotation=45, ha='right')
    plt.title('Number of Films per Planet', fontsize=16)
    plt.xlabel('Planet', fontsize=14)
    plt.ylabel('Number of Films', fontsize=14)

    # Annotate highlighted planet.
    # The bar is located by row position rather than by index label: a label lookup
    # picks the wrong bar whenever the DataFrame index contains repeated labels.
    bar_slot = next(
        (slot for slot, name in enumerate(names) if name == highlight_planet),
        None,
    )
    if bar_slot is not None:
        films = film_counts.iloc[bar_slot]
        plt.text(
            x_positions[bar_slot],
            films + 0.2,
            f"{highlight_planet}: {films} films",
            ha='center',
            fontsize=10,
            color='orange'
        )

    # Tick font sizes and a light horizontal grid for readability
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    plt.show()

# Example Usage: Highlight "Tatooine"
# Requires the 'film_count' column added to planets_df at the top of this cell.
plot_film_count_by_planet(planets_df, highlight_planet='Tatooine')

In [None]:
# Calculate the number of films for each species
# Adds a 'film_count' column to species_df in place; plot_film_count_by_species reads that column.
# Assumes every 'films' entry is a sized collection, since len() is applied to each — TODO confirm
species_df['film_count'] = species_df['films'].apply(len)

def plot_film_count_by_species(species_df, highlight_species=None):
    """
    Bar chart of the number of films per species, most films first.

    Parameters:
        species_df (DataFrame): Must contain a 'name' column and a numeric
            'film_count' column (this function does not compute it).
        highlight_species (str, optional): Species whose bar is coloured orange
            and labelled with its film count. Ignored when no row has that name.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Sort by film count for a cleaner chart (returns a new frame; input is untouched)
    species_df_sorted = species_df.sort_values(by='film_count', ascending=False)
    names = species_df_sorted['name']
    film_counts = species_df_sorted['film_count']

    # One integer slot per bar; combined with width=0.6 this leaves a gap between bars
    x_positions = np.arange(len(species_df_sorted))

    # Assign colors: highlight one species differently
    colors = ['orange' if name == highlight_species else 'skyblue' for name in names]

    # Create the bar chart with adjusted width and spacing
    plt.figure(figsize=(18, 6))
    plt.bar(x_positions, film_counts, color=colors, width=0.6)
    plt.xticks(x_positions, names, rotation=45, ha='right')
    plt.title('Number of Films per Species', fontsize=16)
    plt.xlabel('Species', fontsize=14)
    plt.ylabel('Number of Films', fontsize=14)

    # Annotate highlighted species.
    # The bar is located by row position rather than by index label: a label lookup
    # picks the wrong bar whenever the DataFrame index contains repeated labels.
    bar_slot = next(
        (slot for slot, name in enumerate(names) if name == highlight_species),
        None,
    )
    if bar_slot is not None:
        films = film_counts.iloc[bar_slot]
        plt.text(
            x_positions[bar_slot],
            films + 0.2,
            f"{highlight_species}: {films} films",
            ha='center',
            fontsize=10,
            color='orange'
        )

    # Tick font sizes and a light horizontal grid for readability
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    plt.show()

# Example Usage: Highlight "Wookiee"
# Requires the 'film_count' column added to species_df at the top of this cell.
plot_film_count_by_species(species_df, highlight_species='Wookiee')

In [None]:
# Calculate the number of films for each starship
# Adds a 'film_count' column to starships_df in place; plot_film_count_by_starship reads that column.
# Assumes every 'films' entry is a sized collection, since len() is applied to each — TODO confirm
starships_df['film_count'] = starships_df['films'].apply(len)

def plot_film_count_by_starship(starships_df, highlight_starship=None):
    """
    Bar chart of the number of films per starship, most films first.

    Parameters:
        starships_df (DataFrame): Must contain a 'name' column and a numeric
            'film_count' column (this function does not compute it).
        highlight_starship (str, optional): Starship whose bar is coloured orange
            and labelled with its film count. Ignored when no row has that name.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Sort by film count for a cleaner chart (returns a new frame; input is untouched)
    starships_df_sorted = starships_df.sort_values(by='film_count', ascending=False)
    names = starships_df_sorted['name']
    film_counts = starships_df_sorted['film_count']

    # One integer slot per bar; combined with width=0.6 this leaves a gap between bars
    x_positions = np.arange(len(starships_df_sorted))

    # Assign colors: highlight one starship differently
    colors = ['orange' if name == highlight_starship else 'skyblue' for name in names]

    # Create the bar chart with adjusted width and spacing
    plt.figure(figsize=(18, 6))
    plt.bar(x_positions, film_counts, color=colors, width=0.6)
    plt.xticks(x_positions, names, rotation=45, ha='right')
    plt.title('Number of Films per Starship', fontsize=16)
    plt.xlabel('Starship', fontsize=14)
    plt.ylabel('Number of Films', fontsize=14)

    # Annotate highlighted starship.
    # The bar is located by row position rather than by index label: a label lookup
    # picks the wrong bar whenever the DataFrame index contains repeated labels.
    bar_slot = next(
        (slot for slot, name in enumerate(names) if name == highlight_starship),
        None,
    )
    if bar_slot is not None:
        films = film_counts.iloc[bar_slot]
        plt.text(
            x_positions[bar_slot],
            films + 0.2,
            f"{highlight_starship}: {films} films",
            ha='center',
            fontsize=10,
            color='orange'
        )

    # Tick font sizes and a light horizontal grid for readability
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    plt.show()

# Example Usage: Highlight "Millennium Falcon"
# Requires the 'film_count' column added to starships_df at the top of this cell.
plot_film_count_by_starship(starships_df, highlight_starship='Millennium Falcon')

### Chart Function Testing

In [30]:
def plot_bar_chart(
    df, x_col, y_col,
    title, xlabel, ylabel,
    highlight_item=None, log_scale=False, annotation_col=None
):
    """
    Generalized function to create bar charts for different Star Wars data.

    Bars are sorted by Y value in descending order. The input DataFrame is not
    modified: numeric coercion, NaN filtering and the optional log column are
    applied to a copy.

    Parameters:
        df (DataFrame): The input DataFrame.
        x_col (str): Column name to be used for X-axis labels.
        y_col (str): Column name for Y-axis values. Values that pd.to_numeric
            cannot parse become NaN and those rows are left out of the chart.
        title (str): Chart title.
        xlabel (str): Label for X-axis.
        ylabel (str): Label for Y-axis (wrapped as 'Log10(...)' when log_scale is set).
        highlight_item (str, optional): Name of the item to highlight in orange.
        log_scale (bool, optional): Plot log10(value + 1) instead of the raw Y values.
        annotation_col (str, optional): Column for additional bar annotations
            (e.g., height, speed). Coerced to numeric; unparseable values print
            as an empty label.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Work on a copy so the caller's frame keeps its original y_col values
    data = df.copy()
    data[y_col] = pd.to_numeric(data[y_col], errors='coerce')
    # .copy() detaches the filtered frame, so adding the log column below does not
    # trigger pandas' SettingWithCopyWarning
    data = data.dropna(subset=[y_col]).copy()

    # Apply log10 transformation if enabled
    if log_scale:
        log_col = f'log_{y_col}'
        data[log_col] = np.log10(data[y_col] + 1)  # Avoid log(0)
        y_col = log_col

    # Sort by Y values for better visualization
    df_sorted = data.sort_values(by=y_col, ascending=False)
    labels = df_sorted[x_col].tolist()
    heights = df_sorted[y_col].to_numpy()

    # Assign colors: highlight selected item differently
    colors = ['orange' if name == highlight_item else 'skyblue' for name in labels]

    # Create the bar chart
    plt.figure(figsize=(14, 6))
    bars = plt.bar(labels, heights, color=colors)

    # Add labels and title
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel if not log_scale else f'Log10({ylabel})', fontsize=14)
    plt.title(title, fontsize=16)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate each bar with additional info if provided
    if annotation_col:
        annotations = pd.to_numeric(df_sorted[annotation_col], errors='coerce')
        for bar, annotation in zip(bars, annotations):
            plt.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.1,
                f"{annotation}" if not pd.isna(annotation) else "",
                ha='center', fontsize=9, color='black'
            )

    # Highlighted item annotation (only name).
    # The row position is used as the x coordinate; this relies on matplotlib placing
    # the categorical bars at x = 0, 1, 2, ... in the order the labels were passed.
    if highlight_item in labels:
        highlight_index = labels.index(highlight_item)
        plt.text(
            highlight_index,
            heights[highlight_index] + 0.2,
            f"{highlight_item}",
            ha='center', color='red', fontsize=11, fontweight='bold'
        )

    plt.tight_layout()
    plt.show()

In [None]:
# plot_bar_chart sorts bars by value in descending order, so the tallest character
# is drawn first; the title states that order.
plot_bar_chart(
    df=people_df, 
    x_col='name', y_col='height', 
    title='Star Wars Characters: Height from Tallest to Shortest',
    xlabel='Character', ylabel='Height (cm)', 
    highlight_item='Luke Skywalker'
)

In [None]:
# log_scale=True makes plot_bar_chart draw log10(cost_in_credits + 1);
# annotation_col prints each ship's numeric max_atmosphering_speed above its bar
# (values that are not numeric are printed as an empty label).
plot_bar_chart(
    df=starships_df, 
    x_col='name', y_col='cost_in_credits', 
    title='Log-Scaled Starship Cost in Credits',
    xlabel='Starship', ylabel='Cost in Credits', 
    highlight_item='Millennium Falcon', log_scale=True,
    annotation_col='max_atmosphering_speed'
)

In [None]:
# Recomputed here so this cell does not depend on the earlier per-planet chart cell
# having been run; adds/overwrites the 'film_count' column on planets_df in place.
planets_df['film_count'] = planets_df['films'].apply(len)  # Count films per planet

# Film counts per planet drawn through the generic helper, sorted most films first.
plot_bar_chart(
    df=planets_df, 
    x_col='name', y_col='film_count', 
    title='Number of Films per Planet',
    xlabel='Planet', ylabel='Number of Films', 
    highlight_item='Tatooine'
)

## Section 4: Ask User to Select Visualization

In [None]:
import ipywidgets as widgets

# Menu definition for the two dropdowns.
# Each main category lists its subcategories (shown in the second dropdown) and a
# human-readable label of the visualization associated with each subcategory.
categories = {
    "Character": {
        "subcategories": [
            "Height Comparison",
            "Mass Comparison",
        ],
        "visualizations": {
            "Height Comparison": "Character Height Bar Plot",
            "Mass Comparison": "Character Mass vs. Height Scatter Plot",
        },
    },
    "Homeworld": {
        "subcategories": [
            "Diameter Comparison",
            "Population Comparison",
        ],
        "visualizations": {
            "Diameter Comparison": "Planet Diameter vs. Surface Water %",
            "Population Comparison": "Planet Population vs. Diameter Plot",
        },
    },
    "Species": {
        "subcategories": [
            "Lifespan Comparison",
            "Height Comparison",
        ],
        "visualizations": {
            "Lifespan Comparison": "Avg Lifespan vs. Species Type Plot",
            "Height Comparison": "Avg Height vs. Lifespan Plot",
        },
    },
    "Spaceship": {
        "subcategories": [
            "Speed Comparison",
            "Capacity Comparison",
        ],
        "visualizations": {
            "Speed Comparison": "Max Speed vs. Cost Plot",
            "Capacity Comparison": "Passenger Capacity vs. Size Plot",
        },
    },
}

# Dropdown listing the main categories (the keys of `categories`)
main_category_dropdown = widgets.Dropdown(
    description='Main Category:',
    options=list(categories),
)

# Dropdown for subcategories; starts empty and is filled by update_subcategories
sub_category_dropdown = widgets.Dropdown(
    description='Subcategory:',
    options=[],
)

def update_subcategories(change):
    """
    Refill the subcategory dropdown for the newly selected main category.

    Parameters:
        change (dict): Change notification; only change['new'] is read, and it
            must be a key of the module-level `categories` dict.
    """
    choices = categories[change['new']]['subcategories']
    sub_category_dropdown.options = choices

    # Pre-select the first subcategory, or clear the selection if there is none
    if choices:
        sub_category_dropdown.value = choices[0]
    else:
        sub_category_dropdown.value = None
    
# Attach the update function to the main category dropdown
# (names='value' restricts the callback to changes of the dropdown's selected value)
main_category_dropdown.observe(update_subcategories, names='value')

def generate_visualization(b):
    """
    Button callback: draw the chart matching the two dropdown selections.

    The selection message and the chart are emitted inside the `output` widget so
    they appear below the controls and are replaced on the next click. Each
    (main category, subcategory) pair triggers exactly one plot function.

    Parameters:
        b: The clicked button, as passed by the widget framework. Not used.
    """
    selected_main = main_category_dropdown.value
    selected_sub = sub_category_dropdown.value

    # Dispatch table keyed by (main category, subcategory). Lambdas defer the name
    # lookup until a pair is actually selected, so a helper that has not been
    # defined only fails when its own option is chosen.
    # NOTE(review): the plot_* helpers referenced here are not defined in the
    # visible part of the notebook — confirm each one exists.
    plotters = {
        ("Character", "Height Comparison"): lambda: plot_character_height(people_df),
        ("Character", "Mass Comparison"): lambda: plot_mass_by_character(people_df),
        ("Homeworld", "Diameter Comparison"): lambda: plot_diameter_comparison(),
        ("Homeworld", "Population Comparison"): lambda: plot_population_comparison(),
        ("Species", "Lifespan Comparison"): lambda: plot_lifespan_comparison(),
        ("Species", "Height Comparison"): lambda: plot_height_comparison(),
        ("Spaceship", "Speed Comparison"): lambda: plot_spaceship_speed(),
        ("Spaceship", "Capacity Comparison"): lambda: plot_spaceship_capacity(),
    }

    # Clear previous output
    output.clear_output()

    # Everything printed or plotted inside this context is captured by the widget
    with output:
        # Display the selected options
        print(f"You selected Main Category: **{selected_main}**, Subcategory: **{selected_sub}**.")

        plotter = plotters.get((selected_main, selected_sub))
        if plotter is not None:
            plotter()
    
#     # Assuming you have a DataFrame named 'df' with data to visualize
# df = ... 

# # Create a dropdown widget for visualization types
# viz_type = widgets.Dropdown(
#     options=['Bar Chart', 'Line Chart', 'Scatter Plot', 'Pie Chart', 'Histogram'],
#     description='Choose a visualization:'
# )

# # Create a function to display the visualization based on user's selection
# def display_visualization(viz_type):
#     if viz_type == 'Bar Chart':
#         # Code to create bar chart
#         df.plot.bar()
#     elif viz_type == 'Line Chart':
#         # Code to create line chart
#         df.plot.line()
#     # ... (Add other visualization types as needed)

# # Display the dropdown widget
# display(viz_type)

# # Call the display_visualization function when the dropdown value changes
# widgets.interact(display_visualization, viz_type=viz_type)
            
            
# Button that triggers the chart for the current dropdown selections
generate_button = widgets.Button(
    description='Generate Visualization',
    button_style="success",
    icon="check",
)
generate_button.on_click(generate_visualization)

# Output area placed under the controls; generate_visualization clears it on each click
output = widgets.Output()

# Show the controls and the output area together
display(main_category_dropdown, sub_category_dropdown, generate_button, output)

# Fill the subcategory dropdown for the initially selected main category
update_subcategories({'new': main_category_dropdown.value})
