In [None]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Open the SQLite file that holds the movie data and create a cursor on the
# resulting connection. `conn` is the handle the later cells pass to pandas.
conn = sqlite3.connect(database="movies.sqlite")
cur = conn.cursor()

In [None]:
# Load the columns used by the charts below (budget, revenue, popularity and
# release date), keeping only rows where budget, revenue and popularity are
# all non-NULL.
query = (
    "SELECT budget, revenue, popularity, release_date "
    "FROM movies "
    "WHERE budget IS NOT NULL "
    "AND revenue IS NOT NULL "
    "AND popularity IS NOT NULL;"
)
movie_data = pd.read_sql(sql=query, con=conn)

In [None]:
# Histogram of the `budget` column (50 bins) to show how budgets are distributed.
budget_fig, budget_ax = plt.subplots(figsize=(10, 6))
budget_ax.hist(
    movie_data['budget'],
    bins=50,
    color='lightcoral',
    edgecolor='black',
)
budget_ax.set(
    title='Movie Budget Distribution',
    xlabel='Budget in USD',
    ylabel='Frequency',
)
budget_ax.grid(True)
plt.show()

# Comment: The histogram shows how budgets are spread across every movie returned by the query, which makes unusually large or small budgets easy to spot.

In [None]:
# Heatmap of the pairwise correlations between budget, revenue and popularity.
#
# The correlation is computed on these three columns only. `movie_data` also
# carries `release_date`, which is not one of the features being compared; if
# it is stored as text, calling `movie_data.corr()` on the whole frame raises
# ValueError on pandas >= 2.0, where `numeric_only` defaults to False.
numeric_features = ['budget', 'revenue', 'popularity']
corr = movie_data[numeric_features].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='viridis', fmt='.2f', linewidths=1)
plt.title('Correlation Between Budget, Revenue, and Popularity')
plt.show()

# Comment: Each cell is annotated with the correlation coefficient (two decimals) for a pair of features; with the 'viridis' colormap, lighter cells mark higher values.

In [None]:
# Interactive Plotly scatter of budget (x) against revenue (y); the figure can
# be zoomed and individual points inspected on hover.
axis_labels = {
    'budget': 'Movie Budget ($)',
    'revenue': 'Movie Revenue ($)',
}
fig = px.scatter(
    data_frame=movie_data,
    x='budget',
    y='revenue',
    title='Budget vs. Revenue for Movies',
    labels=axis_labels,
    hover_data=['budget', 'revenue'],
)

fig.show()

# Comment: The scatter plot relates each movie's budget to its revenue. Hovering over a point shows its budget and revenue values.

In [None]:
# Count how many movies were released in each year and plot the trend.
query_years = "SELECT strftime('%Y', release_date) AS release_year, COUNT(*) AS movie_count FROM movies GROUP BY release_year ORDER BY release_year;"

# strftime() returns the year as text, and NULL when release_date is NULL or is
# not a date SQLite can parse. Drop that NULL group (it is not a year) and
# convert the remaining years to integers so matplotlib draws a numeric time
# axis instead of a categorical one with a tick label for every single year.
movies_per_year = (
    pd.read_sql(query_years, conn)
    .dropna(subset=['release_year'])
    .astype({'release_year': int})
)

plt.figure(figsize=(10, 6))
plt.plot(movies_per_year['release_year'], movies_per_year['movie_count'], marker='x', color='purple')
plt.title('Movies Released Per Year')
plt.xlabel('Year')
plt.ylabel('Count of Movies')
plt.grid(True)
plt.xticks(rotation=45)
plt.show()

# Comment: This line plot shows the number of movies released in each year, so rises and falls in the release count over time are visible at a glance.

In [None]:
# Close the database connection now that all queries above have been run.
# Any cell that uses `conn` after this point needs the connection cell re-run first.
conn.close()