In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter

In [None]:
# Rating distribution: 20-bin histogram with a KDE overlay, plus dashed
# reference lines marking the mean and the median of the `rating` column.
ratings = df_merged_cleaned['rating']
mean_rating = ratings.mean()
median_rating = ratings.median()

plt.figure(figsize=(12, 8))
ax = sns.histplot(ratings, bins=20, kde=True, color='dodgerblue', edgecolor='black', alpha=0.7)

# Title, axis labels, tick sizes and a light dashed grid
ax.set_title('Distribution of Movie Ratings', fontsize=18, fontweight='bold', pad=15)
ax.set_xlabel('Rating', fontsize=14, labelpad=10)
ax.set_ylabel('Frequency', fontsize=14, labelpad=10)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.grid(visible=True, linestyle='--', alpha=0.6)

# Vertical markers; the legend entries show each statistic to two decimals
ax.axvline(mean_rating, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_rating:.2f}')
ax.axvline(median_rating, color='green', linestyle='--', linewidth=2, label=f'Median: {median_rating:.2f}')

ax.legend(fontsize=12, loc='upper left')
plt.show()


In [None]:
# Tally every genre occurrence; each non-null `genres` cell is split on ', '.
genre_counts = Counter()
for genre_string in df_merged_cleaned['genres'].dropna():
    genre_counts.update(genre_string.split(', '))

# Keep the ten most frequent genres, ordered from most to least frequent.
top_genres = dict(genre_counts.most_common(10))
genre_df = pd.DataFrame(list(top_genres.items()), columns=['Genre', 'Frequency'])
genre_df = genre_df.sort_values(by='Frequency', ascending=False)

plt.figure(figsize=(14, 8))
# Horizontal bars keep the genre names readable
ax = sns.barplot(data=genre_df, x='Frequency', y='Genre', palette='viridis', orient='h')

# Title, axis labels, tick sizes and a dashed grid along the value axis
ax.set_title('Top 10 Most Popular Genres', fontsize=18, fontweight='bold', pad=15)
ax.set_xlabel('Frequency of Appearance', fontsize=14, labelpad=10)
ax.set_ylabel('Genre', fontsize=14, labelpad=10)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.6)

# Write the exact count just past the end of each bar
for row_position, frequency in enumerate(genre_df['Frequency']):
    ax.text(frequency + 2, row_position, f"{frequency:,}", va='center', fontsize=12, color='black')

plt.show()


In [None]:

def format_millions(value, tick_number):
    """Format an axis tick value as a compact label with an M or B suffix.

    The signature matches the ``(value, position)`` callback expected by
    ``matplotlib.ticker.FuncFormatter``.

    Args:
        value: Numeric tick value.
        tick_number: Tick position passed by the formatter; not used.

    Returns:
        str: ``value`` in billions with a ``B`` suffix when its magnitude is
        at least 1e9, in millions with an ``M`` suffix when at least 1e6,
        otherwise the integer part of ``value``. Scaled values keep one
        decimal only when it is non-zero (``2e9`` -> ``'2B'``,
        ``1.5e9`` -> ``'1.5B'``).
    """
    def _compact(scaled):
        # Truncating with int() would label 1.5 and 1.0 identically, so round
        # to one decimal and drop it only when it carries no information.
        text = f'{scaled:.1f}'
        return text[:-2] if text.endswith('.0') else text

    # Choose the unit from the magnitude so negative ticks are scaled as well.
    magnitude = abs(value)
    if magnitude >= 1e9:
        return f'{_compact(value / 1e9)}B'
    if magnitude >= 1e6:
        return f'{_compact(value / 1e6)}M'
    return f'{int(value)}'


# Bucket each movie's budget into order-of-magnitude ranges (edges in USD).
budget_edges = [100000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000]
budget_labels = ['100K–1M', '1M–10M', '10M–100M', '100M–1B']
df_merged_cleaned['budget_bins'] = pd.cut(
    df_merged_cleaned['budget'],
    bins=budget_edges,
    labels=budget_labels,
    include_lowest=True,
)

# Mean revenue per budget bucket; observed=False is passed explicitly to groupby
revenue_by_bin = df_merged_cleaned.groupby('budget_bins', observed=False)['revenue']
budget_vs_revenue = revenue_by_bin.mean().reset_index()

# Bar chart of average revenue for each budget range
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=budget_vs_revenue, x='budget_bins', y='revenue', palette='Blues_d')

# Render the revenue axis through format_millions instead of raw numbers
ax.yaxis.set_major_formatter(FuncFormatter(format_millions))

ax.set_title('Average Revenue by Budget Range', fontsize=16, fontweight='bold')
ax.set_xlabel('Budget Range (USD)', fontsize=14)
ax.set_ylabel('Average Revenue (USD)', fontsize=14)
ax.grid(visible=True, linestyle='--', alpha=0.6)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.show()


In [None]:

# Ten directors with the highest mean `rating`.
# NOTE(review): explode() only separates list-like cells; if `director` holds
# plain strings this step leaves the rows unchanged — confirm the column type.
director_rows = df_merged_cleaned.explode('director')
mean_rating_by_director = director_rows.groupby('director')['rating'].mean()
top_directors = mean_rating_by_director.sort_values(ascending=False).head(10)

plt.figure(figsize=(14, 7))
# Horizontal bars keep the director names readable
ax = sns.barplot(x=top_directors.values, y=top_directors.index, palette='magma', orient='h')

# Print each average (two decimals) just past the end of its bar
for row_position, avg_rating in enumerate(top_directors.values):
    ax.text(avg_rating + 0.1, row_position, f"{avg_rating:.2f}", va='center', fontsize=12)

ax.set_title('Top 10 Directors by Average Movie Rating', fontsize=18, fontweight='bold', pad=15)
ax.set_xlabel('Average Rating', fontsize=14)
ax.set_ylabel('Director', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.grid(visible=True, linestyle='--', alpha=0.6, axis='x')

plt.show()


In [None]:

# Derive the release year from `release_date` and store it on the dataframe.
release_dates = pd.to_datetime(df_merged_cleaned['release_date'])
df_merged_cleaned['release_year'] = release_dates.dt.year

# Number of movies per release year, in chronological order
movies_per_year = df_merged_cleaned['release_year'].value_counts().sort_index()

plt.figure(figsize=(14, 7))
ax = sns.lineplot(
    x=movies_per_year.index,
    y=movies_per_year.values,
    marker='o',
    linestyle='-',
    linewidth=2,
)

ax.set_title('Evolution of Movie Production Over the Years', fontsize=16, fontweight='bold')
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Number of Movies Released', fontsize=14)
ax.grid(visible=True, linestyle='--', alpha=0.6)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()


In [None]:
# Tally every actor credit; each non-null `cast` cell is split on ', '.
actor_counts = Counter()
for cast_string in df_merged_cleaned['cast'].dropna():
    actor_counts.update(cast_string.split(', '))

# Keep the ten most credited actors, ordered from most to fewest movies.
top_actors = dict(actor_counts.most_common(10))
actor_df = pd.DataFrame(list(top_actors.items()), columns=['Actor', 'Number of Movies'])
actor_df = actor_df.sort_values(by='Number of Movies', ascending=False)

plt.figure(figsize=(14, 8))
# Horizontal bars keep the actor names readable
ax = sns.barplot(data=actor_df, x='Number of Movies', y='Actor', palette='plasma', orient='h')

# Title, axis labels, tick sizes and a dashed grid along the value axis
ax.set_title('Top 10 Actors with Most Movie Appearances', fontsize=18, fontweight='bold', pad=15)
ax.set_xlabel('Number of Movies', fontsize=14, labelpad=10)
ax.set_ylabel('Actor', fontsize=14, labelpad=10)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.6)

# Write the exact count just past the end of each bar
for row_position, movie_total in enumerate(actor_df['Number of Movies']):
    ax.text(movie_total + 0.5, row_position, f"{movie_total:,}", va='center', fontsize=12, color='black')

plt.show()


In [None]:
# Convert `duration_time` to minutes, but only while it is still non-numeric.
# The conversion overwrites its own input, so without this guard a second run
# of the cell would feed the float minutes back into pd.to_timedelta (which
# reads bare numbers as nanoseconds) and corrupt the column.
if not pd.api.types.is_numeric_dtype(df_merged_cleaned['duration_time']):
    df_merged_cleaned['duration_time'] = (
        pd.to_timedelta(df_merged_cleaned['duration_time']).dt.total_seconds() / 60
    )

# Columns compared in the heatmap.
# NOTE(review): `duration_time` is converted above but is not part of this
# list — confirm whether it was meant to be included.
selected_columns = ['rating', 'ratingcount', 'imdb_votes', 'popularity', 'budget', 'revenue']
numeric_columns = df_merged_cleaned[selected_columns]

# Coerce unparseable entries to NaN, then keep only rows complete in every column
numeric_columns = numeric_columns.apply(pd.to_numeric, errors='coerce').dropna()

correlation_matrix = numeric_columns.corr()

plt.figure(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,              # print the coefficient inside each cell
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1, vmax=1,         # fix the colour scale to the full correlation range
    linewidths=0.5,
    annot_kws={"size": 12}
)

plt.title('Correlation Heatmap: Metrics Driving Movie Success', fontsize=18, fontweight='bold', pad=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
# One row per (movie, genre). DataFrame.explode only separates list-like
# cells, so ', '-separated genre strings are split into lists first; cells
# that are already lists (or missing) pass through unchanged.
genre_lists = df_merged_cleaned['genres'].apply(
    lambda cell: cell.split(', ') if isinstance(cell, str) else cell
)
df_genres = df_merged_cleaned.assign(genres=genre_lists).explode('genres')

# Per-genre averages and movie counts, ranked by average revenue
genre_summary = df_genres.groupby('genres').agg(
    avg_revenue=('revenue', 'mean'),
    avg_rating=('rating', 'mean'),
    movie_count=('title', 'count')
).sort_values(by='avg_revenue', ascending=False).reset_index()

top_genres = genre_summary.head(10)

# Two panels sharing the genre order: revenue (wide, left) and rating (narrow, right)
fig, axes = plt.subplots(1, 2, figsize=(18, 10), gridspec_kw={'width_ratios': [3, 1]}, dpi=120)

sns.barplot(
    data=top_genres,
    x='avg_revenue',
    y='genres',
    ax=axes[0],
    palette='viridis'
)
axes[0].set_title('Top Genres by Average Revenue', fontsize=14, fontweight='bold', pad=15)
axes[0].set_xlabel('Average Revenue (in Billion USD)', fontsize=14, labelpad=10)
axes[0].set_ylabel('Genres', fontsize=14, labelpad=10)
axes[0].xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x / 1e9:.1f}B'))
axes[0].grid(axis='x', linestyle='--', alpha=0.7)

sns.barplot(
    data=top_genres,
    x='avg_rating',
    y='genres',
    ax=axes[1],
    palette='magma'
)
axes[1].set_title('Top Genres by Average Rating', fontsize=14, fontweight='bold', pad=15)
axes[1].set_xlabel('Average Rating (Out of 10)', fontsize=14, labelpad=10)
axes[1].set_ylabel('')
# Zoom the rating axis to 7–10, but lower the left bound when a genre averages
# below 7 so that its bar and value label are not pushed off the panel.
lowest_rating = top_genres['avg_rating'].min()
rating_floor = 7 if pd.isna(lowest_rating) else min(7, int(lowest_rating))
axes[1].set_xlim(rating_floor, 10)
axes[1].yaxis.set_visible(False)  # genre names are already shown on the left panel
axes[1].grid(axis='x', linestyle='--', alpha=0.7)


plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the figure-level title
plt.suptitle(
    'Comparison of Average Revenue and Ratings Across Top Genres',
    fontsize=20, fontweight='bold', y=0.98
)

# Value labels just past the end of each bar: revenue in billions, rating as-is.
# The offset is 1% of the largest average revenue on the left and 0.02 rating
# points on the right; Series.max() is used so a NaN average cannot poison it.
for ax, metric, fmt, alignment_offset in zip(
    axes,
    ['avg_revenue', 'avg_rating'],
    [lambda x: f'{x / 1e9:.3f}B', lambda x: f'{x:.3f}'],
    [0.01 * top_genres['avg_revenue'].max(), 0.02]
):
    for bar, value in zip(ax.patches, top_genres[metric]):
        ax.text(
            bar.get_width() + alignment_offset,
            bar.get_y() + bar.get_height() / 2,
            fmt(value),
            va='center', fontsize=12, color='black'
        )

plt.show()
