In [1]:
# Smoke-test cell: prints a fixed confirmation string. It performs no imports itself.
print('Successfully Imported')

Successfully Imported


In [31]:
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [37]:
def show_rating_distribution(data):
    """Show a histogram (with KDE) and a box plot of IMDB ratings side by side.

    Args:
        data: pandas object exposing ``.isna()`` -- presumably a Series of
            ratings; a DataFrame is also accepted. Missing entries are
            counted and reported in a small annotation on the figure.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    # made with the help of DeepSeek

    # Collapse the missing-value tally to a single integer. For DataFrame
    # input ``isna().sum()`` is a per-column Series, whose multi-line repr
    # would otherwise be pasted into the annotation text.
    missing_count = int(data.isna().to_numpy().sum())

    # Two panels: histogram on the left, box plot on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Pass ``data`` by keyword: older seaborn releases bind a positional
    # argument to ``x`` instead, which changes the plot orientation.
    sns.histplot(data=data, kde=True, ax=ax1)
    ax1.set_title('IMDB Rating Histogram')
    ax1.set_xlabel('IMDB Rating')
    legend = ax1.get_legend()
    if legend:  # a legend only exists for some inputs; hide it if present
        legend.set_visible(False)

    sns.boxplot(data=data, ax=ax2)
    ax2.set_title('IMDB Rating Box Plot')
    ax2.set_xlabel('Movies')
    ax2.set_ylabel('Rating')

    fig.suptitle("IMDB Rating Distribution")

    fig.text(0.1, 0.95, f'Missing data: {missing_count}',
             ha='center', va='bottom', fontsize=10, color='gray')

    plt.tight_layout()
    plt.show()

In [40]:
def show_budget_hist(data):
    """Show a histogram (with KDE) and a box plot of movie budgets side by side.

    Args:
        data: pandas object exposing ``.isna()`` -- presumably a Series of
            budgets; a DataFrame is also accepted. Missing entries are
            counted and reported in a small annotation on the figure.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    # made with the help of DeepSeek

    # Collapse the missing-value tally to a single integer. For DataFrame
    # input ``isna().sum()`` is a per-column Series, whose multi-line repr
    # would otherwise be pasted into the annotation text.
    missing_count = int(data.isna().to_numpy().sum())

    # Two panels: histogram on the left, box plot on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Pass ``data`` by keyword: older seaborn releases bind a positional
    # argument to ``x`` instead, which changes the plot orientation.
    sns.histplot(data=data, kde=True, ax=ax1)
    ax1.set_title('Budget Histogram')
    ax1.set_xlabel('Budget')
    legend = ax1.get_legend()
    if legend:  # a legend only exists for some inputs; hide it if present
        legend.set_visible(False)

    sns.boxplot(data=data, ax=ax2)
    ax2.set_title('Budget Box Plot')
    ax2.set_xlabel('Movies')
    ax2.set_ylabel('Budget')

    fig.suptitle("Budget Distribution")

    fig.text(0.1, 0.95, f'Missing data: {missing_count}',
             ha='center', va='bottom', fontsize=10, color='gray')

    plt.tight_layout()
    plt.show()

In [None]:
def show_mpa_hist(data):
    """Show a bar chart of how often each MPA rating occurs in ``data``.

    Args:
        data: pandas Series of numeric rating codes as defined in ``Ratings``
            below (1 = 'G' ... 6 = 'No Certificate'). Missing entries are
            counted and reported in an annotation instead of being plotted.

    Returns:
        None. The figure is displayed through ``plt.show()``.

    Raises:
        KeyError: if ``data`` contains a non-missing value that is not one of
            the codes in ``Ratings``.
    """
    # Label -> numeric code. Insertion order doubles as the display order.
    Ratings = {
        'G': 1,
        'PG': 2,
        'PG-13': 3,
        'R': 4,
        'NC-17': 5,
        'No Certificate': 6
    }
    rating_order = list(Ratings)
    known_codes = set(Ratings.values())

    # Calculate missing values
    missing_count = int(data.isna().sum())

    # Tally every observed code in a single pass instead of rescanning the
    # whole Series once per code.
    observed_counts = data.dropna().value_counts().to_dict()

    unknown_codes = [code for code in observed_counts if code not in known_codes]
    if unknown_codes:
        raise KeyError(f'Unknown MPA rating code(s): {unknown_codes}')

    # One row per category, in display order; categories that never occur
    # get an explicit count of 0. Float codes (e.g. 4.0 from a column that
    # contains NaN) hash equal to their int counterparts, so the lookup works.
    plot_data = pd.DataFrame({
        'MPAA Rating': rating_order,
        'Count': [observed_counts.get(Ratings[label], 0) for label in rating_order]
    })

    # Create figure and axis
    fig, ax = plt.subplots(figsize=(10, 6))

    # Create the bar plot
    sns.barplot(data=plot_data, x='MPAA Rating', y='Count', ax=ax, order=rating_order)
    ax.set_title('MPA Rating Distribution')
    ax.set_xlabel('MPA Rating')
    ax.set_ylabel('Count')

    # Remove legend if present
    legend = ax.get_legend()
    if legend:
        legend.remove()

    # Add missing data annotation
    fig.text(0.1, 0.95, f'Missing data: {missing_count}',
             ha='left', va='top', fontsize=10, color='gray')

    plt.tight_layout()
    plt.show()

In [33]:
def show_token_distribution(df):
    """Display a word cloud sized by the per-column mean of ``df``.

    Args:
        df: DataFrame with one column per token; the values are presumably
            TF-IDF scores per document (one row each) -- verify against the
            caller. Each column's mean over the rows is its word weight.

    Returns:
        None. The image is displayed through ``plt.show()``.
    """
    # Column label -> mean of that column (an average, not a sum).
    mean_weight_by_token = df.mean().to_dict()

    # Build the cloud generator first, then feed it the weights.
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud_builder.generate_from_frequencies(mean_weight_by_token)

    # Render the image on a fresh figure with the axes hidden.
    plt.figure(figsize=(10, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [56]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def plot_pareto_charts(df, feature_dict, top_n=20):
    """
    Draw one Pareto chart (frequency bars plus cumulative-% line) per feature group.

    Args:
        df: DataFrame whose columns are summed to obtain a frequency per
            column (presumably one-hot / count columns -- verify against caller).
        feature_dict: mapping from group name ('Cast', 'Directors',
            'Distributors', 'Producers', 'Genre') to the list of ``df`` column
            labels belonging to that group. Missing or empty groups are skipped.
        top_n: number of most frequent columns shown per chart (default 20).
            Expected to be a positive integer.

    Returns:
        None. The figure is left open on the current pyplot state so the
        caller (or the notebook backend) decides when to show or save it.
        If no group has anything to plot, a message is printed instead and
        no figure is created.
    """
    groups = ['Cast', 'Directors', 'Distributors', 'Producers', 'Genre']
    max_label_len = 20  # tick labels longer than this are truncated with '...'

    # Resolve the frequencies BEFORE creating the figure so that groups whose
    # total is zero never get a subplot (previously they left a blank gap).
    plottable = []
    for group in groups:
        cols = feature_dict.get(group)
        if not cols:
            continue
        frequencies = df[cols].sum().sort_values(ascending=False)
        total = frequencies.sum()
        if total == 0:
            continue
        plottable.append((group, frequencies, total))

    if not plottable:
        print("No valid data to plot")
        return None

    # One row per plottable group; squeeze=False keeps ``axes`` 2-D even
    # when there is a single group.
    _, axes = plt.subplots(len(plottable), 1,
                           figsize=(14, 5 * len(plottable)),
                           squeeze=False)

    for ax, (group, frequencies, total) in zip(axes.flatten(), plottable):
        top = frequencies.head(top_n)

        # str() so that non-string column labels (ints, tuples) do not break
        # the length check / slicing.
        x_labels = []
        for label in top.index:
            text = str(label)
            x_labels.append(text[:max_label_len] + '...' if len(text) > max_label_len else text)

        # Cumulative share is computed over ALL columns of the group, then cut
        # to the displayed ones, so the line reflects the share of the total.
        cumulative_pct = (frequencies.cumsum() / total * 100).head(top_n).values

        # Plot bars with original colors
        sns.barplot(x=top.index, y=top.values, ax=ax,
                    color='b', order=top.index)

        # Configure main axis
        ax.set_xticks(range(len(top)))
        ax.set_xticklabels(x_labels, rotation=45, ha='right', fontsize=9)
        ax.tick_params(axis='x', pad=2)
        ax.set_title(f'{group} Pareto Analysis', fontsize=13, pad=15)
        ax.set_ylabel('Frequency', fontsize=10)
        ax.set_xlabel('')

        # Cumulative percentage line on a secondary y axis
        ax2 = ax.twinx()
        ax2.plot(np.arange(len(top)), cumulative_pct,
                 color='r', marker='o', ms=5, linewidth=1.5, alpha=0.8)
        ax2.set_ylim(0, 105)
        ax2.set_ylabel('Cumulative %', fontsize=10, color='r')
        ax2.tick_params(axis='y', labelcolor='r')

        # Add light grid
        ax.yaxis.grid(True, linestyle='--', alpha=0.3)

    plt.tight_layout()

In [57]:
def show_runtime_distribution(data):
    """Show a histogram (with KDE) and a box plot of movie runtimes side by side.

    Args:
        data: pandas object exposing ``.isna()`` -- presumably a Series of
            runtimes; a DataFrame is also accepted. Missing entries are
            counted and reported in a small annotation on the figure.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    # made with the help of DeepSeek

    # Collapse the missing-value tally to a single integer. For DataFrame
    # input ``isna().sum()`` is a per-column Series, whose multi-line repr
    # would otherwise be pasted into the annotation text.
    missing_count = int(data.isna().to_numpy().sum())

    # Two panels: histogram on the left, box plot on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Pass ``data`` by keyword: older seaborn releases bind a positional
    # argument to ``x`` instead, which changes the plot orientation.
    sns.histplot(data=data, kde=True, ax=ax1)
    ax1.set_title('Runtime Histogram')
    ax1.set_xlabel('Runtime')
    legend = ax1.get_legend()
    if legend:  # a legend only exists for some inputs; hide it if present
        legend.set_visible(False)

    sns.boxplot(data=data, ax=ax2)
    ax2.set_title('Runtime Box Plot')
    ax2.set_xlabel('Movies')
    ax2.set_ylabel('Runtime')

    fig.suptitle("Runtime Distribution")

    fig.text(0.1, 0.95, f'Missing data: {missing_count}',
             ha='center', va='bottom', fontsize=10, color='gray')

    plt.tight_layout()
    plt.show()