# Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter # Import ScalarFormatter for axis formatting

In [2]:
# read instagram
insta_df = pd.read_csv('./Clean_Data/merged_instagram_data_cleaned.csv')

# read tiktok
tiktok_df = pd.read_csv('./Clean_Data/merged_tiktok_data_cleaned.csv')

# read youtube
youtube_df = pd.read_csv('./Clean_Data/merged_youtube_data_cleaned.csv')

In [3]:
def calculate_engagement_rate(row, like_col, comment_col, subscriber_col, view_col=None, share_col=None):
    likes = row[like_col] if like_col in row else 0
    comments = row[comment_col] if comment_col in row else 0
    views = row[view_col] if view_col in row else 0
    shares = row[share_col] if share_col in row else 0
    subscribers = row[subscriber_col] if subscriber_col in row else 0

    total_engagement = likes + comments + views + shares

    if subscribers > 0:
        return (total_engagement / subscribers) * 100
    else:
        return 0  # Or return np.nan or some other value

In [4]:
def compare_engagement_across_platforms(insta_df, tiktok_df, youtube_df):
    engagement_data = []

    # Inside the Instagram processing block
    if insta_df is not None and 'likes_avg' in insta_df.columns and 'comments_avg' in insta_df.columns and 'subscribers' in insta_df.columns and 'instagram_name' in insta_df.columns:
        print("\nData types for Instagram engagement columns:")
        print(f"likes_avg: {insta_df['likes_avg'].dtype}")
        print(f"comments_avg: {insta_df['comments_avg'].dtype}")
        if 'views_avg' in insta_df.columns:
            print(f"views_avg: {insta_df['views_avg'].dtype}")
        if 'shares_avg' in insta_df.columns:
            print(f"shares_avg: {insta_df['shares_avg'].dtype}")
        # ... rest of the Instagram processing code

    # Inside the TikTok processing block
    if tiktok_df is not None and 'likes_avg' in tiktok_df.columns and 'comments_avg' in tiktok_df.columns and 'subscribers' in tiktok_df.columns and 'tiktoker_name' in tiktok_df.columns:
        print("\nData types for TikTok engagement columns:")
        print(f"likes_avg: {tiktok_df['likes_avg'].dtype}")
        print(f"comments_avg: {tiktok_df['comments_avg'].dtype}")
        if 'views_avg' in tiktok_df.columns:
            print(f"views_avg: {tiktok_df['views_avg'].dtype}")
        if 'shares_avg' in tiktok_df.columns:
            print(f"shares_avg: {tiktok_df['shares_avg'].dtype}")
        # ... rest of the TikTok processing code

    # Inside the YouTube processing block
    if youtube_df is not None and 'avg_likes' in youtube_df.columns and 'avg_comments' in youtube_df.columns and 'subscribers' in youtube_df.columns and 'youtuber_name' in youtube_df.columns and 'avg_views' in youtube_df.columns:
        print("\nData types for YouTube engagement columns:")
        print(f"avg_likes: {youtube_df['avg_likes'].dtype}")
        print(f"avg_comments: {youtube_df['avg_comments'].dtype}")
        if 'avg_views' in youtube_df.columns:
            print(f"avg_views: {youtube_df['avg_views'].dtype}")
        if 'shares_avg' in youtube_df.columns: # Assuming you might have a shares column for YouTube later
            print(f"shares_avg: {youtube_df['shares_avg'].dtype}")
        # ... rest of the YouTube processing code
    elif youtube_df is not None and 'likes_avg' in youtube_df.columns and 'comments_avg' in youtube_df.columns and 'subscribers' in youtube_df.columns and 'youtuber_name' in youtube_df.columns and 'views_avg' in youtube_df.columns:
        print("\nData types for YouTube engagement columns (alternative):")
        print(f"likes_avg: {youtube_df['likes_avg'].dtype}")
        print(f"comments_avg: {youtube_df['comments_avg'].dtype}")
        if 'views_avg' in youtube_df.columns:
            print(f"views_avg: {youtube_df['views_avg'].dtype}")

    if engagement_data:
        engagement_df = pd.DataFrame(engagement_data)
        engagement_df['engagement_rate'] = engagement_df['engagement_rate'].replace([np.inf, -np.inf], np.nan).dropna()

        print("\n--- Comparative Analysis of Engagement Across Platforms ---")
        print("\nAverage Engagement Rate by Platform:")
        print(engagement_df.groupby('platform')['engagement_rate'].mean().sort_values(ascending=False))

        plt.figure(figsize=(10, 7))
        sns.boxplot(x='platform', y='engagement_rate', data=engagement_df)
        plt.title('Comparison of Engagement Rates Across Platforms')
        plt.xlabel('Platform')
        plt.ylabel('Engagement Rate (%)')
        plt.yscale('log') # Using log scale to handle potential outliers
        plt.show()

        plt.figure(figsize=(12, 8))
        sns.violinplot(x='platform', y='engagement_rate', data=engagement_df)
        plt.title('Distribution of Engagement Rates Across Platforms')
        plt.xlabel('Platform')
        plt.ylabel('Engagement Rate (%)')
        plt.yscale('log') # Using log scale to handle potential outliers
        plt.show()
    else:
        print("Could not calculate engagement rate for any of the platforms due to missing data.")

# Assuming your DataFrames are named insta_df, tiktok_df, youtube_df
compare_engagement_across_platforms(insta_df, tiktok_df, youtube_df)


Data types for Instagram engagement columns:
likes_avg: object
comments_avg: object
views_avg: object

Data types for TikTok engagement columns:
likes_avg: float64
comments_avg: float64
views_avg: float64
shares_avg: float64

Data types for YouTube engagement columns:
avg_likes: float64
avg_comments: float64
avg_views: float64
Could not calculate engagement rate for any of the platforms due to missing data.


In [6]:
# --- 1. Define the Core Analysis Function (Subscriber-focused) ---
def compare_subscriber_distribution_and_top_influencers(insta_df, tiktok_df, youtube_df,
                                                       insta_subscriber_col='subscribers', insta_name_col='instagram_name',
                                                       tiktok_subscriber_col='subscribers', tiktok_name_col='tiktoker_name',
                                                       yt_subscriber_col='subscribers', yt_name_col='youtuber_name'):
    """
    Compares subscriber distribution and identifies top influencers across Instagram, TikTok, and YouTube.
    Focuses on raw subscriber counts as a primary metric for cross-platform comparison.

    Args:
        insta_df (pd.DataFrame): DataFrame containing Instagram influencer data.
        tiktok_df (pd.DataFrame): DataFrame containing TikTok influencer data.
        youtube_df (pd.DataFrame): DataFrame containing YouTube influencer data.
        insta_subscriber_col (str): Name of the subscribers column in insta_df.
        insta_name_col (str): Name of the influencer name column in insta_df.
        tiktok_subscriber_col (str): Name of the subscribers column in tiktok_df.
        tiktok_name_col (str): Name of the influencer name column in tiktok_df.
        yt_subscriber_col (str): Name of the subscribers column in youtube_df.
        yt_name_col (str): Name of the influencer name column in youtube_df.
    """
    # Initialize all_subscriber_data at the beginning of the function scope
    all_subscriber_data = []

    print("--- Starting Subscriber Analysis ---")

    # --- Process Instagram data ---
    if insta_df is not None and insta_subscriber_col in insta_df.columns and insta_name_col in insta_df.columns:
        temp_df = insta_df[[insta_name_col, insta_subscriber_col]].copy()
        temp_df.columns = ['influencer_name', 'subscribers'] # Standardize column names for concatenation
        temp_df['platform'] = 'Instagram'
        all_subscriber_data.append(temp_df)
        print(f"\n--- Instagram Subscriber Summary ({insta_df.shape[0]} rows) ---")
        # Ensure subscribers column is numeric for describe()
        temp_df['subscribers'] = pd.to_numeric(temp_df['subscribers'], errors='coerce')
        print(temp_df['subscribers'].describe().apply(lambda x: f"{x:,.0f}")) # Format for readability
        print(f"\nTop 10 Instagram Influencers by Subscribers:")
        print(temp_df.sort_values(by='subscribers', ascending=False).head(10).to_string(index=False)) # to_string for better print formatting
    else:
        print("Instagram data or key columns missing for subscriber analysis. Skipping Instagram.")

    # --- Process TikTok data ---
    if tiktok_df is not None and tiktok_subscriber_col in tiktok_df.columns and tiktok_name_col in tiktok_df.columns:
        temp_df = tiktok_df[[tiktok_name_col, tiktok_subscriber_col]].copy()
        temp_df.columns = ['influencer_name', 'subscribers'] # Standardize column names
        temp_df['platform'] = 'TikTok'
        all_subscriber_data.append(temp_df)
        print(f"\n--- TikTok Subscriber Summary ({tiktok_df.shape[0]} rows) ---")
        # Ensure subscribers column is numeric for describe()
        temp_df['subscribers'] = pd.to_numeric(temp_df['subscribers'], errors='coerce')
        print(temp_df['subscribers'].describe().apply(lambda x: f"{x:,.0f}")) # Format for readability
        print(f"\nTop 10 TikTok Influencers by Subscribers:")
        print(temp_df.sort_values(by='subscribers', ascending=False).head(10).to_string(index=False))
    else:
        print("TikTok data or key columns missing for subscriber analysis. Skipping TikTok.")

    # --- Process YouTube data ---
    if youtube_df is not None and yt_subscriber_col in youtube_df.columns and yt_name_col in youtube_df.columns:
        temp_df = youtube_df[[yt_name_col, yt_subscriber_col]].copy()
        temp_df.columns = ['influencer_name', 'subscribers'] # Standardize column names
        temp_df['platform'] = 'YouTube'
        all_subscriber_data.append(temp_df)
        print(f"\n--- YouTube Subscriber Summary ({youtube_df.shape[0]} rows) ---")
        # Ensure subscribers column is numeric for describe()
        temp_df['subscribers'] = pd.to_numeric(temp_df['subscribers'], errors='coerce')
        print(temp_df['subscribers'].describe().apply(lambda x: f"{x:,.0f}")) # Format for readability
        print(f"\nTop 10 YouTube Influencers by Subscribers:")
        print(temp_df.sort_values(by='subscribers', ascending=False).head(10).to_string(index=False))
    else:
        print("YouTube data or key columns missing for subscriber analysis. Skipping YouTube.")

    # --- Combine data for comparative visualization ---
    # This block will only execute if at least one platform had valid subscriber data
    if all_subscriber_data:
        combined_subscribers_df = pd.concat(all_subscriber_data, ignore_index=True)
        # Final cleanup for the combined DataFrame
        combined_subscribers_df['subscribers'] = pd.to_numeric(combined_subscribers_df['subscribers'], errors='coerce')
        combined_subscribers_df.dropna(subset=['subscribers'], inplace=True) # Drop rows where subscribers became NaN

        if combined_subscribers_df.empty:
            print("\nCombined subscriber DataFrame is empty after cleaning. Cannot generate plots.")
            return # Exit the function if no data to plot

        print("\n--- Comparative Subscriber Analysis Across Platforms ---")
        print("\nOverall Average Subscribers by Platform:")
        # Use .apply(lambda x: f"{x:,.0f}") for thousands comma separator
        print(combined_subscribers_df.groupby('platform')['subscribers'].mean().sort_values(ascending=False).apply(lambda x: f"{x:,.0f}"))

        # --- Box Plot: Subscriber Distribution ---
        plt.figure(figsize=(10, 7))
        ax1 = sns.boxplot(x='platform', y='subscribers', data=combined_subscribers_df)
        plt.title('Comparison of Subscriber Distribution Across Platforms')
        plt.xlabel('Platform')
        plt.ylabel('Subscribers')
        plt.yscale('log') # Log scale is often necessary for subscriber counts

        # FIX: Manually set formatter for log scale to avoid AttributeError
        formatter = ScalarFormatter()
        formatter.set_scientific(False) # Turn off scientific notation
        formatter.set_powerlimits((-3, 6)) # Pass as a single tuple argument
        ax1.yaxis.set_major_formatter(formatter)
        ax1.yaxis.set_minor_formatter(formatter) # Apply to minor ticks too for consistency

        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout() # Adjust layout to prevent labels from overlapping
        plt.show()

        # --- Violin Plot: Subscriber Distribution ---
        plt.figure(figsize=(12, 8))
        ax2 = sns.violinplot(x='platform', y='subscribers', data=combined_subscribers_df)
        plt.title('Detailed Distribution of Subscribers Across Platforms')
        plt.xlabel('Platform')
        plt.ylabel('Subscribers')
        plt.yscale('log')

        # FIX: Apply the same formatter to the second plot's y-axis
        formatter2 = ScalarFormatter()
        formatter2.set_scientific(False)
        formatter2.set_powerlimits((-3, 6)) # Pass as a single tuple argument
        ax2.yaxis.set_major_formatter(formatter2)
        ax2.yaxis.set_minor_formatter(formatter2)

        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout() # Adjust layout
        plt.show()

    else:
        print("No valid subscriber data could be consolidated for comparative analysis. Check individual platform data and column names.")

# --- 2. Example Data Loading (Replace with your actual data loading) ---
# IMPORTANT: These are placeholders. You MUST replace these with your actual
# pd.read_csv() or pd.read_excel() calls to load your dataframes.
# Make sure your CSV files are in the same directory as your Python script,
# or provide the full path to the files.

try:
    insta_df = pd.read_csv('instagram_data.csv') # <--- Replace with your actual Instagram CSV file path
    print("Instagram data loaded.")
except FileNotFoundError:
    print("instagram_data.csv not found. Please check the file path.")
    insta_df = None # Set to None if file not found

try:
    tiktok_df = pd.read_csv('tiktok_data.csv') # <--- Replace with your actual TikTok CSV file path
    print("TikTok data loaded.")
except FileNotFoundError:
    print("tiktok_data.csv not found. Please check the file path.")
    tiktok_df = None # Set to None if file not found

try:
    youtube_df = pd.read_csv('youtube_data.csv') # <--- Replace with your actual YouTube CSV file path
    print("YouTube data loaded.")
except FileNotFoundError:
    print("youtube_data.csv not found. Please check the file path.")
    youtube_df = None # Set to None if file not found


# --- 3. Call the Main Analysis Function ---
# This is where you actually run the analysis with your loaded DataFrames
# Ensure the column names here EXACTLY match the column names in your loaded DataFrames.
# If your 'subscribers' column is named 'followers' in Instagram, change it below.
compare_subscriber_distribution_and_top_influencers(
    insta_df=insta_df,
    tiktok_df=tiktok_df,
    youtube_df=youtube_df,
    insta_subscriber_col='subscribers', # Adjust if your Instagram dataframe's subscriber column is different
    insta_name_col='instagram_name',    # Adjust if your Instagram dataframe's name column is different
    tiktok_subscriber_col='subscribers',# Adjust if your TikTok dataframe's subscriber column is different
    tiktok_name_col='tiktoker_name',    # Adjust if your TikTok dataframe's name column is different
    yt_subscriber_col='subscribers',    # Adjust if your YouTube dataframe's subscriber column is different
    yt_name_col='youtuber_name'         # Adjust if your YouTube dataframe's name column is different
)

instagram_data.csv not found. Please check the file path.
tiktok_data.csv not found. Please check the file path.
youtube_data.csv not found. Please check the file path.
--- Starting Subscriber Analysis ---
Instagram data or key columns missing for subscriber analysis. Skipping Instagram.
TikTok data or key columns missing for subscriber analysis. Skipping TikTok.
YouTube data or key columns missing for subscriber analysis. Skipping YouTube.
No valid subscriber data could be consolidated for comparative analysis. Check individual platform data and column names.
