# Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# read instagram
insta_df = pd.read_csv('./Clean_Data/merged_instagram_data_cleaned.csv')

# read tiktok
tiktok_df = pd.read_csv('./Clean_Data/merged_tiktok_data_cleaned.csv')

# read youtube
youtube_df = pd.read_csv('./Clean_Data/merged_youtube_data_cleaned.csv')

In [3]:
def calculate_engagement_rate(row, like_col, comment_col, subscriber_col, view_col=None, share_col=None):
    likes = row[like_col] if like_col in row else 0
    comments = row[comment_col] if comment_col in row else 0
    views = row[view_col] if view_col in row else 0
    shares = row[share_col] if share_col in row else 0
    subscribers = row[subscriber_col] if subscriber_col in row else 0

    total_engagement = likes + comments + views + shares

    if subscribers > 0:
        return (total_engagement / subscribers) * 100
    else:
        return 0  # Or return np.nan or some other value

In [4]:
def compare_engagement_across_platforms(insta_df, tiktok_df, youtube_df):
    engagement_data = []

    # Inside the Instagram processing block
    if insta_df is not None and 'likes_avg' in insta_df.columns and 'comments_avg' in insta_df.columns and 'subscribers' in insta_df.columns and 'instagram_name' in insta_df.columns:
        print("\nData types for Instagram engagement columns:")
        print(f"likes_avg: {insta_df['likes_avg'].dtype}")
        print(f"comments_avg: {insta_df['comments_avg'].dtype}")
        if 'views_avg' in insta_df.columns:
            print(f"views_avg: {insta_df['views_avg'].dtype}")
        if 'shares_avg' in insta_df.columns:
            print(f"shares_avg: {insta_df['shares_avg'].dtype}")
        # ... rest of the Instagram processing code

    # Inside the TikTok processing block
    if tiktok_df is not None and 'likes_avg' in tiktok_df.columns and 'comments_avg' in tiktok_df.columns and 'subscribers' in tiktok_df.columns and 'tiktoker_name' in tiktok_df.columns:
        print("\nData types for TikTok engagement columns:")
        print(f"likes_avg: {tiktok_df['likes_avg'].dtype}")
        print(f"comments_avg: {tiktok_df['comments_avg'].dtype}")
        if 'views_avg' in tiktok_df.columns:
            print(f"views_avg: {tiktok_df['views_avg'].dtype}")
        if 'shares_avg' in tiktok_df.columns:
            print(f"shares_avg: {tiktok_df['shares_avg'].dtype}")
        # ... rest of the TikTok processing code

    # Inside the YouTube processing block
    if youtube_df is not None and 'avg_likes' in youtube_df.columns and 'avg_comments' in youtube_df.columns and 'subscribers' in youtube_df.columns and 'youtuber_name' in youtube_df.columns and 'avg_views' in youtube_df.columns:
        print("\nData types for YouTube engagement columns:")
        print(f"avg_likes: {youtube_df['avg_likes'].dtype}")
        print(f"avg_comments: {youtube_df['avg_comments'].dtype}")
        if 'avg_views' in youtube_df.columns:
            print(f"avg_views: {youtube_df['avg_views'].dtype}")
        if 'shares_avg' in youtube_df.columns: # Assuming you might have a shares column for YouTube later
            print(f"shares_avg: {youtube_df['shares_avg'].dtype}")
        # ... rest of the YouTube processing code
    elif youtube_df is not None and 'likes_avg' in youtube_df.columns and 'comments_avg' in youtube_df.columns and 'subscribers' in youtube_df.columns and 'youtuber_name' in youtube_df.columns and 'views_avg' in youtube_df.columns:
        print("\nData types for YouTube engagement columns (alternative):")
        print(f"likes_avg: {youtube_df['likes_avg'].dtype}")
        print(f"comments_avg: {youtube_df['comments_avg'].dtype}")
        if 'views_avg' in youtube_df.columns:
            print(f"views_avg: {youtube_df['views_avg'].dtype}")

    if engagement_data:
        engagement_df = pd.DataFrame(engagement_data)
        engagement_df['engagement_rate'] = engagement_df['engagement_rate'].replace([np.inf, -np.inf], np.nan).dropna()

        print("\n--- Comparative Analysis of Engagement Across Platforms ---")
        print("\nAverage Engagement Rate by Platform:")
        print(engagement_df.groupby('platform')['engagement_rate'].mean().sort_values(ascending=False))

        plt.figure(figsize=(10, 7))
        sns.boxplot(x='platform', y='engagement_rate', data=engagement_df)
        plt.title('Comparison of Engagement Rates Across Platforms')
        plt.xlabel('Platform')
        plt.ylabel('Engagement Rate (%)')
        plt.yscale('log') # Using log scale to handle potential outliers
        plt.show()

        plt.figure(figsize=(12, 8))
        sns.violinplot(x='platform', y='engagement_rate', data=engagement_df)
        plt.title('Distribution of Engagement Rates Across Platforms')
        plt.xlabel('Platform')
        plt.ylabel('Engagement Rate (%)')
        plt.yscale('log') # Using log scale to handle potential outliers
        plt.show()
    else:
        print("Could not calculate engagement rate for any of the platforms due to missing data.")

# Assuming your DataFrames are named insta_df, tiktok_df, youtube_df
compare_engagement_across_platforms(insta_df, tiktok_df, youtube_df)


Data types for Instagram engagement columns:
likes_avg: object
comments_avg: object
views_avg: object

Data types for TikTok engagement columns:
likes_avg: float64
comments_avg: float64
views_avg: float64
shares_avg: float64

Data types for YouTube engagement columns:
avg_likes: float64
avg_comments: float64
avg_views: float64
Could not calculate engagement rate for any of the platforms due to missing data.


In [5]:
def calculate_engagement_rate(row, like_col, comment_col, subscriber_col, view_col=None, share_col=None):
    likes = row[like_col] if like_col in row else 0
    comments = row[comment_col] if comment_col in row else 0
    views = row[view_col] if view_col in row else 0
    shares = row[share_col] if share_col in row else 0
    subscribers = row[subscriber_col] if subscriber_col in row else 1e-9

    total_engagement = likes + comments + views + shares
    return (total_engagement / subscribers) * 100

def compare_engagement_across_platforms(insta_df, tiktok_df, youtube_df,
                                        insta_like_col='likes_avg', insta_comment_col='comments_avg', insta_subscriber_col='subscribers', insta_view_col='views_avg',
                                        tiktok_like_col='likes_avg', tiktok_comment_col='comments_avg', tiktok_subscriber_col='subscribers', tiktok_view_col='views_avg', tiktok_share_col='shares_avg',
                                        yt_like_col='avg_likes', yt_comment_col='avg_comments', yt_subscriber_col='subscribers', yt_view_col='avg_views'):
    engagement_data = []

    # Calculate engagement rate for Instagram
    if insta_df is not None and insta_like_col in insta_df.columns and insta_comment_col in insta_df.columns and insta_subscriber_col in insta_df.columns and 'instagram_name' in insta_df.columns:
        insta_df['engagement_rate'] = insta_df.apply(
            calculate_engagement_rate, axis=1,
            like_col=insta_like_col, comment_col=insta_comment_col, subscriber_col=insta_subscriber_col, view_col=insta_view_col
        )
        engagement_data.extend(insta_df[['instagram_name', insta_subscriber_col, 'engagement_rate']].rename(columns={insta_subscriber_col: 'subscribers'}).assign(platform='Instagram'))

    # Calculate engagement rate for TikTok
    if tiktok_df is not None and tiktok_like_col in tiktok_df.columns and tiktok_comment_col in tiktok_df.columns and tiktok_subscriber_col in tiktok_df.columns and 'tiktoker_name' in tiktok_df.columns:
        tiktok_df['engagement_rate'] = tiktok_df.apply(
            calculate_engagement_rate, axis=1,
            like_col=tiktok_like_col, comment_col=tiktok_comment_col, subscriber_col=tiktok_subscriber_col, view_col=tiktok_view_col, share_col=tiktok_share_col
        )
        engagement_data.extend(tiktok_df[['tiktoker_name', tiktok_subscriber_col, 'engagement_rate']].rename(columns={tiktok_subscriber_col: 'subscribers'}).assign(platform='TikTok'))

    # Calculate engagement rate for YouTube
    if youtube_df is not None and yt_like_col in youtube_df.columns and yt_comment_col in youtube_df.columns and yt_subscriber_col in youtube_df.columns and 'youtuber_name' in youtube_df.columns and yt_view_col in youtube_df.columns:
        youtube_df['engagement_rate'] = youtube_df.apply(
            calculate_engagement_rate, axis=1,
            like_col=yt_like_col, comment_col=yt_comment_col, subscriber_col=yt_subscriber_col, view_col=yt_view_col
        )
        engagement_data.extend(youtube_df[['youtuber_name', yt_subscriber_col, 'engagement_rate']].rename(columns={yt_subscriber_col: 'subscribers'}).assign(platform='YouTube'))
    elif youtube_df is not None and 'likes_avg' in youtube_df.columns and 'comments_avg' in youtube_df.columns and 'subscribers' in youtube_df.columns and 'youtuber_name' in youtube_df.columns and 'views_avg' in youtube_df.columns:
         youtube_df['engagement_rate'] = youtube_df.apply(
            calculate_engagement_rate, axis=1,
            like_col='likes_avg', comment_col='comments_avg', subscriber_col='subscribers', view_col='views_avg'
        )
         engagement_data.extend(youtube_df[['youtuber_name', 'subscribers', 'engagement_rate']].assign(platform='YouTube'))


    if engagement_data:
        engagement_df = pd.DataFrame(engagement_data)
        engagement_df['engagement_rate'] = engagement_df['engagement_rate'].replace([np.inf, -np.inf], np.nan).dropna()

        print("\n--- Comparative Analysis of Engagement Across Platforms ---")
        print("\nAverage Engagement Rate by Platform:")
        print(engagement_df.groupby('platform')['engagement_rate'].mean().sort_values(ascending=False))

        plt.figure(figsize=(10, 7))
        sns.boxplot(x='platform', y='engagement_rate', data=engagement_df)
        plt.title('Comparison of Engagement Rates Across Platforms')
        plt.xlabel('Platform')
        plt.ylabel('Engagement Rate (%)')
        plt.yscale('log')
        plt.show()

        plt.figure(figsize=(12, 8))
        sns.violinplot(x='platform', y='engagement_rate', data=engagement_df)
        plt.title('Distribution of Engagement Rates Across Platforms')
        plt.xlabel('Platform')
        plt.ylabel('Engagement Rate (%)')
        plt.yscale('log')
        plt.show()
    else:
        print("Could not calculate engagement rate for any of the platforms due to missing data.")

# Call the function with the correct column names based on your DataFrames
compare_engagement_across_platforms(
    insta_df=insta_df,
    tiktok_df=tiktok_df,
    youtube_df=youtube_df,
    insta_like_col='likes_avg',  # Adjust if different
    insta_comment_col='comments_avg', # Adjust if different
    insta_subscriber_col='subscribers', # Adjust if different
    insta_view_col='views_avg', # Adjust if different
    tiktok_like_col='likes_avg', # Adjust if different
    tiktok_comment_col='comments_avg', # Adjust if different
    tiktok_subscriber_col='subscribers', # Adjust if different
    tiktok_view_col='views_avg', # Adjust if different
    tiktok_share_col='shares_avg', # Adjust if different
    yt_like_col='avg_likes', # Adjust if different
    yt_comment_col='avg_comments', # Adjust if different
    yt_subscriber_col='subscribers', # Adjust if different
    yt_view_col='avg_views' # Adjust if different
)

ZeroDivisionError: float division by zero