# Social Media PII Disclosure Analyses [Original Notebook]

This notebook serves as an example for users of the social-media-pii-disclosures-analyses dataset associated with the `Privacy vs. Social Capital: Examining Information Disclosure Patterns within Social Media Influencer Networks` and the research paper titled `Unveiling Influencer-Driven PII Disclosures in Social Media Discourse`.

Associated dataset: https://www.kaggle.com/datasets/edyvision/social-media-pii-disclosure-analyses

<em>Note: Only analysis results and summarizatiosn provided. No raw social media data is provided or exposed.</em>

Kaggle Hosted Notebook: [![Kaggle](https://img.shields.io/badge/Kaggle-035a7d?style=for-the-badge&logo=kaggle&logoColor=white)](https://www.kaggle.com/code/edyvision/social-media-pii-disclosure-analyses-reddit)

## Citation Details
When using this work, please cite with the following:

```
Eidan J. Rosado. (2024). Privacy vs. Social Capital: Social Media PII Disclosure Analyses (0.0.1). Zenodo. https://doi.org/10.5281/zenodo.13133302
```

In [1]:
import os
import warnings
import pandas as pd
from typing import List, Optional
from enum import Enum
from scipy.stats import pearsonr, spearmanr, anderson
from statsmodels.tsa.stattools import acf, pacf, adfuller
import statsmodels.api as sm
import plotly.graph_objects as go
from random import randint
from pandas.core.resample import Resampler

import numpy as np

warnings.filterwarnings('ignore')

In [2]:
root_path = '/kaggle/input/social-media-pii-disclosure-analyses'
main_study = "/main_study"

cluster_files = []
analyzed_posts_files = []

def get_analysis_files(directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.__contains__("clusters.csv"):
                cluster_files.append(os.path.join(root, file))
            if file.__contains__("analyzed_posts.csv"):
                analyzed_posts_files.append(os.path.join(root, file))

get_analysis_files(root_path+main_study)

In [3]:
print(f"Found {len(cluster_files)} cluster summary file collections")

# Read and concatenate all CSV files
cluster_dataframes = [pd.read_csv(file_path, index_col=0) for file_path in cluster_files]
clusters_df = pd.concat(cluster_dataframes, ignore_index=True)

# Display the combined DataFrame
clusters_df.head()

Found 11 cluster summary file collections


Unnamed: 0,cluster_name,collection_name,graph_details,hashtags,influencer_count,influencer_tier_frequencies,node_count,pii_detection_count,pii_detection_frequencies,risk_score_max,...,timestamp_range,timestamp_span_sec,top_influence_power_score,top_influencer_tier,graph,pos,node_sizes,node_colors,title,labels
0,top_conversations_2023-09-12_graph-0,top_conversations_2023-09-12,{'graph': {'c56f537b-fd23-49c0-b9f7-237e9ef3d7...,[],1,"{'MICRO_INFLUENCER': 1, 'NON_INFLUENCER': 1407}",1408,308.0,"{'DATE_TIME': 223, 'NRP': 11, 'PERSON': 84, 'L...",3.0,...,"[1694508217.0, 1694520138.0]",11921.0,0.706826,MICRO_INFLUENCER,Graph with 1408 nodes and 1407 edges,{'c56f537b-fd23-49c0-b9f7-237e9ef3d79d': array...,"[312, 312, 312, 312, 312, 312, 312, 312, 312, ...","[[0.0, 0.5019607843137255, 0.0, 1.0], [0.0, 0....",top_conversations_2023-09-12_graph-0,"{'c56f537b-fd23-49c0-b9f7-237e9ef3d79d': '', '..."
1,top_conversations_2023-09-12_graph-1,top_conversations_2023-09-12,{'graph': {'4bfbae86-d3e0-452b-af7b-038b0b7072...,[],8,"{'NANO_INFLUENCER': 8, 'NON_INFLUENCER': 1380}",1388,466.0,"{'PERSON': 164, 'DATE_TIME': 164, 'NRP': 109, ...",3.0,...,"[1694540754.0, 1694556002.0]",15248.0,0.706302,NANO_INFLUENCER,Graph with 1388 nodes and 1387 edges,{'4bfbae86-d3e0-452b-af7b-038b0b707298': array...,"[312, 312, 312, 312, 312, 312, 312, 312, 312, ...","[[0.0, 0.5019607843137255, 0.0, 1.0], [0.0, 0....",top_conversations_2023-09-12_graph-1,"{'4bfbae86-d3e0-452b-af7b-038b0b707298': '', '..."
2,top_conversations_2023-09-12_graph-2,top_conversations_2023-09-12,{'graph': {'458063c5-70b4-4cb8-8e5b-93626a6a58...,[],5,"{'NANO_INFLUENCER': 5, 'NON_INFLUENCER': 946}",951,464.0,"{'LOCATION': 311, 'NRP': 108, 'MEDICAL_LICENSE...",3.0,...,"[1694525829.0, 1694546985.0]",21156.0,0.706563,NANO_INFLUENCER,Graph with 951 nodes and 950 edges,{'458063c5-70b4-4cb8-8e5b-93626a6a58b3': array...,"[312, 312, 312, 312, 312, 312, 312, 312, 312, ...","[[0.0, 0.5019607843137255, 0.0, 1.0], [1.0, 0....",top_conversations_2023-09-12_graph-2,"{'458063c5-70b4-4cb8-8e5b-93626a6a58b3': '', '..."
3,top_conversations_2023-09-12_graph-3,top_conversations_2023-09-12,{'graph': {'3924bab2-5bca-4ae7-b004-da66141451...,[],2,"{'NANO_INFLUENCER': 2, 'NON_INFLUENCER': 698}",700,244.0,"{'PERSON': 170, 'NRP': 46, 'LOCATION': 48, 'DA...",3.0,...,"[1694537856.0, 1694550589.0]",12733.0,0.706438,NANO_INFLUENCER,Graph with 700 nodes and 699 edges,{'3924bab2-5bca-4ae7-b004-da66141451ed': array...,"[312, 312, 312, 312, 625, 312, 312, 312, 312, ...","[[0.0, 0.5019607843137255, 0.0, 1.0], [0.0, 0....",top_conversations_2023-09-12_graph-3,"{'3924bab2-5bca-4ae7-b004-da66141451ed': '', '..."
4,top_conversations_2023-09-12_graph-4,top_conversations_2023-09-12,{'graph': {'9735ad21-143f-4cd1-bd72-953c1a6bab...,[],2,"{'MICRO_INFLUENCER': 2, 'NON_INFLUENCER': 649}",651,209.0,"{'DATE_TIME': 149, 'LOCATION': 28, 'PERSON': 3...",3.0,...,"[1694512708.0, 1694525394.0]",12686.0,0.706794,MICRO_INFLUENCER,Graph with 651 nodes and 650 edges,{'9735ad21-143f-4cd1-bd72-953c1a6bab15': array...,"[312, 312, 312, 312, 312, 312, 312, 312, 312, ...","[[0.0, 0.5019607843137255, 0.0, 1.0], [1.0, 0....",top_conversations_2023-09-12_graph-4,"{'9735ad21-143f-4cd1-bd72-953c1a6bab15': '', '..."


In [4]:
# Change the following settings for your analyses
minimum_node_count = 30
maximum_node_count = 10000
time_season = 15

# Filter out by the cluster size thresholds
viable_clusters_df = clusters_df[
        (clusters_df.node_count >= minimum_node_count)
        & (clusters_df.node_count <= maximum_node_count)
    ]

print(f"Of the {len(clusters_df)} clusters, only {len(viable_clusters_df)} or {round(len(viable_clusters_df)/len(clusters_df), 4)*100}% match the criterion")

Of the 285 clusters, only 183 or 64.21% match the criterion


In [5]:
print(f"Found {len(analyzed_posts_files)} post analysis file collections")

# Read and concatenate all CSV files
analyzed_posts_dataframes = [pd.read_csv(file_path, index_col=0) for file_path in analyzed_posts_files]
analyzed_posts_df = pd.concat(analyzed_posts_dataframes, ignore_index=True)

# Display the combined DataFrame
analyzed_posts_df.head()

Found 11 post analysis file collections


Unnamed: 0,post_uuid,user_uuid,influence_power,user_follower_count,user_influencer_tier,post_crosspost_count,post_quote_count,post_like_count,post_dislike_count,post_reply_count,...,pii_detected,pii_disclosed,post_uuids_replied_to,time_elapsed,collection_name,is_comment,is_text_starter,timestamp,cluster_name,urls
0,c314ad00-50dd-4cd9-8ff4-abc98a229d25,b1848c4a-d76e-4d3c-8da6-576c95a1a2bb,0.706739,6006,NANO_INFLUENCER,5,,9636,0,520,...,False,[],[],0,top_conversations_2023-09-17,False,False,2023-09-17 02:34:18,top_conversations_2023-09-17_graph-8,[]
1,122e46dc-1dbf-4091-8544-b21582aafeca,762e86e5-227b-4442-98d6-99686d8f1564,0.04199,74,NON_INFLUENCER,0,,596,0,3,...,False,[],['c314ad00-50dd-4cd9-8ff4-abc98a229d25'],7921,top_conversations_2023-09-17,True,True,2023-09-17 04:46:19,top_conversations_2023-09-17_graph-8,[]
2,eb46ec75-4d58-49bb-925d-0f2dabf94e40,03095f5e-2a50-4911-b7ee-8aac0ffa5611,0.042138,44,NON_INFLUENCER,0,,449,0,4,...,False,[],['c314ad00-50dd-4cd9-8ff4-abc98a229d25'],2165,top_conversations_2023-09-17,True,True,2023-09-17 03:10:23,top_conversations_2023-09-17_graph-8,[]
3,00466221-ddf3-4cbd-8e57-c3541c4e61bc,08889b62-e6a4-4b6a-9250-da943558d175,0.042749,43,NON_INFLUENCER,0,,1096,0,8,...,False,[],['c314ad00-50dd-4cd9-8ff4-abc98a229d25'],620,top_conversations_2023-09-17,True,True,2023-09-17 02:44:38,top_conversations_2023-09-17_graph-8,[]
4,ef4f4ad9-8d51-4081-a091-40b7a36784cc,5d6dcb3d-9dfa-43a6-8b40-d5ff1f17295e,0.041554,35,NON_INFLUENCER,0,,98,0,0,...,False,[],['c314ad00-50dd-4cd9-8ff4-abc98a229d25'],1475,top_conversations_2023-09-17,True,True,2023-09-17 02:58:53,top_conversations_2023-09-17_graph-8,[]


In [6]:
viable_posts_df = analyzed_posts_df[analyzed_posts_df["cluster_name"].isin(viable_clusters_df["cluster_name"])]
print(f"{len(analyzed_posts_df)} / {len(viable_posts_df)} ({round(len(viable_posts_df)/len(analyzed_posts_df), 4)*100}%) of posts left after filtering non-viable clusters")

121856 / 121628 (99.81%) of posts left after filtering non-viable clusters


## Graph Engagement and Disclosure Timeseries

In [7]:
# Select some collection
collection = list(set(viable_posts_df["collection_name"]))[1]
collection

'top_conversations_2023-09-20'

In [8]:
posts_df = viable_posts_df[viable_posts_df["collection_name"] == collection]
posts_df.head(3)

Unnamed: 0,post_uuid,user_uuid,influence_power,user_follower_count,user_influencer_tier,post_crosspost_count,post_quote_count,post_like_count,post_dislike_count,post_reply_count,...,pii_detected,pii_disclosed,post_uuids_replied_to,time_elapsed,collection_name,is_comment,is_text_starter,timestamp,cluster_name,urls
95906,dbcbff33-6faf-49c2-814b-5cddeaed0c03,5083552f-75f4-4cd6-9fe4-f592cc78a1ad,0.700264,6261,NANO_INFLUENCER,1,,7314,0,154,...,False,[],[],0,top_conversations_2023-09-20,False,False,2023-09-20 04:07:11,top_conversations_2023-09-20_graph-19,[]
95907,f7153851-ab58-48b3-9f0b-8d846eee29f7,795ef217-5e9c-492a-987c-07a46b22f250,0.112746,110,NON_INFLUENCER,0,,224,0,1,...,False,[],['dbcbff33-6faf-49c2-814b-5cddeaed0c03'],1318,top_conversations_2023-09-20,True,True,2023-09-20 04:29:09,top_conversations_2023-09-20_graph-19,[]
95908,e616c196-6e52-4b91-b8d2-5293dd1358a9,cdcdad57-eb3e-4329-815b-4883a6f3480a,0.112932,110,NON_INFLUENCER,0,,121,0,1,...,True,['PERSON'],['dbcbff33-6faf-49c2-814b-5cddeaed0c03'],2286,top_conversations_2023-09-20,True,True,2023-09-20 04:45:17,top_conversations_2023-09-20_graph-19,[]


In [9]:
def random_color():
    """
    Generates light to medium colors
    @return:
    """
    r = randint(100, 220)
    g = randint(100, 220)
    b = randint(100, 220)

    return f"rgb({r}, {g}, {b})"

In [10]:
def resample_data_by_time(
    data,
    column_name: str = "timestamp",
    time_bin_size: int = 1,
    convert_timestamp: bool = False,
) -> Resampler:
    """
    Resample data, grouping by timeslice or time bin
    @param time_bin_size: int - minutes to bin data into
    @param column_name: str - column name of timestamp values
    @param convert_timestamp: bool - conversion flag in case pandas timestamp column requires conversion
    @param data: Dataframe
    @return: resampled data
    """
    resampled_data = data

    if convert_timestamp:
        resampled_data["timestamp"] = pd.to_datetime(data[column_name])

    bin_size = "min" if time_bin_size == 1 else f"{time_bin_size}min"
    return resampled_data.set_index("timestamp").resample(bin_size).count()

In [11]:
def draw_timeseries_graph_for_collection(
    data,
    cluster_names: List[str],
    graph_mode="engagements",
    time_bin_size: int = 15,
):
    """

    @param cluster_names:
    @param data:
    @param graph_mode:
    @param time_bin_size:
    @return:
    """
    graph_type = "Engagements" if graph_mode == "engagements" else "Disclosures"
    fig = go.Figure()

    grouped_groups = []
    for cluster_entry in cluster_names:
        cluster_df = data[(data["cluster_name"] == cluster_entry)]

        if graph_mode == "disclosures":
            cluster_df = cluster_df[cluster_df["pii_detected"] == True]

        df_grouped = resample_data_by_time(
            data=cluster_df, time_bin_size=time_bin_size, convert_timestamp=True
        )
        df_grouped.reset_index(inplace=True)

        grouped_groups.append(df_grouped)

        fig.add_trace(
            go.Scatter(
                x=df_grouped["timestamp"],
                y=df_grouped["post_uuid"],
                mode="lines",
                name=cluster_entry.split("_", 1)[-1],
                hoverinfo="x+y",
                marker=dict(color=random_color()),
            )
        )

    # Adding the average trend line
    average_series = pd.concat(grouped_groups).groupby("timestamp")["post_uuid"].mean()
    fig.add_trace(
        go.Scatter(
            x=average_series.index,
            y=average_series,
            mode="lines",
            name="Average",
            hoverinfo="x+y",
            line=dict(color="black", dash="dash"),
        )
    )

    max_mean_date = average_series.idxmax()
    max_mean_value = average_series.max()

    fig.add_annotation(
        x=max_mean_date,
        y=max_mean_value,
        text=f"Highest Avg {graph_type}",
        bgcolor="white",
        showarrow=True,
        arrowhead=2,
        ax=0,
        ay=-40,
    )

    # Update titles
    fig.update_layout(
        xaxis_title=f"Time ({time_bin_size} minute intervals)",
        yaxis_title=graph_type,
        height=600,
        title=None,
    )

    # Add Legend
    fig.update_traces(showlegend=True)

    fig.show()

In [12]:
cluster_names = list(set(posts_df["cluster_name"]))

draw_timeseries_graph_for_collection(
                        data=posts_df,
                        graph_mode="engagements",
                        cluster_names=cluster_names,
                        time_bin_size=15,
                    )
draw_timeseries_graph_for_collection(
                        data=posts_df,
                        graph_mode="disclosures",
                        cluster_names=cluster_names,
                        time_bin_size=15,
                    )

## Run ARIMA Analysis

In [13]:
def run_arima(
    data,
    column_name,
    d=2,
    alpha=0.05,
    acf_values=None,
    pacf_values=None,
):
    """
    Runs ARIMA test
    @param d: int - the number of times the data had to be differenced to achieve stationarity (See ADFuller Test)
    @param data: DataFrame - social media data collection
    @param column_name: str - column to pull series data from
    @param alpha: float - alpha value (0.01, 0.05, etc.)
    @param acf_values: auto-correlation values from ACF test
    @param pacf_values: partial-correlation values from PACF test
    @return: ARIMA results
    """

    series = data[column_name]

    if pacf_values is None:
        pacf_values = calculate_timeseries_pacf(
            series, n_lags=int(len(series) / 2) - 1, alpha=alpha
        )["partial_correlation_values"]

    if acf_values is None:
        acf_values = calculate_timeseries_acf(series, alpha=alpha)[
            "auto_correlation_results"
        ]

    q = next(
        (
            i
            for i, value in enumerate(acf_values)
            if value < 1.96 / np.sqrt(len(series))
        ),
        None,
    )

    p = next(
        (
            i
            for i, value in enumerate(pacf_values)
            if value < 1.96 / np.sqrt(len(series))
        ),
        None,
    )

    # Fit the ARIMA model
    arima_model = sm.tsa.arima.ARIMA(series, order=(p, d, q))

    return arima_model.fit()

def run_adfuller_stationarity_test(data, column_name, log_transform=False, max_diff=2):
    """
    Runs ADFuller test for stationarity - will difference the data until it reaches stationarity.
    @param max_diff: int - default for max-diffs
    @param column_name: str - column name to pull series data from
    @param log_transform: bool - whether to use log transform on the data to achieve stationarity
    @param data: DataFrame - social media data
    @return: dict - ADFuller stationarity results
    """

    diff_count = 0

    if log_transform:
        data[column_name].diff().dropna().apply(np.log)

    data.dropna(inplace=True)
        
    result = adfuller(data[column_name])

    # If non-stationary, difference the series until stationary
    while result[1] > 0.05 and diff_count < max_diff:
        data[column_name] = data[column_name].diff()
        data.dropna(inplace=True)

        # Check for stationarity on the differenced data
        result = adfuller(data[column_name])

        diff_count += 1

    return {
        "adf_statistic": result[0],
        "p_value": result[1],
        "critical_values": result[4],
        "stationary_data": data,
        "diff_count": diff_count,
    }


def get_arima_coefficient(arima_model_fit):
    """
    Retrieves the ARIMA Coefficient given a model fit
    @param arima_model_fit:
    @return: coefficient for ARIMA model fit
    """
    return arima_model_fit.params["ar.L1"] if "ar.L1" in arima_model_fit.params else 0


def calculate_timeseries_acf(data, n_lags: Optional[int] = None, alpha=0.05):
    """
    Calculates Auto-correlation for timeseries data

    @param alpha: float - alpha value (0.01, 0.05, etc.)
    @param n_lags: int - number of lags (total time intervals)
    @param data: DataFrame - social media data
    @return: dict - ACF results
    """
    results = acf(
        data,
        nlags=n_lags if n_lags else len(data) - 1,
        alpha=alpha,
        fft=True,
        qstat=True,
    )

    return {
        "auto_correlation_values": results[0],
        "confidence_intervals": results[1],
        "ljung_box_statistics": results[2],
        "p_values": results[3],
    }


def calculate_timeseries_pacf(data, n_lags: Optional[int] = None, alpha=0.05):
    """
    Calculates Partial-correlation for timeseries data

    @param alpha: float - alpha value (0.01, 0.05, etc.)
    @param n_lags: int - number of lags (total time intervals)
    @param data: DataFrame - social media data
    @return: dict - PACF results
    """
    results = pacf(
        data,
        nlags=n_lags if n_lags else int(len(data) / 2) - 1,
        alpha=alpha,
    )

    return {
        "partial_correlation_values": results[0],
        "confidence_intervals": results[1],
        "p_values": None,
    }


def determine_arima_trend(acf_values, arima_coefficient, d, factor=0.1):
    """
    Determines whether ARIMA results present a trend (i.e. ascending, descending, no trend)
    @param factor:
    @param arima_coefficient:
    @param acf_values: list of autocorrelation values
    @param d:
    @return: str - trend
    """
    threshold = acf_values.std() * factor

    if d >= 1:
        if arima_coefficient > threshold:
            trend = "Asc"
        elif arima_coefficient < -(threshold):
            trend = "Desc"
        else:
            trend = "None"
    elif acf_values[1] > threshold:
        trend = "Asc"
    elif acf_values[1] < -(threshold):
        trend = "Desc"
    else:
        trend = "No Trend"

    return trend

In [14]:
def run_timeseries_analyses_for_cluster(
    grouped_posts, time_bin_size, alpha: float
) -> dict:
    """
    Runs the timeseries analysis for a collection of posts, a pre-determined time bin size (time intervals to group
    posts by), and an alpha value (0.01, 0.05, etc). Runs ADFuller, ARIMA, Auto-Correlation, Partial-Correlation, rate
    of change, and determines change points.

    @param grouped_posts: grouped posts in time bins
    @param time_bin_size: int - time bin size (in minutes)
    @param alpha: float - alpha value (0.01, 0.05, etc.)
    @return: dict - timeseries results
    """

    grouped_posts["rate_of_change"] = (
        grouped_posts["post_uuid"].diff() / grouped_posts["post_uuid"].shift()
    )
    grouped_posts["rate_of_change_percentage"] = grouped_posts["rate_of_change"] * 100
    grouped_posts["time_numeric"] = (
        grouped_posts["timestamp"] - grouped_posts["timestamp"].min()
    ) / np.timedelta64(time_bin_size, "m")

    # Extract change points
    change_points = determine_changepoints(grouped_posts["time_numeric"])
    decline_points, lags = determine_decline_points(
        grouped_posts["post_uuid"], change_points
    )
    

    adfuller_result = run_adfuller_stationarity_test(
        data=grouped_posts, column_name="post_uuid", log_transform=True
    )
    stationary_series = adfuller_result["stationary_data"]["post_uuid"]
    stationary_data = adfuller_result["stationary_data"]

    # Generate exponential decay data with timepoints
    acf_output = calculate_timeseries_acf(stationary_series, alpha=alpha)
    pacf_output = calculate_timeseries_pacf(
        stationary_series,
        n_lags=int(len(stationary_series) / 2) - 1,
        alpha=alpha,
    )

    # Run ARIMA
    arima_model_fit = run_arima(
        adfuller_result["stationary_data"],
        "post_uuid",
        d=adfuller_result["diff_count"],
        alpha=0.05,
        acf_values=acf_output["auto_correlation_values"],
        pacf_values=pacf_output["partial_correlation_values"],
    )
    
#     print(arima_model_fit.pvalues)

    arima_coefficient = get_arima_coefficient(arima_model_fit)

    return {
        "arima_model_fit": arima_model_fit,
        "arima_coefficient": arima_coefficient,
        "acf_results": acf_output,
        "pacf_results": pacf_output,
        "adfuller_results": adfuller_result,
        "stationary_data": stationary_data,
        "stationary_series": stationary_series,
        "rate_of_change_percentage": grouped_posts["rate_of_change_percentage"],
        "change_points": change_points,
        "decline_points": decline_points,
        "lags": lags,
    }

In [15]:
def determine_changepoints(signal, threshold=1):
    """
    Uses the Cumulative Sum Control Part (CUSUM) Algorithm to determine change points in the series.
    @param signal: series
    @param threshold: int - decision threshold
    @return: List[int] - associated changepoints
    """
    mean_signal = np.mean(signal)
    s_pos = np.zeros_like(signal)
    s_neg = np.zeros_like(signal)
    change_points = []

    for i in range(1, len(signal)):
        s_pos[i] = max(0, s_pos[i - 1] + signal[i] - mean_signal - threshold)
        s_neg[i] = min(0, s_neg[i - 1] + signal[i] - mean_signal + threshold)

        if s_pos[i] > threshold or s_neg[i] < -threshold:
            change_points.append(i)
            s_pos[i] = 0
            s_neg[i] = 0

    return change_points

def determine_decline_points(series, change_points):
    """
    Using a set of change points, this will go through a series and determine which of the change points present as
    "declines"

    @param series: series data
    @param change_points: List[int] - collection of change points determined by CUSUM algorithm
    @return: Tuple[List[int], List[int]] - Tuple of changepoint indices and associated lags
    """
    changepoint_indices = []
    lags = []
    for i in range(len(change_points) - 1):
        if series[change_points[i]] > series[change_points[i + 1]]:
            changepoint_indices.append(i + 1)
            lags.append(change_points[i])
    return changepoint_indices, lags

In [16]:
def build_collection_timeseries_analysis_summary(data, cluster_names, time_season: int = 15, min_threshold: int = 30):
    timeseries_analyses = []


    for cluster_name in cluster_names:
        cluster_posts_df = data[data["cluster_name"] == cluster_name]
        
        if len(cluster_posts_df) < min_threshold:
            continue

        # Group Engagements, filter, and analyze
        df_grouped = resample_data_by_time(
            data=cluster_posts_df, time_bin_size=time_season, convert_timestamp=True
        )
        df_grouped.reset_index(inplace=True)

        timeseries_analysis = run_timeseries_analyses_for_cluster(
            df_grouped, time_bin_size=time_season, alpha=0.05
        )

        # Pull ADFullter (stationarity test) results and Auto-correlation Function results
        adfuller_results = timeseries_analysis["adfuller_results"]
        acf_results = timeseries_analysis["acf_results"]

        # Run ARIMA and determine trend
        arima_model_fit = timeseries_analysis["arima_model_fit"]
        arima_coefficient = get_arima_coefficient(arima_model_fit)
        arima_trend = determine_arima_trend(
            acf_values=acf_results["auto_correlation_values"],
            arima_coefficient=arima_coefficient,
            d=adfuller_results["diff_count"],
        )
        
        summary = {
            "Cluster": cluster_name,
            "Posts": len(cluster_posts_df.index),
            "Lag 1 ACF [P-Val]": f"{round(acf_results['auto_correlation_values'][1], 4)} [{round(acf_results['p_values'][0], 4)}]",
            "Lags": len(df_grouped.index) - 1,
            "Declining Lags": f"{timeseries_analysis['lags']}",
            "ARIMA AR [P-Val]": f"{round(arima_model_fit.aic, 4)} [{round(arima_model_fit.pvalues['ar.L1'], 4)}]",
            "Significant Trend": round(arima_model_fit.pvalues['ar.L1'], 4) < 0.05,
            "Trend": f"{arima_trend}",
        }

        timeseries_analyses.append(summary)
        
    analyses_df = pd.DataFrame.from_dict(timeseries_analyses)
    
    return analyses_df.sort_values(by="Posts", ascending=False)

In [17]:
# Given a time season of 15 minutes and a singular collection name, build the ARIMA timeseries summaries for engagements and disclosures
posts_df = viable_posts_df[viable_posts_df["collection_name"] == collection]
cluster_names = list(set(posts_df["cluster_name"]))

# Retrieve ARIMA analysis summaries for engagements
eng_analysis_df = build_collection_timeseries_analysis_summary(posts_df, cluster_names, time_season, minimum_node_count)

# Filter for disclosures and retrieve ARIMA analysis summaries
disclosed_df = posts_df[posts_df["pii_detected"] == True]
disc_analysis_df = build_collection_timeseries_analysis_summary(disclosed_df, cluster_names, time_season, minimum_node_count)

Each analysis dataframe will have the cluster, associated posts, lags with pvalues, ARIMA AR/MR, and determined trend. Each dataframe is pre-sorted in descending order by number of Posts, presenting the larger conversations first.

In [18]:
eng_analysis_df.head()

Unnamed: 0,Cluster,Posts,Lag 1 ACF [P-Val],Lags,Declining Lags,ARIMA AR [P-Val],Significant Trend,Trend
10,top_conversations_2023-09-20_graph-0,2337,-0.0277 [0.8715],30,"[10, 11, 12, 19, 21, 22, 23, 25, 27, 28, 30, 31]",265.6221 [0.3638],False,Desc
6,top_conversations_2023-09-20_graph-1,1572,-0.49 [0.014],21,"[7, 8, 16, 18, 19, 21, 22, 23]",193.6428 [0.0],True,Desc
2,top_conversations_2023-09-20_graph-2,1384,-0.0613 [0.8294],8,[10],89.515 [0.0],True,Desc
8,top_conversations_2023-09-20_graph-3,862,0.3649 [0.0621],22,"[5, 6, 7, 9, 14, 18, 20, 21]",175.1548 [0.7841],False,Asc
5,top_conversations_2023-09-20_graph-4,629,-0.6262 [0.0011],23,"[6, 10, 17, 20, 22, 23, 25]",194.6717 [0.0],True,Desc


In [19]:
disc_analysis_df.head()

Unnamed: 0,Cluster,Posts,Lag 1 ACF [P-Val],Lags,Declining Lags,ARIMA AR [P-Val],Significant Trend,Trend
4,top_conversations_2023-09-20_graph-1,559,-0.1077 [0.589],21,"[6, 7, 14, 16, 17, 19, 20, 21, 22]",160.0372 [0.3709],False,Desc
1,top_conversations_2023-09-20_graph-2,536,0.1181 [0.7021],6,"[7, 8]",56.508 [0.9796],False,Asc
8,top_conversations_2023-09-20_graph-0,497,0.3972 [0.0223],29,"[2, 5, 8, 9, 10, 11, 18, 20, 21, 25, 26, 27]",202.0288 [0.8586],False,Asc
6,top_conversations_2023-09-20_graph-3,459,0.1578 [0.4199],22,"[5, 6, 9, 14, 15, 20, 21]",164.0947 [0.236],False,Asc
3,top_conversations_2023-09-20_graph-4,284,-0.7838 [0.0],23,"[1, 3, 6, 8, 10, 17, 19, 22, 23, 25]",176.2609 [0.0],True,Desc


In [20]:
# Clusters with Significant Descending Trends
eng_trend_significant_rows = eng_analysis_df[(eng_analysis_df['Significant Trend']) & (eng_analysis_df['Trend'] == 'Desc')]
print(f"Found {len(eng_trend_significant_rows['Cluster'].tolist())} clusters showing significant declining trends for engagements")

disc_trend_significant_rows = disc_analysis_df[(disc_analysis_df['Significant Trend']) & (disc_analysis_df['Trend'] == 'Desc')]
print(f"Found {len(disc_trend_significant_rows['Cluster'].tolist())} clusters showing significant declining trends for disclosures")

Found 8 clusters showing significant declining trends for engagements
Found 7 clusters showing significant declining trends for disclosures


Now, we will want to obtain the summaries for every day (every collection)

In [21]:
# Get all the collection names and sort them from the first to last day of the collection
def extract_date(collection_name):
    date_str = collection_name.split('_')[-1]
    return pd.to_datetime(date_str)

collections = list(set(viable_clusters_df["collection_name"]))
collections_sorted = sorted(collections, key=extract_date)
collections_sorted

['top_conversations_2023-09-12',
 'top_conversations_2023-09-13',
 'top_conversations_2023-09-14',
 'top_conversations_2023-09-15',
 'top_conversations_2023-09-16',
 'top_conversations_2023-09-17',
 'top_conversations_2023-09-18',
 'top_conversations_2023-09-19',
 'top_conversations_2023-09-20',
 'top_conversations_2023-09-21',
 'top_conversations_2023-09-22']

In [22]:
# For every collection entry, extract the analyses for engagements and disclosures, identifying the statistically significant along the way
dataset_trend_summaries = []
time_season = 10 # the timeseason needed to be adjusted for the following due to one of the collections' sample size
for collection in collections_sorted:
    print("#", end =" ")
    
    # Retrieve the viable clusters in collection
    filtered_collection = viable_posts_df[viable_posts_df["collection_name"]==collection]
    
    # Get current collection's cluster names
    cluster_names = list(set(filtered_collection["cluster_name"]))

    # Retrieve ARIMA analysis summaries for engagements
    eng_analysis_df = build_collection_timeseries_analysis_summary(filtered_collection, cluster_names, time_season, minimum_node_count)

    # Filter for disclosures and retrieve ARIMA analysis summaries
    disclosed_df = filtered_collection[filtered_collection["pii_detected"] == True]
    disc_analysis_df = build_collection_timeseries_analysis_summary(disclosed_df, cluster_names, time_season, minimum_node_count)
    
    # Extract significant descending trends
    eng_trend_significant_rows = eng_analysis_df[(eng_analysis_df['Significant Trend']) & (eng_analysis_df['Trend'] == 'Desc')]
    disc_trend_significant_rows = disc_analysis_df[(disc_analysis_df['Significant Trend']) & (disc_analysis_df['Trend'] == 'Desc')]
    
    eng_declining_trend_rows = eng_analysis_df[(eng_analysis_df['Trend'] == 'Desc')]
    disc_declining_trend_rows = disc_analysis_df[(disc_analysis_df['Trend'] == 'Desc')]
    
    # Append Summary   
    dataset_trend_summaries.append({
        "date": extract_date(collection),
        "total_clusters": len(cluster_names),
        "eng_declining_clusters": len(eng_declining_trend_rows),
        "disc_declining_clusters": len(disc_declining_trend_rows),
        "eng_sig_declining_clusters": len(eng_trend_significant_rows),
        "disc_sig_declining_clusters": len(disc_trend_significant_rows)
    })

print("Dataset Trend Summary Compilation Ready!")
dataset_trend_df = pd.DataFrame.from_dict(dataset_trend_summaries)
    

# # # # # # # # # # # Dataset Trend Summary Compilation Ready!


In [23]:
dataset_trend_df

Unnamed: 0,date,total_clusters,eng_declining_clusters,disc_declining_clusters,eng_sig_declining_clusters,disc_sig_declining_clusters
0,2023-09-12,11,10,6,4,5
1,2023-09-13,20,17,12,8,9
2,2023-09-14,21,17,16,8,10
3,2023-09-15,15,11,12,6,6
4,2023-09-16,13,11,10,6,5
5,2023-09-17,16,14,9,8,6
6,2023-09-18,18,11,12,9,7
7,2023-09-19,10,6,7,3,1
8,2023-09-20,22,19,13,7,6
9,2023-09-21,9,7,5,3,2


In [24]:
# Adjust the data for visualization
data = {
    "dates": dataset_trend_df['date'].dt.strftime('%-m/%-d').tolist(),

"total_clusters" : dataset_trend_df["total_clusters"].tolist(),
"eng_declining_clusters": dataset_trend_df["eng_declining_clusters"].tolist(),
"disc_declining_clusters": dataset_trend_df["disc_declining_clusters"].tolist(),
"clusters_with_significant_eng_declines": dataset_trend_df["eng_sig_declining_clusters"].tolist(),
"clusters_with_significant_disc_declines": dataset_trend_df["disc_sig_declining_clusters"].tolist(),
}

In [25]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=data['dates'], 
    y=data['total_clusters'], 
    mode='lines', 
    name='Total Clusters', 
    stackgroup='one'
))
fig.add_trace(go.Scatter(
    x=data['dates'], 
    y=data['eng_declining_clusters'], 
    mode='lines', 
    name='Declining Clusters', 
    stackgroup='two'
))

fig.add_trace(go.Scatter(
    x=data['dates'], 
    y=data['clusters_with_significant_eng_declines'], 
    mode='lines', 
    name='Significant Declines', 
    stackgroup='three'
))


fig.update_layout(
    xaxis_title='Collection Dates',
    yaxis_title='Clusters and Engagements Decline',
    legend_title='Categories'
)

fig.show()

In [26]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=data['dates'], 
    y=data['total_clusters'], 
    mode='lines', 
    name='Total Clusters', 
    stackgroup='one'
))
fig.add_trace(go.Scatter(
    x=data['dates'], 
    y=data['disc_declining_clusters'], 
    mode='lines', 
    name='Declining Clusters', 
    stackgroup='two'
))

fig.add_trace(go.Scatter(
    x=data['dates'], 
    y=data['clusters_with_significant_disc_declines'], 
    mode='lines', 
    name='Significant Declines', 
    stackgroup='three'
))

fig.update_layout(
    xaxis_title='Collection Dates',
    yaxis_title='Clusters and Disclosures Decline',
    legend_title='Categories'
)

fig.show()

From here, we determine the final percentages of total dataset that were declines and how many were statistically significant

In [27]:
total_clusters = sum(data["total_clusters"])
total_declining_eng_trend = sum(data["eng_declining_clusters"])
total_percentage_declining_eng_trend = round(sum(data["eng_declining_clusters"])/sum(data["total_clusters"]), 4)*100
total_percentage_sig_declining_eng_trend = round(sum(data["clusters_with_significant_eng_declines"])/sum(data["eng_declining_clusters"]), 4)*100

total_declining_disc_trend = sum(data["disc_declining_clusters"])
total_percentage_declining_disc_trend = round(sum(data["disc_declining_clusters"])/sum(data["total_clusters"]), 4)*100
total_percentage_sig_declining_disc_trend = round(sum(data["clusters_with_significant_disc_declines"])/sum(data["disc_declining_clusters"]), 4)*100

print(f"\nAcross {len(collections_sorted)} collections, there were {total_clusters} clusters with the proper threshold.\n")
print(f"Of these clusters, {total_declining_eng_trend} or {total_percentage_declining_eng_trend}% had a declining trend of which only {total_percentage_sig_declining_eng_trend}% had statistical significance.", end=" ")
print(f"When filtering by clusters with disclosures, {total_declining_disc_trend} or {total_percentage_declining_disc_trend}% had a declining trend of which only {total_percentage_sig_declining_disc_trend}% had statistical significance.")


Across 11 collections, there were 183 clusters with the proper threshold.

Of these clusters, 146 or 79.78% had a declining trend of which only 50.0% had statistical significance. When filtering by clusters with disclosures, 121 or 66.12% had a declining trend of which only 56.2% had statistical significance.


## Run Correlation Analysis

In [28]:
# Update the following as needed to account for varying node thresholds in correlation analysis runs
minimum_node_count = 30
maximum_node_count = 10000

In [29]:
class InfluencerTierHierarchy(Enum):
    """Influencer Size Tier enum, based on the number of followers"""

    NON_INFLUENCER: int = 0
    NANO_INFLUENCER: int = 1
    MICRO_INFLUENCER: int = 2
    MID_TIER_INFLUENCER: int = 3
    MACRO_INFLUENCER: int = 4
    MEGA_INFLUENCER: int = 5

In [30]:
# Rename columns for analysis
all_clusters = viable_clusters_df.rename(
        columns={
            "pii_detection_count": "disclosures",
            "node_count": "engagements",
            "top_influencer_tier": "influencer_tier",
            "top_influence_power_score": "influence_power",
            "timestamp_span_sec": "time_elapsed",
        }
    )

# Apply encoding for influencer tier
all_clusters["influencer_tier_encoded"] = all_clusters.apply(
    lambda x: InfluencerTierHierarchy[x["influencer_tier"]].value, axis=1
)

In [31]:
def generate_correlation_matrix(
    correlation_matrix, dependent_data, independent_data, title: str
):
    """

    @param title:
    @param correlation_matrix:
    @param dependent_data:
    @param independent_data:
    @return:
    """

    independent_columns = [
        column_name.upper() for column_name in list(independent_data.columns)
    ]
    dependent_columns = [
        column_name.upper() for column_name in list(dependent_data.columns)
    ]

    fig = go.Figure(
        data=go.Heatmap(
            z=correlation_matrix,
            x=dependent_columns,
            y=independent_columns,
            colorscale="viridis",
            zmin=-1,  # since correlations are between -1 and 1
            zmax=1,
        )
    )

    # Add annotations (ρ values)
    for i, indep_name in enumerate(independent_columns):
        for j, dep_name in enumerate(dependent_columns):
            rho = correlation_matrix[i][j]
            fig.add_annotation(
                go.layout.Annotation(
                    text=f"{rho:.4f}",
                    x=dep_name,
                    y=indep_name,
                    xref="x",
                    yref="y",
                    showarrow=False,
                    font=dict(
                        size=12,
                        color="black"
                        if abs(rho) < 0.5
                        else "white",  # Adjust color for visibility
                    ),
                )
            )

    fig.update_layout(title=f"{title}")

    fig.show()

In [32]:
def build_correlation_matrix(independent_data: dict, dependent_data: dict):
    """
    Builds correlation matrix given independent and dependent data. If the data is normally distributed, Pearson's
    Correlation will be used, otherwise Spearman's is used.
    @param independent_data: dict - independent variable data
    @param dependent_data: dict - dependent variable data
    @return: Tuple[List[dict], List[dict], str] - Tuple of correlation matrix, correlation data, and the test name
    """
    if run_normal_anderson(dependent_data["disclosures"]).get(
        "is_normally_distributed"
    ) and run_normal_anderson(dependent_data["engagements"]).get(
        "is_normally_distributed"
    ):
        correlation_test_name = "Pearson's Correlation"
        correlation_test = pearsonr
    else:
        correlation_test_name = "Spearman's Rank Correlation"
        correlation_test = spearmanr

    correlation_matrix = []
    correlation_data = []
    for indep_name, indep_data in independent_data.items():
        row = []
        for dep_name, dep_data in dependent_data.items():
            rho, p_val = correlation_test(indep_data, dep_data)
            row.append(rho)
            correlation_data.append(
                {
                    "Variables": f"{indep_name.upper()} + {dep_name.upper()}",
                    "Coefficient": round(rho, 4),
                    "P-Value": round(p_val, 10),
                    "Test": correlation_test_name,
                }
            )
        correlation_matrix.append(row)

    return correlation_matrix, correlation_data, correlation_test_name

In [33]:
def run_normal_anderson(data) -> dict:
    """
    Determine if data is normally distributed using Anderson-Darling test (more versatile)
    @param data: posts
    @return: dict - Anderson-Darling test results
    """
    result = anderson(data)
    if result.statistic > result.critical_values[2]:
        is_normally_distributed = False
    else:
        is_normally_distributed = True

    return {
        "is_normally_distributed": is_normally_distributed,
        "statistic": result.statistic,
        "critical_values": result.critical_values[2],
        "test": "Anderson-Darling Test",
    }

In [34]:
dependent_data = all_clusters[["engagements", "disclosures"]]
independent_data = all_clusters[
        ["influencer_tier_encoded", "influence_power", "time_elapsed"]
    ]

correlation_matrix, correlation_data, correlation_test = build_correlation_matrix(
        independent_data, dependent_data
    )

correlations_df = pd.DataFrame.from_dict(correlation_data)

generate_correlation_matrix(
    correlation_matrix,
    dependent_data,
    independent_data,
    f"{correlation_test} Results",
)

<em>A note regarding the correlation: The minimum and maximum node thresholds will impact results as discussed in the paper. It is recommended to mess with the minimum/maximum node threshold value for the correlation analysis but beware setting the node threshold too low, it will impact timeseries analysis by including short-lived conversations as the sample will become too small. Same for the time_season value.</em>