In [1]:
import pandas as pd
import string
import re
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# ensure NLTK data is available
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/lbitsiko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lbitsiko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lbitsiko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data Processing

In [2]:
def preprocess_ndjson(path: str) -> pd.DataFrame:
    """
    Load an NDJSON file of videos/comments, extract/clean text, 
    tokenize, remove stopwords, lemmatize, and extract hashtags.

    Returns a DataFrame with columns:
        - desc              : original description text
        - isAd              : ad flag (or "unknown")
        - hashtags          : list of hashtags found in desc
        - text_processed    : punctuation-stripped, lowercased text
        - tokens            : list of tokens with stopwords removed
        - words_lemmatized  : list of lemmatized tokens
        - text_lemmatized   : rejoined lemmatized text
    """
    # load
    df = pd.read_json(path, lines=True)

    # extract desc + ad flag
    df['desc'] = df['data'].apply(lambda x: x.get('desc', ''))
    df['isAd'] = df['data'].apply(lambda x: x.get('isAd', 'unknown'))

    # extract hashtags
    df['hashtags'] = df['desc'].apply(lambda txt: re.findall(r'#\w+', txt))

    # lowercase & strip punctuation
    df['text_processed'] = (
        df['desc']
        .str.lower()
        .apply(lambda txt: txt.translate(str.maketrans('', '', string.punctuation)))
    )

    # tokenize
    df['tokens'] = df['text_processed'].str.split()

    # build stopword set (English + Dutch)
    STOP = set(stopwords.words('english')) | set(stopwords.words('dutch'))

    # remove stopwords
    df['tokens'] = df['tokens'].apply(lambda toks: [w for w in toks if w not in STOP])

    # lemmatize
    lemmatizer = WordNetLemmatizer()
    df['words_lemmatized'] = df['tokens'].apply(lambda toks: [lemmatizer.lemmatize(w) for w in toks])

    # join lemmatized tokens back to text
    df['text_lemmatized'] = df['words_lemmatized'].apply(" ".join)

    return df


In [3]:
df_browsing = preprocess_ndjson("../data/archetypes/diabetes/browsing_videos.ndjson")
df_tuning = preprocess_ndjson("../data/archetypes/diabetes/tuning_videos.ndjson")

In [4]:
df_tuning.head(2)

Unnamed: 0,nav_index,item_id,timestamp_collected,source_platform,source_platform_url,source_url,user_agent,data,id,desc,isAd,hashtags,text_processed,tokens,words_lemmatized,text_lemmatized
0,2:46:NaN,7520333299827411968,2025-07-02 07:44:52.678,tiktok.com,https://www.tiktok.com/,https://www.tiktok.com/api/recommend/item_list...,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:139...,"{'AIGCDescription': '', 'CategoryType': 120, '...",24,😡 Ismail el Abassi (DENK): Koranschennis moet ...,False,"[#denk, #politiek, #tweedekamer, #vjp, #ismail...",😡 ismail el abassi denk koranschennis moet nú ...,"[😡, ismail, el, abassi, denk, koranschennis, n...","[😡, ismail, el, abassi, denk, koranschennis, n...",😡 ismail el abassi denk koranschennis nú echt ...
1,2:46:NaN,7518029148623817728,2025-07-02 07:44:52.680,tiktok.com,https://www.tiktok.com/,https://www.tiktok.com/api/recommend/item_list...,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:139...,"{'AIGCDescription': '', 'CategoryType': 101, '...",25,#fyp #fypシ #movie #film,False,"[#fyp, #fypシ, #movie, #film]",fyp fypシ movie film,"[fyp, fypシ, movie, film]","[fyp, fypシ, movie, film]",fyp fypシ movie film


In [5]:
df_browsing.head(2)

Unnamed: 0,nav_index,item_id,timestamp_collected,source_platform,source_platform_url,source_url,user_agent,data,id,desc,isAd,hashtags,text_processed,tokens,words_lemmatized,text_lemmatized
0,2:69:NaN,7514764087192996864,2025-07-02 10:07:46.550,tiktok.com,https://www.tiktok.com/,https://www.tiktok.com/api/recommend/item_list...,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:139...,"{'AIGCDescription': '', 'CategoryType': 101, '...",2015,,False,[],,[],[],
1,2:69:NaN,7519346976580733952,2025-07-02 10:07:46.553,tiktok.com,https://www.tiktok.com/,https://www.tiktok.com/api/recommend/item_list...,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:139...,"{'AIGCDescription': '', 'CategoryType': 120, '...",2016,Spidey makes a trade… will it be an upgrade? #...,False,"[#FitSwap, #SpiderMan, #Marvel]",spidey makes a trade… will it be an upgrade fi...,"[spidey, makes, trade…, upgrade, fitswap, raym...","[spidey, make, trade…, upgrade, fitswap, raymo...",spidey make trade… upgrade fitswap raymond wei...


In [6]:
def get_common_elements(set1, set2, sample_n=10):
    """Return the intersection and a sample of common elements."""
    common = set1.intersection(set2)
    print(f"Number of common elements: {len(common)}")
    print("Sample common elements:", list(common)[:sample_n])
    return common

In [7]:
get_common_elements(
    set([token for token_list in df_browsing['hashtags'].values for token in token_list]), 
    set([token for token_list in df_tuning['hashtags'].values for token in token_list])
)

Number of common elements: 118
Sample common elements: ['#netherlands', '#politiek', '#Friends', '#viraltiktok', '#sad', '#relatable', '#fast', '#netflix', '#foruyou', '#videoviral']


{'#2025',
 '#CapCut',
 '#FYP',
 '#FallonTonight',
 '#Friends',
 '#HarryPotter',
 '#JimmyFallon',
 '#Tatcha',
 '#TonightShow',
 '#ad',
 '#ai',
 '#america',
 '#animation',
 '#animations',
 '#anime',
 '#baby',
 '#blowthisup',
 '#breakingnews',
 '#business',
 '#chatgpt',
 '#comedy',
 '#contentcreator',
 '#corecore',
 '#creatorsearchinsights',
 '#dance',
 '#denk',
 '#edit',
 '#edits',
 '#engage',
 '#europe',
 '#f',
 '#family',
 '#familyguy',
 '#fast',
 '#film',
 '#fnaf',
 '#fortnite',
 '#foruyou',
 '#foryou',
 '#foryoupage',
 '#foryourpage',
 '#foryouu',
 '#france',
 '#fruits',
 '#funny',
 '#fy',
 '#fyp',
 '#fypage',
 '#fypppppppppppppp',
 '#fyppppppppppppppppppppppp',
 '#fypシ',
 '#fyyyyyyyyyyyyyyyy',
 '#god',
 '#googleveo',
 '#goviral',
 '#healthyrecipes',
 '#holland',
 '#humor',
 '#itadoriyuuji',
 '#jamaica',
 '#jamaicatiktok',
 '#jesuslovesyou',
 '#jjk',
 '#jujutsukaisen',
 '#learnontiktok',
 '#life',
 '#love',
 '#loveislandusa',
 '#marketing',
 '#mindset',
 '#minecraft',
 '#momtok',
 '#

# Word Frequencies

In [8]:
def get_word_frequencies(df, word_col='words_lemmatized_nltk'):
    """Return a DataFrame of word frequencies from a column of word lists."""
    all_words = [word for words_list in df[word_col] for word in words_list]
    word_freqs = Counter(all_words)
    word_freqs_df = pd.DataFrame.from_dict(word_freqs, orient='index', columns=['count']).reset_index().rename({'index':'word'}, axis=1)
    word_freqs_df = word_freqs_df.sort_values(by='count', ascending=False)
    return word_freqs_df

In [9]:
df_browsing_hashtags = get_word_frequencies(df_browsing, 'hashtags')
df_tuning_hashtags = get_word_frequencies(df_tuning, 'hashtags')

In [10]:
df_tuning_hashtags.head(10)

Unnamed: 0,word,count
168,#type2diabetes,134
43,#diabetes,114
170,#prediabetes,93
177,#bloodsugar,91
9,#fyp,87
172,#insulinresistance,82
821,#t1d,53
822,#type1diabetes,47
453,#minecraft,39
174,#diabetesawareness,38


In [11]:
df_browsing_hashtags.head(10)

Unnamed: 0,word,count
28,#fyp,388
8,#foryou,102
34,#viral,88
46,#foryoupage,72
4,#fypシ,67
1277,#robloxtower,57
63,#edit,32
287,#funny,30
66,#voorjou,29
40,#trending,28


# Network Analysis

In [12]:
import networkx as nx
from itertools import combinations

def build_hashtag_network_unweighted(df, hashtag_col='hashtags'):
    """
    Build an unweighted co-occurrence network of hashtags from a DataFrame.
    Nodes: hashtags
    Edges: two hashtags appear together in the same datapoint (row)

    Example:

    example_df = pd.DataFrame({
    'hashtags': [
        ['#cat', '#cute', '#pet'],
        ['#dog', '#pet'],
        ['#cat', '#sleep'],
        ['#dog'],
        []
        ]
    })

    # Build the hashtag co-occurrence network
    G_example = build_hashtag_network_unweighted(example_df)

    Nodes: ['#cat', '#cute', '#pet', '#dog', '#sleep']
    Edges: [('#cat', '#cute'), ('#cat', '#pet'), ('#cat', '#sleep'), ('#cute', '#pet'), ('#pet', '#dog')]
    """
    G = nx.Graph()
    for hashtags in df[hashtag_col]:
        if isinstance(hashtags, list) and len(hashtags) > 1:
            for h1, h2 in combinations(hashtags, 2):
                G.add_edge(h1, h2)
        elif isinstance(hashtags, list) and len(hashtags) == 1:
            G.add_node(hashtags[0])
    return G


In [13]:
def build_overall_network(df_tuning, df_browsing, hashtag_col='hashtags'):
    """
    Build an overall co-occurrence network from both tuning and browsing DataFrames.
    """
    G_tuning = build_hashtag_network_unweighted(df_tuning, hashtag_col)
    G_browsing = build_hashtag_network_unweighted(df_browsing, hashtag_col)
    
    # # Combine the two graphs
    # G_overall = nx.compose(G_tuning, G_browsing)

    # Combine nodes from both graphs and assign 'origin' attribute
    nodes_browsing = set(G_browsing.nodes)
    nodes_tuning = set(G_tuning.nodes)

    all_nodes = nodes_browsing | nodes_tuning

    # Build mapping for node origins
    node_origin = {}
    for node in all_nodes:
        if node in nodes_browsing and node in nodes_tuning:
            node_origin[node] = 'both'
        elif node in nodes_browsing:
            node_origin[node] = 'browsing'
        else:
            node_origin[node] = 'tuning'

    # Create combined graph
    G_combined = nx.Graph()
    for node, origin in node_origin.items():
        G_combined.add_node(node, origin=origin)

    # Add edges from both graphs
    G_combined.add_edges_from(G_browsing.edges)
    G_combined.add_edges_from(G_tuning.edges)
    
    return G_combined

In [14]:
# import matplotlib.pyplot as plt

# # Get node colors based on 'origin' attribute
# color_map = {
#     'browsing': 'skyblue',
#     'tuning': 'salmon',
#     'both': 'yellowgreen'
# }
# node_colors = [color_map[G_combined.nodes[n]['origin']] for n in G_combined.nodes]

# plt.figure(figsize=(10, 8))
# nx.draw_networkx(
#     G_combined,
#     with_labels=False,
#     node_color=node_colors,
#     edge_color='gray',
#     node_size=40,
#     alpha=0.8
# )
# plt.title("Combined Hashtag Co-occurrence Network (Node Color by Origin)")
# plt.axis('off')
# plt.show()

In [15]:
G_combined = build_overall_network(df_tuning, df_browsing, hashtag_col='hashtags')

In [16]:
import csv

def export_graph_to_csv(G, nodes_path='nodes.csv', edges_path='edges.csv'):
    """
    Export nodes (with attributes) and edges of a NetworkX graph to CSV files.
    """
    # Export nodes
    with open(nodes_path, 'w', newline='', encoding='utf-8') as f_nodes:
        writer = csv.writer(f_nodes)
        writer.writerow(['id', 'node', 'origin'])
        for node, data in G.nodes(data=True):
            writer.writerow([node, node, data.get('origin', '')])

    # Export edges
    with open(edges_path, 'w', newline='', encoding='utf-8') as f_edges:
        writer = csv.writer(f_edges)
        writer.writerow(['source', 'target'])
        for u, v in G.edges():
            writer.writerow([u, v])

# Example usage:
# export_graph_to_csv(G_combined)

In [18]:
export_graph_to_csv(G_combined, nodes_path='../output/diabetes/nodes.csv', edges_path='../output/diabetes/edges.csv')

# Venn Diagrams

In [20]:
def get_word_frequencies_overall(df_browsing, df_tuning, word_col='hashtags'):
    """Return a DataFrame of word frequencies from a column of word lists."""
    all_words_browsing = [word for words_list in df_browsing[word_col] for word in words_list]
    all_words_tuning = [word for words_list in df_tuning[word_col] for word in words_list]

    all_words = all_words_browsing + all_words_tuning

    word_freqs = Counter(all_words)
    word_freqs_df = pd.DataFrame.from_dict(word_freqs, orient='index', columns=['count']).reset_index().rename({'index':'word'}, axis=1)
    word_freqs_df = word_freqs_df.sort_values(by='count', ascending=False)

    # add information about source: only browsing, only tuning, or both
    word_freqs_df['source'] = word_freqs_df['word'].apply(lambda word: 
        'both' if (word in all_words_browsing and word in all_words_tuning) 
        else 'browsing' if word in all_words_browsing 
        else 'tuning' if word in all_words_tuning
        else 'unknown'
    )

    return word_freqs_df

In [21]:
all_word_freq = get_word_frequencies_overall(df_browsing, df_tuning, word_col='hashtags')
all_word_freq.head(10)

Unnamed: 0,word,count,source
28,#fyp,475,both
2961,#type2diabetes,134,tuning
8,#foryou,117,both
2840,#diabetes,114,tuning
34,#viral,112,both
4,#fypシ,99,both
2963,#prediabetes,93,tuning
46,#foryoupage,93,both
2970,#bloodsugar,91,tuning
2965,#insulinresistance,82,tuning


In [23]:
all_word_freq.to_csv('../output/diabetes/word_frequencies.csv', index=False)