Import the data

In [1]:
import pandas as pd
df = pd.read_csv("content_analysis.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,experiments,total_experiments,total_samples,interface,countries,languages,search_queries,video_id,video_title,...,trust,leader,pro_stance,moral_dilemma,misinformation,human_rights,abortion_rights,war_justification,womens_rights,topic
0,1018,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304639473243933984,A far-right populist who has previously called...,...,0.004566,0.022831,0.009132,0.004566,0.0,0.0,0.0,0.0,0.0,war
1,1019,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304691326711385377,New Dutch PM wants Islam banned #itvnews #neth...,...,0.0,0.014218,0.004739,0.0,0.0,0.0,0.0,0.0,0.0,war
2,1020,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304737421478874400,The anti-Islam populist nicknamed Mozart has j...,...,0.0,0.013274,0.004425,0.0,0.0,0.0,0.0,0.0,0.0,war
3,1021,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304785915040320801,Geerts Wilders’ far-right Freedom Party has wo...,...,0.0,0.002833,0.008499,0.0,0.002833,0.002833,0.0,0.0,0.0,war
4,1022,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7305171428117155104,What do you think of this? Let's treat each ot...,...,0.00813,0.01626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,war


Create the list of political entities according to their ideologies

In [2]:
left = [
    "Die Linke", "IU", "Podemos", "PCE", "PCF", "LFI", "PRC", "SI", "Syriza",
    "BE", "Vänsterpartiet", "Vasemmistoliitto", "AKEL", "PTB/PVDA", "KPÖ",
    "SP", "Enhedslisten", "Rødt", "PST/POP", "PIE", "The Left", "Razem", "EFA",
    "S&D", "Renew Europe"
]

# TODO: Add more right parties to balance it with the left ones
right = [
    "EPP", "ECR", "PiS", "VOX", "ID", "RN", "Lega", "FPÖ", "Fidesz", "Patriots", 
    "ESN", "AfD", "SPD", "Republika", "Reconquête", "NOWA NADZIEJA", "Mi Hazánk" 
]

Classify entries depending on their idology

(Old function)

In [None]:
import numpy as np

def classify_ideology(description):
    if not isinstance(description, str):
        return np.nan
    
    # Convert to lowercase so I can find all the matches
    description = description.lower()
    left_lower = [party.lower() for party in left]
    right_lower = [party.lower() for party in right]

    # Find for left and right parties on the description
    found_left = any(party in description for party in left_lower)
    found_right = any(party in description for party in right_lower)

    if found_left and not found_right:
        return "left"
    elif found_right and not found_left:
        return "right"
    else:
        return np.nan


In [None]:
# df["ideology"] = df["description"].apply(classify_ideology)

(New function that indicates the % of right and left parties mentioned)

In [3]:
import numpy as np

def analyze_ideology(description):
    if not isinstance(description, str): # Return default values there is no description 
        return pd.Series([np.nan, 0.0, 0.0])

    # Comparisons are done in lower case to avoid skipping matches 
    description = description.lower()
    left_lower = [party.lower() for party in left]
    right_lower = [party.lower() for party in right]

    # Amount of parties mentioned from each side
    found_left = [party for party in left_lower if party in description]
    found_right = [party for party in right_lower if party in description]

    total_found = len(found_left) + len(found_right)

    if total_found == 0:
        perc_left = perc_right = 0.0
        ideology = "no mention"
    else:
        perc_left = len(found_left) / total_found
        perc_right = len(found_right) / total_found
        if perc_left > perc_right:
            ideology = "left"
        elif perc_right > perc_left:
            ideology = "right"
        else:
            ideology = "mixed"

    return pd.Series([ideology, perc_left, perc_right])

df[["ideology", "% left", "% right"]] = df["description"].apply(analyze_ideology)


In [4]:
df[["ideology", "% left", "% right"]] = df["description"].apply(analyze_ideology)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,experiments,total_experiments,total_samples,interface,countries,languages,search_queries,video_id,video_title,...,moral_dilemma,misinformation,human_rights,abortion_rights,war_justification,womens_rights,topic,ideology,% left,% right
0,1018,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304639473243933984,A far-right populist who has previously called...,...,0.004566,0.0,0.0,0.0,0.0,0.0,war,left,1.0,0.0
1,1019,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304691326711385377,New Dutch PM wants Islam banned #itvnews #neth...,...,0.0,0.0,0.0,0.0,0.0,0.0,war,no mention,0.0,0.0
2,1020,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304737421478874400,The anti-Islam populist nicknamed Mozart has j...,...,0.0,0.0,0.0,0.0,0.0,0.0,war,no mention,0.0,0.0
3,1021,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304785915040320801,Geerts Wilders’ far-right Freedom Party has wo...,...,0.0,0.002833,0.002833,0.0,0.0,0.0,war,left,1.0,0.0
4,1022,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7305171428117155104,What do you think of this? Let's treat each ot...,...,0.0,0.0,0.0,0.0,0.0,0.0,war,left,0.666667,0.333333


Check the amount of videos from each ideology

In [6]:
# Count occurrences
left_count = (df["ideology"] == "left").sum()
right_count = (df["ideology"] == "right").sum()
nan_count = (df["ideology"] == "no mention").sum()
mixed_count = (df["ideology"] == "mixed").sum()

# Print results
print(f"Left count: {left_count}")
print(f"Right count: {right_count}")
print(f"No parties mentioned count: {nan_count}")
print(f"Mixed count: {mixed_count}") 


Left count: 701
Right count: 190
No parties mentioned count: 286
Mixed count: 327


Count likes and followers

(old but faster, only followers works)

In [None]:
import requests
import re

def get_video_likes_and_creator_followers(video_id: str, creator_id: str):

    # Construct video URL and creator URL
    video_url = f"https://www.tiktok.com/@{creator_id}/video/{video_id}"
    creator_url = f"https://www.tiktok.com/@{creator_id}"
    
    # Set up headers to mimic a real browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0",
        "Referer": "https://www.tiktok.com/",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Upgrade-Insecure-Requests": "1"
    }

    # Fetch video page content
    try:
        video_response = requests.get(video_url, headers=headers)
        video_response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching video page: {e}")
        return None, None

    video_html = video_response.text
    # Debugging
    # print(video_html)
    print(f"Fetched video page HTML for video {video_id}")

    # Extract likes and followers using regex
    try:
        # TODO: Regex for getting the likes is not working as expected --> Buscar número de likes directamente en el inspector
        # TODO: Might be interesting to add views to (even though it might overlap with the order of display)
        likes_match = re.search(r'"likeCount":(\d+)', video_html)
        if likes_match:
            likes = int(likes_match.group(1))
        else:
            print(f"Likes not found for video {video_id}")
            likes = None
        
        # Fetch creator page content to get followers count
        creator_response = requests.get(creator_url, headers=headers)
        creator_response.raise_for_status()

        creator_html = creator_response.text

        # Extract followers using regex
        followers_match = re.search(r'"followerCount":(\d+)', creator_html)
        if followers_match:
            followers = int(followers_match.group(1))
        else:
            print(f"Followers not found for creator {creator_id}")
            followers = None
        
        return likes, followers
    except Exception as e:
        print(f"Error extracting data for video {video_id} and creator {creator_id}: {e}")
        return None, None


def add_likes_and_followers_to_dataframe(df):
    
    # Initialize new columns
    df["likes"] = None
    df["followers"] = None

    # Loop over rows and extract likes and followers
    for index, row in df.iterrows():
        video_id = row["video_id"]
        creator_id = row["creator_id"]
        
        likes, followers = get_video_likes_and_creator_followers(video_id, creator_id)
        
        # Assign the extracted values to the respective columns
        df.at[index, "likes"] = likes
        df.at[index, "followers"] = followers

    return df


Solution with Selenium (works both followers and likes)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import pandas as pd

def parse_number(text):
    text = text.upper().strip()
    if 'K' in text:
        return int(float(text.replace('K', '')) * 1_000)
    elif 'M' in text:
        return int(float(text.replace('M', '')) * 1_000_000)
    elif 'B' in text:
        return int(float(text.replace('B', '')) * 1_000_000_000)
    return int(text.replace(',', ''))

def get_video_likes_and_creator_followers_selenium(driver, video_id: str, creator_id: str):
    video_url = f"https://www.tiktok.com/@{creator_id}/video/{video_id}"
    creator_url = f"https://www.tiktok.com/@{creator_id}"

    likes = None
    followers = None

    try:
        # Charging the video page
        driver.get(video_url)
        time.sleep(5)  # Wait to ensure is all loaded

        # Likes
        like_element = driver.find_element(By.CSS_SELECTOR, '[data-e2e="like-count"]')
        likes = parse_number(like_element.text)
    except Exception as e:
        print(f"Error obtaining likes for {video_id}: {e}")

    try:
        
        driver.get(creator_url)
        time.sleep(5)

        # Followers
        follower_element = driver.find_element(By.CSS_SELECTOR, '[data-e2e="followers-count"]')
        followers = parse_number(follower_element.text)
    except Exception as e:
        print(f"Error obtaining followers for  {creator_id}: {e}")

    return likes, followers

def add_likes_and_followers_to_dataframe(df):
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    
    driver = webdriver.Chrome(options=options)

    df = df.copy()
    df["likes"] = None
    df["followers"] = None

    for index, row in df.iterrows():
        video_id = row["video_id"]
        creator_id = row["creator_id"]
        print(f"Processing @{creator_id} / video {video_id}...")

        likes, followers = get_video_likes_and_creator_followers_selenium(driver, video_id, creator_id)

        df.at[index, "likes"] = likes
        df.at[index, "followers"] = followers

    driver.quit()
    return df


In [None]:
# Small test
example_df = df.head(5)

updated_df = add_likes_and_followers_to_dataframe(example_df)

print(updated_df)


In [76]:
df = add_likes_and_followers_to_dataframe(df)

df.head()


Procesando @vicenews / video 7304639473243933984...
Procesando @itvnews / video 7304691326711385377...
Procesando @bbcnews / video 7304737421478874400...
Procesando @criticalanalysisuk / video 7304785915040320801...
Procesando @tv.dutch / video 7305171428117155104...
Error obteniendo likes para 7305171428117155104: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[data-e2e="like-count"]"}
  (Session info: chrome=135.0.7049.84); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
#0 0x557770cacd0a <unknown>
#1 0x55777075d5f0 <unknown>
#2 0x5577707aea33 <unknown>
#3 0x5577707aec21 <unknown>
#4 0x5577707fd274 <unknown>
#5 0x5577707d468d <unknown>
#6 0x5577707fa660 <unknown>
#7 0x5577707d4433 <unknown>
#8 0x5577707a0ea3 <unknown>
#9 0x5577707a1b01 <unknown>
#10 0x557770c71b5b <unknown>
#11 0x557770c75a41 <unknown>
#12 0x557770c58c52 <unknown>
#13 

Unnamed: 0.1,Unnamed: 0,experiments,total_experiments,total_samples,interface,countries,languages,search_queries,video_id,video_title,...,human_rights,abortion_rights,war_justification,womens_rights,topic,ideology,% left,% right,likes,followers
0,1018,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304639473243933984,A far-right populist who has previously called...,...,0.0,0.0,0.0,0.0,war,left,1.0,0.0,76800.0,3600000
1,1019,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304691326711385377,New Dutch PM wants Islam banned #itvnews #neth...,...,0.0,0.0,0.0,0.0,war,no mention,0.0,0.0,182600.0,4500000
2,1020,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304737421478874400,The anti-Islam populist nicknamed Mozart has j...,...,0.0,0.0,0.0,0.0,war,no mention,0.0,0.0,8283.0,7500000
3,1021,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7304785915040320801,Geerts Wilders’ far-right Freedom Party has wo...,...,0.002833,0.0,0.0,0.0,war,left,1.0,0.0,45700.0,175200
4,1022,{'general'},0,0,tk0,{'nl'},{'en'},{'anti-war candidate european elections 2024 n...,7305171428117155104,What do you think of this? Let's treat each ot...,...,0.0,0.0,0.0,0.0,war,left,0.666667,0.333333,,287


Store the result

In [77]:
df.to_csv("final_dataset_with_likes.csv", index=False)