# Imports

In [5]:
import pandas as pd
import json
import math
import re
from collections import defaultdict
from pathlib import Path
from typing import Callable  
import requests
import os
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from textblob import TextBlob
from empath import Empath
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

# Data screening

Upload the general and topic questions dataset

In [6]:
# Load datasets to be used
df_general = pd.read_csv("../../data/alexlab-storage-eu24-tk0__2024-09-16T08_59_07.939920__general.csv")
df_topic = pd.read_csv("../../data/alexlab-storage-eu24-tk0__2024-09-16T09_02_22.143201__topic.csv")
df_party = pd.read_csv("../../data/alexlab-storage-eu24-tk0__2024-09-16T09_01_35.858836__party.csv")

Extract data to be used

In [7]:
# Keywords to identify the desired entries
# keywords = ["Putin", "war", "abortion", "anti-war", "anti-abortion"]

# df_combined = pd.concat([df_general, df_topic], ignore_index=True)
# df = df_combined[df_combined["search_queries"].str.contains('|'.join(keywords), case=False, na=False)]

df = df_party
# Drop entries with any null column
df = df.dropna()

# Keep only the english content
df = df[df["languages"] == "{'en'}"]

Determine the topic of each content (None, war-related or abortion-related)

In [8]:
# Keywords for war and abortion topics
war_keywords = ["Putin", "war", "anti-war"]
abortion_keywords = ["abortion", "anti-abortion"]

# Function to determine the topic based on search queries
def determine_topic(search_queries):
    if pd.isna(search_queries):
        return None
    search_queries = str(search_queries).lower()  # Convert to lowercase to handle case-insensitivity
    if any(keyword.lower() in search_queries for keyword in war_keywords):
        return "war"
    elif any(keyword.lower() in search_queries for keyword in abortion_keywords):
        return "abortion"
    return None

# Transcription

Adaptation of the given script

In [9]:
def group_by(lst: list, key_extractor: Callable):
    d = defaultdict(list)
    for item in lst:
        d[key_extractor(item)].append(item)
    return d

# Script adapted from the one given by AIForesincs
def get_transcripts_for_tiktok_video(video_id: str, transcripts_dir: Path):
    video_url = f"https://www.tiktok.com/@unknown/video/{video_id}"
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0"
    headers = {
        "User-Agent": user_agent,
        "Referer": "https://www.tiktok.com/",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Upgrade-Insecure-Requests": "1"
    }

    print(f"Fetching video URL: {video_url}")
    try:
        response = requests.get(video_url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching video URL: {e}")
        return None

    html_content = response.text

    # Extracting the JSON object from the HTML file
    json_match = re.search(r'(?<="__DEFAULT_SCOPE__":)[^<]*', html_content)
    if not json_match:
        print("JSON data not found in the HTML content.")
        return None

    json_data = json.loads(json_match.group(0).strip()[:-1])  # manually removing last character
    
    # Validate JSON structure before accessing keys
    if ("webapp.video-detail" not in json_data or
        "itemInfo" not in json_data["webapp.video-detail"] or
        "itemStruct" not in json_data["webapp.video-detail"]["itemInfo"] or
        "video" not in json_data["webapp.video-detail"]["itemInfo"]["itemStruct"] or
        "subtitleInfos" not in json_data["webapp.video-detail"]["itemInfo"]["itemStruct"]["video"]):
        
        print(f"Unexpected JSON structure for video {video_id}. Skipping...")
        return None
    
    transcripts_infos = json_data["webapp.video-detail"]["itemInfo"]["itemStruct"]["video"]["subtitleInfos"]

    language_code_priority = [
        "eng-US",
        "fra-FR",
        "deu-DE",
        "spa-ES",
    ]
    subtitle_infos_by_format = group_by(transcripts_infos, lambda info: info["Format"])
    
    captions = ""
    
    for subtitle_format, infos_list in subtitle_infos_by_format.items():
        sorted_transcripts_infos_list = sorted(transcripts_infos,
                                               key=lambda info: language_code_priority.index(
                                                   info["LanguageCodeName"]) if
                                               info["LanguageCodeName"] in language_code_priority else math.inf)
        transcripts_info = sorted_transcripts_infos_list[0]
        url = transcripts_info["Url"]
        language = transcripts_info["LanguageCodeName"]
        source = transcripts_info["Source"]

        suffix = "vtt" if subtitle_format == "webvtt" else "json" if subtitle_format == "creator_caption" else None

        filename = f"{video_id}_{subtitle_format}_{language}_{source}"
        if suffix:
            filename += f".{suffix}"
        try:
            file_response = requests.get(url, headers=headers)
            file_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Failed to download transcripts for video {video_id}, language {language}: {e}")
            continue

        # Storing content of the vtt file directly in the captions
        captions = file_response.text

        # Save file to disk
        video_dir = transcripts_dir / str(video_id)

        video_dir.mkdir(exist_ok=True)
        with open(video_dir / filename, "wb+") as f:
            f.write(file_response.content)
            print(f"Saved file: {video_dir / filename}")
    
    return captions

def add_captions_to_folder(df):
        
    for video_id in df['video_id']:
        get_transcripts_for_tiktok_video(video_id, Path('./transcripts'))

    return df

In [10]:
df = add_captions_to_folder(df)

Fetching video URL: https://www.tiktok.com/@unknown/video/6954332991380262150
Fetching video URL: https://www.tiktok.com/@unknown/video/7028171334630837530
Fetching video URL: https://www.tiktok.com/@unknown/video/7065019012500983046
Fetching video URL: https://www.tiktok.com/@unknown/video/7099633825587744046
Saved file: transcripts/7099633825587744046/7099633825587744046_webvtt_eng-US_ASR.vtt
Fetching video URL: https://www.tiktok.com/@unknown/video/7130902863529741610
Fetching video URL: https://www.tiktok.com/@unknown/video/7151404990010100997
Saved file: transcripts/7151404990010100997/7151404990010100997_webvtt_eng-US_MT.vtt
Fetching video URL: https://www.tiktok.com/@unknown/video/7297748671377018154
Saved file: transcripts/7297748671377018154/7297748671377018154_webvtt_eng-US_ASR.vtt
Fetching video URL: https://www.tiktok.com/@unknown/video/7303618548369198369
Saved file: transcripts/7303618548369198369/7303618548369198369_webvtt_eng-US_ASR.vtt
Fetching video URL: https://www.t

KeyboardInterrupt: 

Addition of the "captions" column

In [11]:
def add_captions_to_df_vtt(df):
    captions = []
    
    for video_id in df['video_id']:
        # Find all VTT files for the given video_id. Since the video can end in different codes it is needed to end the direction with an *
        vtt_files = list(Path(f'./transcripts/{video_id}').glob(f'{video_id}_webvtt_*.vtt'))
        
        if not vtt_files:
            print(f"Warning: No VTT files found for video {video_id}.")
            captions.append("")  # Append empty caption if no files are found
            continue
        
        # Use the first VTT file found (shouldn't be needed since every video just generates one caption)
        vtt_file = vtt_files[0]
        
        # Check if the file is empty
        if os.path.getsize(vtt_file) == 0:
            print(f"Warning: VTT file for video {video_id} is empty.")
            captions.append("")  # Append empty caption if the file is empty
            continue
        
        try:
            with open(vtt_file, 'r', encoding='utf-8') as file:
                vtt_lines = file.readlines()
                
                # Remove lines containing timestamps and 'WEBVTT' or if it contains a timestamp
                caption_lines = [line.strip() for line in vtt_lines if '-->' not in line and line.strip() != 'WEBVTT']
                
                # Join the caption lines into one string
                captions_text = ' '.join(caption_lines)
                captions.append(captions_text)
        
        except Exception as e:
            print(f"Unexpected error for video {video_id}: {e}")
            captions.append("")  # Append an empty caption for any other errors

    df['captions'] = captions
    return df


In [12]:
df = add_captions_to_df_vtt(df)

# Clean the results without captions
df = df[df["captions"] != ""]



In [15]:
df.languages.unique()

array(["{'en'}"], dtype=object)

# VADER: Sentiment Analysis

Download the VADER lexicon (if not installed)

In [16]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/adri/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

Create the VADER sentiment analyzer

In [17]:
sia = SentimentIntensityAnalyzer()

Add the binary and ternary classification sentiment score and the valence scoring

In [18]:
def classify_sentiment(row):
    scores = {'pos': row['pos'], 'neg': row['neg'], 'neu': row['neu']}
    dominant = max(scores, key=scores.get)
    return 1 if dominant == 'pos' else -1 if dominant == 'neg' else 0

In [19]:
# Get full VADER score dictionary
df['scores'] = df['captions'].astype(str).apply(lambda text: sia.polarity_scores(text))

# Extract individual scores
df['sentiment_score_compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['pos'] = df['scores'].apply(lambda score_dict: score_dict['pos'])
df['neg'] = df['scores'].apply(lambda score_dict: score_dict['neg'])
df['neu'] = df['scores'].apply(lambda score_dict: score_dict['neu'])

# Asign binary sentiment score: 1 positive and -1 negative, no neutral values taked into account
df['sentiment_score_binary'] = df.apply(lambda row: 1 if row['pos'] > row['neg'] else -1, axis=1)

# Assign ternary sentiment score: 1 for positive, -1 for negative, 0 for neutral
df['sentiment_score_ternary'] = df.apply(classify_sentiment, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['scores'] = df['captions'].astype(str).apply(lambda text: sia.polarity_scores(text))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_score_compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pos'] = df['scores

# Subjectivity Score

Define the function that returns the subectivity score from textblob library

In [20]:
def get_subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

In [21]:
df['subjectivity_score'] = df['captions'].astype(str).apply(get_subjectivity)

# Content analysis

Initialize the Empath lexicon

In [22]:
empath_lexicon = Empath()

Definition of the custom topic categories I want to compute the appearence score of

In [23]:
def create_custom_categories():
    categories = {
        "pro_stance" :  ["support", "approval", "agreement", "endorsement", "favor", "positive_opinion", "advocate", "in_favor", "backing"], # Adding my own words per category
        "moral_dilemma": ["ethics", "morality", "controversy", "decision_making", "right_vs_wrong"],
        "misinformation": ["fake_news", "disinformation", "propaganda", "conspiracy", "false_claims"],
        "human_rights": ["freedom", "equality", "discrimination", "justice", "civil_rights"],
        "abortion_rights": ["pro_choice", "pro_life", "reproductive_rights", "bodily_autonomy", "abortion_laws"],
        "war_justification": ["military_intervention", "self_defense", "war_crimes", "peace_treaty", "conflict_resolution"],
        "womens_rights": ["gender_equality", "feminism", "reproductive_rights", "pay_gap", "domestic_violence"],
        "disagreement": ["oppose", "disagree", "rebuttal", "contradict", "objection", "refute", "challenge", "criticism", "conflict", "debate", "hate"]
    }

    model = "nytimes"  # Using "nytimes" for a more policy-related vocabulary

    for category, keywords in categories.items():
        empath_lexicon.create_category(category, keywords, model=model)

In [24]:
create_custom_categories()

["support", "endorsement", "backing", "strong_support", "opposition", "broad_support", "wide_support", "approval", "political_support", "strong_backing", "broad_coalition", "initiative", "full_support", "strong_opposition", "major_concessions", "endorsing", "organized_labor", "opposing", "bipartisan_support", "trade_pact", "lobbying_efforts", "proposal", "political_cover", "business_groups", "overwhelming_support", "labor_unions", "new_proposal", "other_parties", "significant_support", "continued_support", "little_support", "approving", "widespread_support", "enough_support", "Republican_support", "supporting", "more_support", "backers", "Congressional_support", "civil_rights_groups", "new_majority", "public_support", "pact", "coalition", "backing", "major_victory", "ratification", "Congressional_approval", "fierce_opposition", "endorsed", "compromise_proposal", "political_muscle", "assent", "constitutional_changes", "significant_concessions", "qualified_support", "compromise", "trade_

List of all the expeted categories to be added, including the ones from Empath and the ones I crated

In [25]:
categories = [
    "pain", "movement", "negative_emotion", "religion", "violence", "government", 
    "independence", "fear", "trust", "leader", "pro_stance", "moral_dilemma", 
    "misinformation", "human_rights", "abortion_rights", "war_justification", "womens_rights",
    "disagreement"
]

Execute the content analysis for the whole dataset

In [26]:
def analyze_text(text):
    if pd.isna(text):  # Handle null values
        return {category: 0.0 for category in categories}
    return empath_lexicon.analyze(text, categories=categories, normalize=True)

In [27]:
df = df.join(df['captions'].apply(analyze_text).apply(pd.Series))


# Political Parties Analysis

Create al list of the political entities according to their ideologies

In [29]:
# Some abbrevatrions (such as PS for Belgium and France) might overlap
# While there might be parties from all european countries, I have foccused on the countries included on the experiment

left = [
    "Die Linke", "IU", "Podemos", "PCE", "PCF", "LFI", "PRC", "SI", "Syriza",
    "BE", "Vänsterpartiet", "Vasemmistoliitto", "AKEL", "PTB", "KPÖ",
    "SP", "Enhedslisten", "Rødt", "PST/POP", "PIE", "The Left", "Razem", "EFA",
    "S&D", "Renew Europe", "PSOE", "Sumar", "PES", "PS", "APSD", "SD", "SAP",
    "Labour", "SPÖ", "Vooruit", "SPD", "NL", "PvdA", "Socialist Party",
    "Democratic Party", "Labour Party", "PASOK", "SLD", "Nouvelle Donne", "PRG",
    "Inicjatywa Polska", "Grüne", "Greens"
]

right = [
    "EPP", "ECR", "PiS", "VOX", "ID", "RN", "Lega", "FPÖ", "Fidesz", "Patriots", 
    "ESN", "AfD", "Republika", "Reconquête", "NOWA NADZIEJA", "Mi Hazánk",
    "PP", "Partido Popular", "CDU", "Agir", "MoDem", "Ensemble", "LFA", "RE",
    "LR", "CDA", "NSC", "IDP", "CSU", "FDP", "FW", "Junts", "ZP", "NPD", "PVV",
    "FvD", "European People's Party", "Progressive Alliance of Socialists & Democrats", "PO",
    "PSL", "BBB", "CDA", "NSC", "Familie", "ÖDP", "UDR"]

# It is possible that it is needed to change the abbr. of the parties with their full name so they are detected

Socialist members are extracted from: https://en.wikipedia.org/wiki/Party_of_European_Socialists

Center-left member are extracted from: https://en.wikipedia.org/wiki/Category:Centre-right_parties_in_Europe

Indicate the % of right and left parties mentioned, and lists all of the political parties mentioned

In [30]:
def analyze_ideology(description):
    if not isinstance(description, str):  # Return default values if no description
        return pd.Series([np.nan, 0.0, 0.0, []])

    description_lower = description.lower()
    left_lower = [party.lower() for party in left]
    right_lower = [party.lower() for party in right]

    found_left = [left[i] for i, party in enumerate(left_lower) if party in description_lower]
    found_right = [right[i] for i, party in enumerate(right_lower) if party in description_lower]

    parties_mentioned = found_left + found_right
    total_found = len(parties_mentioned)

    if total_found == 0:
        perc_left = perc_right = 0.0
        ideology = "no mention"
    else:
        perc_left = len(found_left) / total_found
        perc_right = len(found_right) / total_found
        if perc_left > perc_right:
            ideology = "left"
        elif perc_right > perc_left:
            ideology = "right"
        else:
            ideology = "mixed"

    return pd.Series([ideology, perc_left, perc_right, parties_mentioned])

In [31]:
df[["ideology", "% left", "% right", "parties_mentioned"]] = df["description"].apply(analyze_ideology)

# Engagement Metrics: Likes and followers

Function to translate letters on numbers to only numbers (1K --> 1000)

In [32]:
def parse_number(text):
    text = text.upper().strip()
    if 'K' in text:
        return int(float(text.replace('K', '')) * 1_000)
    elif 'M' in text:
        return int(float(text.replace('M', '')) * 1_000_000)
    elif 'B' in text:
        return int(float(text.replace('B', '')) * 1_000_000_000)
    return int(text.replace(',', ''))

Get the likes and followers of each entry using the Selenium library

In [33]:
def get_video_likes_and_creator_followers_selenium(driver, video_id: str, creator_id: str):
    video_url = f"https://www.tiktok.com/@{creator_id}/video/{video_id}"
    creator_url = f"https://www.tiktok.com/@{creator_id}"

    likes = None
    followers = None

    try:
        # Charging the video page
        driver.get(video_url)
        time.sleep(5)  # Wait to ensure is all loaded

        # Likes
        like_element = driver.find_element(By.CSS_SELECTOR, '[data-e2e="like-count"]')
        likes = parse_number(like_element.text)
    except Exception as e:
        print(f"Error obtaining likes for {video_id}: {e}")

    try:
        
        driver.get(creator_url)
        time.sleep(5)

        # Followers
        follower_element = driver.find_element(By.CSS_SELECTOR, '[data-e2e="followers-count"]')
        followers = parse_number(follower_element.text)
    except Exception as e:
        print(f"Error obtaining followers for  {creator_id}: {e}")

    return likes, followers

In [34]:
def add_likes_and_followers_to_dataframe(df):
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    
    driver = webdriver.Chrome(options=options)

    df = df.copy()
    df["likes"] = None
    df["followers"] = None

    for index, row in df.iterrows():
        video_id = row["video_id"]
        creator_id = row["creator_id"]
        print(f"Processing @{creator_id} / video {video_id}...")

        likes, followers = get_video_likes_and_creator_followers_selenium(driver, video_id, creator_id)

        df.at[index, "likes"] = likes
        df.at[index, "followers"] = followers

    driver.quit()
    return df

In [None]:
df = add_likes_and_followers_to_dataframe(df)

# Results storage

In [41]:
df.to_csv("Parties_TikTokEuropeanElections_Abortion_War.csv", index=False)

# Fix

In [None]:
df = pd.read_csv("../dataset/TikTokEuropeanElections_Abortion_War.csv")

In [36]:
# Some abbrevatrions (such as PS for Belgium and France) might overlap
# While there might be parties from all european countries, I have foccused on the countries included on the experiment

left = [
    "Die Linke", "IU", "Podemos", "PCE", "PCF", "LFI", "PRC", "Syriza", 
    "Vänsterpartiet", "Vasemmistoliitto", "AKEL", "PTB", "KPÖ", "Enhedslisten", 
    "Rødt", "PST/POP", "PIE", "The Left", "Razem", "EFA", "S&D", "Renew Europe", 
    "PSOE", "Partido Socialista Obrero Español", "Sumar", "PES", "PS", 
    "Parti Socialiste", "APSD", "SD", "SAP", "Labour", "SPÖ", "Vooruit", "SPD", 
    "Sozialdemokratische Partei Deutschlands", "NL", "Nowa Lewica", "PvdA", 
    "Partij van de Arbeid", "Socialist Party", "Democratic Party", "Labour Party", 
    "PASOK", "SLD", "Nouvelle Donne", "PRG", "Inicjatywa Polska", "Grüne", 
    "Greens", "ERC", "Esquerra Republicana de Catalunya", "EGP", "The Greens", 
    "BNG", "Bloque Nacionalista Galego", "LE", "Les Écologistes", "The greens", 
    "GL", "GroenLinks", "SMR", "Bildu", "Euskal Herria Bildu", "Left Party", "PvdD", 
    "Partij voor de Dieren"
]

right = [
    "EPP", "European People's Party", "ECR", "PiS", "Law and Justice", "VOX", 
    "RN", "National Rally", "FPÖ", "Fidesz", "Patriots", "ESN", 
    "Europe of Sovereign Nations", "AfD", "Alternative für Deutschland", "Republika", 
    "Reconquête", "NOWA NADZIEJA", "New Hope", "Mi Hazánk", "PP", "Partido Popular", 
    "CDU", "Christlich Demokratische Union Deutschlands", "Agir", "MoDem", 
    "Mouvement Démocrate", "Ensemble", "LFA", "RE", "Renaissance", "LR", 
    "Les Républicains", "CDA", "Christen-Democratisch Appèl", "NSC", 
    "New Social Contract", "IDP", "CSU", "Christlich-Soziale Union in Bayern", 
    "FDP", "Freie Demokratische Partei", "FW", "Freie Wähler", "Junts", "ZP", "NPD", 
    "PVV", "Partij voor de Vrijheid", "FvD", "European People's Party", 
    "Progressive Alliance of Socialists & Democrats", "PO", "Platforma Obywatelska", 
    "PSL", "Polskie Stronnictwo Ludowe", "BBB", "BoerBurgerBeweging", "Familie", 
    "ÖDP", "Ökologisch-Demokratische Partei", "UDR", "Union des Démocrates et Indépendants", 
    "PfE", "Patriots of Europe", "D66", "Democraten 66", "PL2050", "Polska 2050", 
    "RECONQUÊTE", "R!", "NN", "Nieuwe Nationale Partij"
]

# It is possible that it is needed to change the abbr. of the parties with their full name so they are detected

In [37]:
party_group_mapping = {
    'EPP' : 'EPP',                      # -------- European People's Party --------
    'PP': 'EPP',                        # Spain - Partido Popular
    'PSL': 'EPP',                       # Poland
    'PO': 'EPP',                        # Poland
    'BBB': 'EPP',                       # Netherlands
    'CDA': 'EPP',                       # Netherlands
    'PVV': 'EPP',                       # Netherlands
    'CDU': 'EPP',                       # Germany
    'ÖDP': 'EPP',                       # Germany
    'CSU': 'EPP',                       # Germany
    'FAMILIE': 'EPP',                   # Germany
    'LR': 'EPP',                        # France
    'UDR' : 'EPP',                      # France
    'S&D': 'S&D',                       # -------- European --------
    'PSOE': 'S&D',                      # Spain
    'SPD': 'S&D',                       # Germany
    'NL' : 'S&D',                       # Poland - New Left
    'PS' : 'S&D',                       # France - Parti Socialiste
    'PvdA': 'S&D',                      # Netherlands - Labour Party
    'PfE' : 'PfE',                      # -------- European Patriots of Europe --------
    'VOX' : 'PfE',                      # Spain - Vox
    'RN' : 'PfE',                       # France - National Rally || Poland - National Movement
    'PVV' : 'PfE',                      # Netherlands - Party for Freedom
    'ECR' : 'ECR',                      # -------- European Conservatives and Reformists --------
    'SALF' : 'ECR',                     # Spain - Se acabo la fiesta
    'IDL' : 'ECR',                      # France - Identity and Liberty
    'PiS' : 'ECR',                      # Poland - Law and Justice
    'SGP' : 'ECR',                      # Netherlands - Reformed Political Party
    'Renew Europe' : 'Renew Europe',    # -------- Renew Europe --------
    'PNV' : 'Renew Europe',             # Spain - Basque Nationalist Party
    'MoDem' : 'Renew Europe',           # France - Democratic Movement
    'RE' : 'Renew Europe',              # France - Renaissance
    'UDI' : 'Renew Europe',             # France - Union of Democrats and Independents
    'FDP' : 'Renew Europe',             # Germany - Free Democratic Party
    'FW' : 'Renew Europe',              # Germany - Free Voters
    'VVD' : 'Renew Europe',             # Netherlands - People's Party for Freedom and Democracy
    'D66' : 'Renew Europe',             # Netherlands - Democrats 66
    'PL2050' : 'Renew Europe',          # Poland - Poland 2050
    'Greens' : 'Greens',                # -------- The Greens --------
    'EFA' : 'Greens',                   # -------- European Free Alliance --------
    'ERC' : 'Greens',                   # Spain - Republican Left of Catalonia
    'BNG' : 'Greens',                   # Spain - Galician Nationalist Bloc
    'EGP' : 'Greens',                   # European Green Party 
    'LE' : 'Greens',                    # France - Les Écologistes - The Greens 
    'The greens' : 'Greens',            # Germany - The Greens
    'GL' : 'Greens',                    # Netherlands - GroenLinks
    'The Left' : 'The Left',            # -------- The Left --------
    'Podemos' : 'The Left',             # Spain - Podemos
    'Sumar' : 'The Left',               # Spain - Sumar
    'SMR' : 'The Left',                 # Spain - Sumar
    'Bildu' : 'The Left',               # Spain - Bildu
    'LFI' : 'The Left',                 # France - La France Insoumise
    'Die Linke' : 'The Left',           # Germany - The Left
    'Left party' : 'The Left',          # Germany - The Left
    'PvdD' : 'The Left',                # Netherlands - Party for the Animals
    'ESN' : 'ESN',                      # -------- Europe of Sovereign Nations --------
    'RECONQUÊTE' : 'ESN',               # France - Reconquête
    'R!' : 'ESN',                       # France - Reconquête
    'AfD' : 'ESN',                      # Germany - Alternative for Germany
    'NN' : 'ESN',                       # Netherlands - New Hope
}

In [38]:
def analyze_ideology(description):
    if not isinstance(description, str):
        return pd.Series([np.nan, 0.0, 0.0, []])

    found_left = [
        party for party in left
        if re.search(r'\b' + re.escape(party) + r'\b', description, flags=re.IGNORECASE)
    ]
    found_right = [
        party for party in right
        if re.search(r'\b' + re.escape(party) + r'\b', description, flags=re.IGNORECASE)
    ]

    parties_mentioned = found_left + found_right
    total_found = len(parties_mentioned)

    if total_found == 0:
        perc_left = perc_right = 0.0
        ideology = "no mention"
    else:
        perc_left = len(found_left) / total_found
        perc_right = len(found_right) / total_found
        if perc_left > perc_right:
            ideology = "left"
        elif perc_right > perc_left:
            ideology = "right"
        else:
            ideology = "mixed"

    return pd.Series([ideology, perc_left, perc_right, parties_mentioned])


In [39]:
df[["ideology", "% left", "% right", "parties_mentioned"]] = df["description"].apply(analyze_ideology)

In [40]:
df.head()

Unnamed: 0.1,Unnamed: 0,experiments,total_experiments,total_samples,interface,countries,languages,search_queries,video_id,video_title,...,misinformation,human_rights,abortion_rights,war_justification,womens_rights,disagreement,ideology,% left,% right,parties_mentioned
3,3,{'party'},0,0,tk0,{'fr'},{'en'},"{""renaissance's main stances""}",7099633825587744046,Reply to @otter_4c answering your #question a...,...,0.0,0.0,0.0,0.0,0.0,0.0,right,0.0,1.0,[Renaissance]
5,5,{'party'},0,0,tk0,{'fr'},{'en'},"{""renaissance's main stances""}",7151404990010100997,@olivier_rousteing explains his new massive @...,...,0.0,0.0,0.0,0.0,0.0,0.0,right,0.0,1.0,[Renaissance]
6,6,{'party'},0,0,tk0,{'fr'},{'en'},"{""renaissance's main stances""}",7297748671377018154,should i do a mens version next? #renfaire #re...,...,0.0,0.0,0.0,0.0,0.0,0.0,right,0.0,1.0,[Renaissance]
7,7,{'party'},0,0,tk0,{'fr'},{'en'},"{""renaissance's main stances""}",7303618548369198369,How to draw in the Renassaince style! #drawin...,...,0.0,0.0,0.0,0.0,0.0,0.0,no mention,0.0,0.0,[]
8,8,{'party'},0,0,tk0,{'fr'},{'en'},"{""renaissance's main stances""}",7336610459996458273,Qui a deja trouver des pépites chez Renaissanc...,...,0.0,0.0,0.0,0.0,0.0,0.0,right,0.0,1.0,[Renaissance]


In [None]:
df.to_csv("TikTokEuropeanElections_Abortion_War.csv", index=False)