# Web Scraping


In [4]:
# Required libraries
import requests
from bs4 import BeautifulSoup
import time
import csv
import re

# Request headers that mimic a desktop browser, to avoid being blocked
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/135.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.5",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}


# Function to extract article headline, publication date, and cleaned content from a single article URL

def extract_article_content_single_paragraph(url, timeout=30):
    """Fetch one article page and return its headline, date and cleaned text.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up on the
            request.

    Returns:
        On success, a dict with the keys "headline", "publication_date",
        "link" (the *url* argument) and "content" (all collected paragraph
        text joined into a single whitespace-normalized string). A missing
        headline or date element yields the placeholder "No headline" or
        "No publication date".
        On failure, a dict with the keys "error" (the exception message) and
        "link"; exceptions are reported through this dict rather than raised.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        # An HTTP error page must not be parsed as if it were an article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract headline
        title_tag = soup.find("h1", class_="ar__ti")
        headline = title_tag.get_text(strip=True) if title_tag else "No headline"

        # NOTE: author extraction is currently disabled; the result has no
        # "author" key.

        # Extract publication date
        date_tag = soup.find("time", class_="published")
        publication_date = date_tag.get_text(strip=True) if date_tag else "No publication date"

        # Collect the text of every <p> in document order, stopping at the
        # first <div> whose class list is exactly "truth-post-content-after".
        paragraphs = []
        for tag in soup.find_all(True):
            if tag.name == "div" and tag.get("class") == ["truth-post-content-after"]:
                break
            if tag.name == "p":
                text = tag.get_text(separator=' ', strip=True)
                if text:
                    paragraphs.append(text)

        # Clean the text
        raw_text = ' '.join(paragraphs)
        cleaned_text = re.sub(r'\[.*?\]', '', raw_text)  # Remove [image or embed]
        cleaned_text = re.sub(r'http\S+', '', cleaned_text)  # Remove URLs
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Normalize whitespace

        return {
            "headline": headline,
            "publication_date": publication_date,
            "link": url,
            "content": cleaned_text
        }

    except Exception as e:
        # Deliberately broad: the caller checks for the "error" key and moves
        # on to the next article instead of aborting the whole crawl.
        return {"error": str(e), "link": url}


# Function to collect all article links from the Elon Musk topic pages

def get_all_article_links(timeout=30):
    """Collect article URLs from every page of the Elon Musk topic listing.

    Pages are requested in order starting at page 1, with a one-second pause
    between pages. Crawling stops at the first page that responds with HTTP
    404, whose title contains "Page not found", or that yields no article
    links.

    Args:
        timeout: Seconds to wait for the server on each page request.

    Returns:
        A list of the link targets found, in listing order.

    Raises:
        requests.RequestException: If a page request fails or times out.
    """
    all_links = []
    current_page = 1
    while True:
        url = f"https://truthout.org/topics/elon-musk/page/{current_page}/"
        # Without a timeout a stalled connection would hang the loop forever.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code == 404:
            break
        soup = BeautifulSoup(response.text, "html.parser")

        # Stop if we reach a "Page not found"
        if soup.title and "Page not found" in soup.title.text:
            break

        # Find all article links on the current page; headings without an
        # anchor, or whose anchor has no href, are skipped.
        links = []
        for block in soup.find_all("h2", class_="ar-list__ti mb-3 mb-md-2"):
            anchor = block.find("a")
            if anchor and anchor.get("href"):
                links.append(anchor["href"])

        if not links:
            break

        all_links.extend(links)
        current_page += 1
        time.sleep(1)  # polite delay

    return all_links

# Get article links
article_links = get_all_article_links()

# Save to two files that share the same row order:
#   - corpus: an "Article" header, then one article's text per line
#   - metadata: CSV with headline, publication date and link per article
with open("elon_musk_truthout_corpus.csv", "w", encoding="utf-8") as content_file, \
     open("elon_musk_truthout_metadata.csv", "w", encoding="utf-8", newline='') as metadata_file:

    # Write corpus header
    content_file.write("Article\n")

    csv_writer = csv.writer(metadata_file)
    csv_writer.writerow(["Headline", "Publication Date", "Link"])

    for i, link in enumerate(article_links, 1):
        # Delay between requests to avoid overloading the server; placed
        # before the request so it also applies after a failed article.
        if i > 1:
            time.sleep(1.5)

        data = extract_article_content_single_paragraph(link)
        if "error" in data:
            print(f"Error in Article {i}: {data['error']}")
            continue

        # An empty article would add a blank line to the corpus file, which
        # pandas.read_csv skips by default; the corpus would then have fewer
        # rows than the metadata and a positional join would be misaligned.
        if not data['content']:
            print(f"Skipping Article {i}: no content extracted")
            continue

        print(f"Article {i}: {data['headline']}")

        # Write corpus
        content_file.write(f"{data['content']}\n")

        # Write metadata
        csv_writer.writerow([data['headline'], data['publication_date'], data['link']])


Article 1: Former SSA Chief: “Elon Musk Is the Biggest Fraud, Not Social Security”
Article 2: Trump Admin Wields Foreign Policy Apparatus to Advance Musk’s Business Interests
Article 3: Trump’s Censorship Campaign Draws on Decades of Infrastructure Built by Big Tech
Article 4: This Musk Adviser and DOGE Employee May Be Violating Ethics Rules, Experts Say
Article 5: Trump Is Bending the Refugee Program to Fit His White Nationalist Agenda
Article 6: Judge Blocks Trump-Musk’s “Unconstitutional Dismantling” of Federal Government
Article 7: Corporate-Led Attack on Free Speech in Texas Meets Bipartisan Resistance
Article 8: Trump Officials Pressure Nations to Approve Musk’s Starlink During Tariff Talks
Article 9: Amid Near-Total Asylum Ban, Trump Welcomes White South Africans
Article 10: Living Holders of Millions of Social Security Numbers Have Been Marked as Dead
Article 11: Alliance Between Far Right and Silicon Valley Gives Rise to “End Times Fascism”
Article 12: Trump and Musk Are Tryin

Article 96: “DOGE” Government Website Was Temporarily Alterable by Pretty Much Anyone
Article 97: Jayapal Unveils Bill to End “Citizens United,” Slamming “Shadow President” Musk
Article 98: German Political Leader Promises Inquiry Into Musk’s Interference in Elections
Article 99: By Refusing to Obey Court Orders, Trump Is Provoking a Constitutional Crisis
Article 100: How Is the Trump Family Profiting From the New Cryptocurrency Memecoin?
Article 101: GOP Proposes $4.5T Tax Giveaway to Rich While Slashing Food Stamps, Medicaid
Article 102: DOGE’s Illegal Takeover Pulls From Fascist Playbooks
Article 103: White House Revoked Access to AP for Refusing to Comply With Censorship
Article 104: Omar Slams Musk for Slashing Cancer Funding While Keeping SpaceX Contracts
Article 105: A Third DOGE Staffer’s Racist and Misogynistic Online Footprint Comes to Light
Article 106: Agency Tasked With Regulating Commercial Space Flight Braces for Musk’s DOGE
Article 107: Judge Rules Trump Officials Are D

Article 189: US’s Wealthiest Collectively Held $8.5 Trillion in Untaxed Assets in 2022
Article 190: “Free-Speech Absolutist” Musk Is Abetting Modi’s Suppression of Voices in India
Article 191: It’s Time to Break With the ADL as a Source for News and Research on Extremism
Article 192: Musk to Advertisers Who Left “X” Due to Antisemitic Content: “Go F*** Yourself”
Article 193: Elon Musk’s Backing of Antisemitic Content May Cost “X” Up to $75M in Ad Revenue
Article 194: Elon Musks’s “X” Sues Media Matters for Its Report on Antisemitic Content
Article 195: Report Finds X Failed to Remove 98 Percent of Posts Flagged Hateful
Article 196: Elon Musk Wants Me Fired. The People of San Francisco Want Him Taxed.
Article 197: In Latest Attack on Journalism, Musk Removes Headlines From Posts on “X”
Article 198: To Fight Big Tech, We Must Seize the Means of Computation
Article 199: Warren Demands Review of Contracts With Musk Amid Actions Against Ukraine
Article 200: Rocket-Launching Billionaires Pro

# Topic Modeling


In [1]:
# Import libraries

import csv

import pandas as pd

# Load the corpus file: an "Article" header followed by one article per line.
# The tab separator keeps each line as a single column. Quoting is disabled
# because the file holds raw text, not CSV-quoted fields: with the default
# quote handling, an article that begins with a double quote would be parsed
# as a quoted field, altering its text or merging it with following lines.
corpus = pd.read_csv(
    "elon_musk_truthout_corpus.csv",
    sep="\t",
    engine="python",
    quoting=csv.QUOTE_NONE,
)

# Preview the dataset (optional)
corpus.head()

Unnamed: 0,Article
0,An internal document bolsters warnings that th...
1,Trump’s State Department has intervened on beh...
2,Trump’s escalation of censorship and retaliati...
3,Christopher Young earns up to $1 million annua...
4,Trump is using the baseless claim of “white ge...


In [2]:
#corpus['Article'][0]
#len(corpus)

In [3]:
# Build a bag-of-words document-term matrix and fit a 5-topic LDA model on it

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer

# scikit-learn's English stop words plus a few extra filler words
custom_stop_words = set(ENGLISH_STOP_WORDS) | {"ve", "like", "just", "really", "said"}

# max_df=0.90 / min_df=2: ignore terms that appear in more than 90% of the
# documents or in fewer than 2 documents
cv = CountVectorizer(
    stop_words=list(custom_stop_words),
    min_df=2,
    max_df=0.90,
)

# Document-term matrix (DTM) of the article texts
dtm = cv.fit_transform(corpus['Article'])

# Fixed random_state so repeated runs produce the same topics
LDA = LatentDirichletAllocation(random_state=42, n_components=5)
LDA.fit(dtm)

# Print the 15 highest-weighted words of each topic, lowest weight first
feature_names = cv.get_feature_names_out()
for topic_number, word_weights in enumerate(LDA.components_):
    top_word_indices = word_weights.argsort()[-15:]
    print(f"THE TOP 15 WORDS FOR TOPIC #{topic_number}")
    print([feature_names[index] for index in top_word_indices])
    print('\n')


THE TOP 15 WORDS FOR TOPIC #0
['want', 'tech', 'way', 'ai', 'future', 'world', 'know', 'lot', 'going', 'things', 'right', 'space', 'kind', 'think', 'people']


THE TOP 15 WORDS FOR TOPIC #1
['post', 'content', 'social', 'trump', 'far', 'speech', 'free', 'users', 'site', 'state', 'platform', 'company', 'media', 'right', 'twitter']


THE TOP 15 WORDS FOR TOPIC #2
['world', 'income', 'money', 'million', 'twitter', 'year', 'percent', 'billion', 'workers', 'trump', 'wealth', 'billionaires', 'people', 'tax', 'tesla']


THE TOP 15 WORDS FOR TOPIC #3
['right', 'company', 'cfpb', 'new', 'federal', 'president', 'know', 'party', 'house', 'republicans', 'people', 'democrats', 'spacex', 'trump', 'government']


THE TOP 15 WORDS FOR TOPIC #4
['new', 'public', 'workers', 'employees', 'agency', 'department', 'president', 'people', 'security', 'administration', 'social', 'doge', 'federal', 'government', 'trump']




In [7]:
# Assign each article its most probable topic, attach a readable topic label
# and the publication date, label it relative to the 2024 U.S. election, and
# export the result to Excel.

import warnings

# Get topic distribution for each article and keep the highest-scoring topic
topic_result = LDA.transform(dtm)
corpus['Topic'] = topic_result.argmax(axis=1)

# Create a dictionary to map topic numbers to theme names
mytopic_dict = {
    0: 'Tech & AI',
    1: 'Social Media',
    2: 'Economics',
    3: 'Politics',
    4: 'DOGE/Administration'
}

# Add a new column
corpus['Topic Label'] = corpus['Topic'].map(mytopic_dict)

# Merge metadata (Publication Date)
metadata = pd.read_csv("elon_musk_truthout_metadata.csv")

# The assignment below pairs rows by index position, so a differing row count
# means dates may be attached to the wrong articles.
if len(metadata) != len(corpus):
    warnings.warn(
        f"Row count mismatch: corpus has {len(corpus)} rows but metadata has "
        f"{len(metadata)}; publication dates may be misaligned."
    )
corpus["Publication Date"] = metadata["Publication Date"]

# Convert 'Publication Date' to datetime using full month format. Values that
# do not match (such as a "No publication date" placeholder) become NaT
# instead of aborting the whole conversion.
corpus["Publication Date"] = pd.to_datetime(
    corpus["Publication Date"], format="%B %d, %Y", errors="coerce"
)

# Define U.S presidential election date
election_date = pd.to_datetime("November 5, 2024")


def _election_label(published):
    """Return the pre/post-election label for one publication date."""
    # NaT compares False against any date, so it must be handled explicitly
    # or it would be silently labelled as pre-election.
    if pd.isna(published):
        return "Unknown publication date"
    if published > election_date:
        return "Post–U.S. Election 2024"
    return "Pre–U.S. Election 2024"


# Label based on election date (election day itself counts as "Pre")
corpus["Election Label"] = corpus["Publication Date"].apply(_election_label)

# Save the labeled data to Excel
corpus.to_excel("elon_musk_truthout_topic_modelling.xlsx", index=False)

# Reformat date for display (after export, so the file keeps real datetimes)
corpus["Publication Date"] = corpus["Publication Date"].dt.strftime("%b %d, %Y")


In [8]:
corpus.head()

Unnamed: 0,Article,Topic,Topic Label,Publication Date,Election Label
0,An internal document bolsters warnings that th...,4,DOGE/Administration,"May 16, 2025",Post–U.S. Election 2024
1,Trump’s State Department has intervened on beh...,1,Social Media,"May 15, 2025",Post–U.S. Election 2024
2,Trump’s escalation of censorship and retaliati...,1,Social Media,"May 14, 2025",Post–U.S. Election 2024
3,Christopher Young earns up to $1 million annua...,4,DOGE/Administration,"May 14, 2025",Post–U.S. Election 2024
4,Trump is using the baseless claim of “white ge...,4,DOGE/Administration,"May 12, 2025",Post–U.S. Election 2024
