In [2]:
import os

GOOGLE_TRANSLATE_API_KEY = "AIzaSyASpA9jny10TqtZJRCVT-H0YYp68cfa2S4"


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=options)
driver.get("https://elpais.com/")


In [4]:
import time

try:
    opinion_link = driver.find_element(By.LINK_TEXT, "Opinión")
    opinion_link.click()
    time.sleep(3)
except Exception as e:
    print("Error navigating:", e)
    driver.quit()


In [5]:
driver.execute_script("window.scrollTo(0, 1000);")
time.sleep(2)

articles = driver.find_elements(By.CSS_SELECTOR, "article a")
article_links = []
for a in articles:
    href = a.get_attribute("href")
    if href and href not in article_links:
        article_links.append(href)
    if len(article_links) >= 5:
        break

print("Collected article links:")
for link in article_links:
    print(link)


Collected article links:
https://elpais.com/opinion/editoriales/
https://elpais.com/opinion/2025-08-08/un-ano-normal-en-cataluna.html
https://elpais.com/opinion/2025-08-08/limites-a-la-externalizacion-migratoria.html
https://elpais.com/opinion/tribunas/
https://elpais.com/opinion/2025-08-08/palmeros-y-munecos-rotos-en-politica.html


In [6]:
import requests
from bs4 import BeautifulSoup

from collections import Counter
import pandas as pd

os.makedirs("images", exist_ok=True)

def translate_text(text, source='es', target='en'):
    url = "https://translation.googleapis.com/language/translate/v2"
    params = {
        'q': text,
        'source': source,
        'target': target,
        'key': GOOGLE_TRANSLATE_API_KEY
    }
    response = requests.post(url, data=params)
    if response.status_code == 200:
        return response.json()['data']['translations'][0]['translatedText']
    else:
        print("Translation failed:", response.text)
        return text

# Collect data
translated_titles = []
data = []

for idx, link in enumerate(article_links):
    try:
        driver.get(link)
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Title
        title_tag = soup.find("h1")
        title_es = title_tag.text.strip() if title_tag else "Sin título"
        print(f"\n Title {idx+1} (ES): {title_es}")

        # Content
        paragraphs = soup.find_all("p")
        content = '\n'.join(p.text.strip() for p in paragraphs if p.text.strip())
        content_preview = content[:500]
        print(f" Content Preview:\n{content_preview}\n")

        # Image
        image_filename = ""
        image_tag = soup.find("img")
        if image_tag and image_tag.get("src"):
            try:
                image_url = image_tag["src"]
                img_data = requests.get(image_url).content
                ext = os.path.splitext(image_url.split("?")[0])[-1] or ".jpg"
                image_filename = f"images/article_{idx+1}{ext}"
                with open(image_filename, "wb") as f:
                    f.write(img_data)
                print(f" Image saved: {image_filename}")
            except Exception as e:
                print(" Image download failed:", e)

        # Translation
        title_en = translate_text(title_es)
        print(f" Translated Title {idx+1}: {title_en}")
        translated_titles.append(title_en)

        # Save
        data.append({
            "Title_ES": title_es,
            "Title_EN": title_en,
            "Content_Preview": content_preview,
            "Image_File": image_filename,
            "URL": link
        })

    except Exception as e:
        print(" Error scraping article:", e)



 Title 1 (ES): Opinión
 Content Preview:
La Generalitat de Salvador Illa ha tenido un evidente efecto estabilizador y está superando poco a poco la política de bloques
La Justicia europea pone trabas a los intentos de crear sistemas de deportación como el que busca Italia con Albania
La enorme cifra de operaciones de compraventa en la primera mitad del año se produce en un mercado inmobiliario disfuncional
El PP ampara el discurso discriminatorio de Vox en las instituciones públicas al rechazar desde un ayuntamiento una celebración mus

 Image saved: images/article_1.svg
 Translated Title 1: Opinion

 Title 2 (ES): Un año normal en Cataluña
 Content Preview:
Salvador Illa ha salido a correr casi cada mañana en este primer año como presidente de la Generalitat y ha divulgado sus marcas personales. Su política también ha estado marcada por carreras de fondo, pero a sabiendas de que necesitaba marcar el ritmo desde el primer momento. La razón es que la difícil estabilidad de su Gobierno,

In [7]:
df = pd.DataFrame(data)

In [8]:
from collections import Counter
import re

# Analyze Translated Titles
all_titles_text = " ".join(translated_titles)

# Tokenize and clean (remove punctuation, lowercase)
words = re.findall(r'\b\w+\b', all_titles_text.lower())

# Count word frequencies
word_counts = Counter(words)

# Filter to only words occurring more than twice
repeated_words = {word: count for word, count in word_counts.items() if count > 2}

# Print repeated words
print("\nRepeated Words (more than twice):")
for word, count in repeated_words.items():
    print(f"{word}: {count}")

# This will add the top repeated words to every row of the CSV
df["Repeated_Words"] = ", ".join([f"{word}:{count}" for word, count in repeated_words.items()])


Repeated Words (more than twice):


In [9]:
df.to_csv(r"C:\Users\aryas\Selenium\elpais_opinion_articles.csv", index=False, encoding="utf-8-sig")
print(" Saved to elpais_opinion_articles.csv")
df.head()


 Saved to elpais_opinion_articles.csv


Unnamed: 0,Title_ES,Title_EN,Content_Preview,Image_File,URL,Repeated_Words
0,Opinión,Opinion,La Generalitat de Salvador Illa ha tenido un e...,images/article_1.svg,https://elpais.com/opinion/editoriales/,
1,Un año normal en Cataluña,A normal year in Catalonia,Salvador Illa ha salido a correr casi cada mañ...,images/article_2.svg,https://elpais.com/opinion/2025-08-08/un-ano-n...,
2,Límites a la externalización migratoria,Limits to migration externalization,El Tribunal de Justicia de la Unión Europea ha...,images/article_3.svg,https://elpais.com/opinion/2025-08-08/limites-...,
3,Opinión,Opinion,"Lo que debería soliviantarnos, en realidad, es...",images/article_4.svg,https://elpais.com/opinion/tribunas/,
4,Palmeros y muñecos rotos en política,Palmeros and broken dolls in politics,El escándalo por los falsos títulos de los pol...,images/article_5.svg,https://elpais.com/opinion/2025-08-08/palmeros...,


In [10]:
all_words = " ".join(translated_titles).lower().split()
stopwords = {"the", "and", "or", "of", "in", "to", "for", "a", "on", "at", "by", "is", "are", "was", "be", "an"}

filtered_words = [word.strip(".,!?|") for word in all_words if word not in stopwords]
word_counts = Counter(filtered_words)

repeated = {word: count for word, count in word_counts.items() if count > 2}

print("\n Repeated words in translated titles (appeared more than twice):")
if repeated:
    for word, count in repeated.items():
        print(f"  - '{word}': {count} times")
else:
    print(" No repeated words found more than twice.")



 Repeated words in translated titles (appeared more than twice):
 No repeated words found more than twice.


In [11]:
driver.quit()
print(" Browser closed.")


 Browser closed.


In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time


BROWSERSTACK_USERNAME = 'aryasharma_CA1R9w'
BROWSERSTACK_ACCESS_KEY = 'cFunTaqWYcPmmj7uKF9P'



In [13]:
from selenium import webdriver
import pytest
import json

USERNAME = "aryasharma_CA1R9w"
ACCESS_KEY = "cFunTaqWYcPmmj7uKF9P"

capabilities = [
    {
        'browserName': 'Chrome',
        'browserVersion': 'latest',
        'bstack:options': {
            'os': 'Windows',
            'osVersion': '10',
            'buildName': 'Parallel Build',
            'sessionName': 'Chrome Test'
        }
    },
    {
        'browserName': 'Firefox',
        'browserVersion': 'latest',
        'bstack:options': {
            'os': 'Windows',
            'osVersion': '10',
            'buildName': 'Parallel Build',
            'sessionName': 'Firefox Test'
        }
    },
    {
        'browserName': 'Safari',
        'browserVersion': 'latest',
        'bstack:options': {
            'os': 'OS X',
            'osVersion': 'Ventura',
            'buildName': 'Parallel Build',
            'sessionName': 'Safari Test'
        }
    },
    {
        'browserName': 'iPhone',
        'bstack:options': {
            'deviceName': 'iPhone 14',
            'osVersion': '16',
            'realMobile': 'true',
            'buildName': 'Parallel Build',
            'sessionName': 'iPhone Safari Test'
        }
    },
    {
        'browserName': 'Android',
        'bstack:options': {
            'deviceName': 'Samsung Galaxy S23',
            'osVersion': '13.0',
            'realMobile': 'true',
            'buildName': 'Parallel Build',
            'sessionName': 'Android Chrome Test'
        }
    }
]

@pytest.mark.parametrize("caps", capabilities)
def test_browserstack(caps):
    
    options = webdriver.ChromeOptions()
    options.set_capability("browserName", caps.get("browserName"))
    if "browserVersion" in caps:
        options.set_capability("browserVersion", caps.get("browserVersion"))
    for key, value in caps.items():
        if key != "browserName" and key != "browserVersion":
            options.set_capability(key, value)

    driver = webdriver.Remote(
        command_executor=f"https://{USERNAME}:{ACCESS_KEY}@hub-cloud.browserstack.com/wd/hub",
        options=options
    )

    driver.get("https://elpais.com/opinion/")
    assert "Opinión" in driver.title

    
    driver.execute_script(
        'browserstack_executor: {"action": "setSessionStatus", "arguments": {"status":"passed","reason": "Title verified"}}'
    )

    driver.quit()
