In [59]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
from requests.exceptions import ConnectionError, HTTPError, Timeout

In [65]:
BASE_URL = "https://corola.racai.ro/lista.html"
def get_soup(url: str = BASE_URL, retries: int = 3, backoff_factor: int = 1) -> BeautifulSoup:
    """Download ``url`` and return its content parsed with ``html.parser``.

    The request is attempted up to ``retries`` times. After a failed attempt
    that is not the last one, the function sleeps
    ``backoff_factor * 2 ** attempt`` seconds (attempt counted from 0) before
    trying again.

    Args:
        url: Page to download. Defaults to ``BASE_URL``.
        retries: Total number of attempts; must be at least 1.
        backoff_factor: Multiplier, in seconds, for the exponential wait.

    Returns:
        The parsed document.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        ConnectionError, HTTPError, Timeout: The ``requests`` exception raised
            by the last attempt, re-raised unchanged.
    """
    # With zero attempts the loop below would never run and the function would
    # fall through and return None; fail loudly instead.
    if retries < 1:
        raise ValueError(f"retries must be at least 1, got {retries}")

    print(f"Fetching: {url}")
    for attempt in range(retries):
        try:
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=10)
            # Turn 4xx/5xx responses into HTTPError so they follow the retry path.
            response.raise_for_status()
            return BeautifulSoup(response.content, "html.parser")
        except (ConnectionError, HTTPError, Timeout) as e:
            print(f"Attempt {attempt + 1} failed: {e}")

            # Out of attempts: propagate the original exception to the caller.
            if attempt == retries - 1:
                raise

            # Exponential backoff before the next attempt.
            wait_time = backoff_factor * (2 ** attempt)
            print(f"Retrying in {wait_time} seconds...")
            time.sleep(wait_time)


In [None]:
# Fetch and parse the word-list page; called without arguments, get_soup uses BASE_URL.
soup = get_soup()


Fetching: https://corola.racai.ro/lista.html


In [16]:
# Collect one (headword, trailing text) pair per anchor that contains a <strong> tag.
word_list = []
word_type = []
for link in soup.select("a"):
    strong = link.select_one("strong")
    if strong is None:
        # Anchors without a <strong> headword are skipped explicitly instead of
        # relying on a bare `except`, which would also hide real errors and
        # swallow KeyboardInterrupt.
        continue

    base_word = strong.text.strip()
    # Remove only the first occurrence (the headword itself): removing every
    # occurrence would also eat a trailing text that repeats the headword.
    # The leading [1:] drops the single character left in front of the
    # remaining text (presumably a separator -- confirm against the page markup).
    additional_info = link.text.replace(base_word, "", 1).strip()[1:]
    word_list.append(base_word)
    word_type.append(additional_info)

In [17]:
# Preview the first ten scraped words next to their extracted types.
word_list[:10], word_type[:10]

(['aba',
  'abac',
  'abacă',
  'abacterian',
  'abager',
  'abagerie',
  'abagiu',
  'abajur',
  'abandon',
  'abandona'],
 ['substantiv',
  'substantiv',
  'substantiv',
  'adjectiv',
  'substantiv',
  'substantiv',
  'substantiv',
  'substantiv',
  'substantiv',
  'verb'])

In [19]:
# Persist the scraped (word, type) pairs as a two-column CSV without the index.
scraped_words = pd.DataFrame({"word": word_list, "type": word_type})
scraped_words.to_csv("../data/romanian_words.csv", index=False)

In [23]:
# Reload the word/type table from the CSV path the save cell wrote to.
df = pd.read_csv("../data/romanian_words.csv")

In [66]:
def get_syllables(word: str):
    """Look up the syllabification of ``word`` on silabe.ro.

    The page for the word is fetched with ``get_soup`` and the text of its
    ``div.desp`` element is read. The part after the first ``"= "`` is taken,
    the ``•`` characters are removed, and the rest is split on whitespace.

    Args:
        word: Word to look up; it is inserted verbatim into the page URL.

    Returns:
        A ``(parts, joined)`` tuple, where ``parts`` is the list of
        whitespace-separated pieces and ``joined`` is those pieces joined with
        ``"-"``. When the element is missing or has no ``"= "`` separator, the
        result is ``([], "not-found")`` and a failure line is printed.

    Raises:
        Whatever ``get_soup`` raises when the download fails.
    """
    url = f"https://www.silabe.ro/desparte-in-silabe-{word}.html"
    soup = get_soup(url)
    desp_div = soup.select_one("div.desp")

    if desp_div:
        parts = desp_div.text.split('= ')
        # A div without the "= " separator used to raise IndexError and abort
        # the whole scraping run; treat it like a missing result instead.
        if len(parts) > 1:
            words = parts[1].strip().replace("•", "").split()
            return words, "-".join(words)

    print(f"Failed to fetch: word={word}")
    return [], "not-found"

In [67]:
# Scraping-run configuration.
START_ROW = 5201        # first 0-based df row handled by this run; earlier rows are skipped
PAUSE_EVERY = 13        # sleep whenever the processed-row count is a multiple of this
PAUSE_SECONDS = 3
CHECKPOINT_EVERY = 10   # write a checkpoint whenever the count is a multiple of this
CHECKPOINT_PATH = "../data/temporary_romanian_words5.csv"

syllables = []
joint_syllables = []


def _save_checkpoint():
    """Write every row scraped so far in this run to CHECKPOINT_PATH."""
    # Use the shorter length so an interrupt landing between the two appends
    # cannot produce columns of unequal size.
    done = min(len(syllables), len(joint_syllables))
    if done == 0:
        return  # nothing scraped yet; do not overwrite an existing file with an empty one

    scraped = df.iloc[START_ROW:START_ROW + done]
    words = list(scraped['word'])
    types = list(scraped['type'])
    print(len(words), len(types), len(syllables), len(joint_syllables))
    pd.DataFrame({
        "word": words,
        "type": types,
        "syllables": syllables[:done],
        "joint_syllables": joint_syllables[:done],
    }).to_csv(CHECKPOINT_PATH, index=False)


try:
    # `processed` is the 1-based count of df rows handled so far (row position + 1).
    # Positional slicing assumes the default 0..n-1 index that read_csv produces.
    for processed, word in enumerate(df['word'].iloc[START_ROW:], start=START_ROW + 1):
        syllable_list, joint = get_syllables(word)

        syllables.append(syllable_list)
        joint_syllables.append(joint)

        # Pause periodically between requests.
        if processed % PAUSE_EVERY == 0:
            time.sleep(PAUSE_SECONDS)
        if processed % CHECKPOINT_EVERY == 0:
            _save_checkpoint()
finally:
    # Also runs when a request fails or the kernel is interrupted, so rows
    # scraped since the last periodic checkpoint (and the final partial batch)
    # still reach the checkpoint file.
    _save_checkpoint()
    
    

Fetching: https://www.silabe.ro/desparte-in-silabe-artropatie.html
Fetching: https://www.silabe.ro/desparte-in-silabe-artroplastie.html
Fetching: https://www.silabe.ro/desparte-in-silabe-artropod.html
Fetching: https://www.silabe.ro/desparte-in-silabe-artroscop.html
Failed to fetch: word=artroscop
Fetching: https://www.silabe.ro/desparte-in-silabe-artroscopie.html
Failed to fetch: word=artroscopie
Fetching: https://www.silabe.ro/desparte-in-silabe-artrotomie.html
Failed to fetch: word=artrotomie
Fetching: https://www.silabe.ro/desparte-in-silabe-artroză.html
Fetching: https://www.silabe.ro/desparte-in-silabe-artrozic.html
Fetching: https://www.silabe.ro/desparte-in-silabe-artterapie.html
Failed to fetch: word=artterapie
9 9 9 9
Fetching: https://www.silabe.ro/desparte-in-silabe-arunca.html
Fetching: https://www.silabe.ro/desparte-in-silabe-aruncare.html
Fetching: https://www.silabe.ro/desparte-in-silabe-aruncat.html
Fetching: https://www.silabe.ro/desparte-in-silabe-aruncător.html
Fetc

ConnectTimeout: HTTPSConnectionPool(host='www.silabe.ro', port=443): Max retries exceeded with url: /desparte-in-silabe-coalescent.html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x1274c0910>, 'Connection to www.silabe.ro timed out. (connect timeout=10)'))

In [None]:
# First 0-based df row covered by `syllables`; must match the number of rows
# skipped at the start of the scraping loop.
SCRAPE_START_ROW = 5201

# `syllables` only holds results for rows from SCRAPE_START_ROW onward, so a
# plain `df['syllables'] = syllables` fails with a length mismatch. Align the
# results to the rows they belong to instead.
scraped_index = df.index[SCRAPE_START_ROW:SCRAPE_START_ROW + len(syllables)]
if len(scraped_index) != len(syllables):
    raise ValueError(
        f"{len(syllables)} scraped rows do not fit into df starting at row {SCRAPE_START_ROW} "
        f"(df has {len(df)} rows)"
    )

# Rows outside the scraped range get NaN in both new columns.
df['syllables'] = pd.Series(syllables, index=scraped_index, dtype=object)
df['syllables_number'] = df['syllables'].apply(lambda x: len(x) if isinstance(x, list) else np.nan)

# index=False matches how this file was first written, so reading it back does
# not pick up an extra unnamed index column.
df.to_csv("../data/romanian_words.csv", index=False)