In [1]:
import requests
from bs4 import BeautifulSoup
import re

# Paragraphs containing any of these markers are site chrome (banners, stub
# notices) rather than article text, and are dropped entirely.
_SKIPPED_PARAGRAPH_MARKERS = ('Vikidia', 'more stubs')

# Boilerplate strings that appear inside otherwise useful paragraphs; they are
# removed as literal text (not as regex patterns).
_BOILERPLATE_SNIPPETS = (
    'нет изображения',
    'Celebra con vikidia la navidad y año nuevo, y vive los ultimos momentos felices del año ¡Feliz año 2024!',
    'Aqueste article es regidit en lengadocian.',
    'Crie uma conta, colabore, e se divirta!',
)

# Text of the error page served instead of an article when the site is down.
_MAINTENANCE_MARKER = 'Our servers are currently under maintenance'


def _clean_article(paragraph_texts):
    '''Join raw paragraph strings into one cleaned, single-line article string.'''
    kept = [
        text.removesuffix('\n')
        for text in paragraph_texts
        if text != '' and not any(marker in text for marker in _SKIPPED_PARAGRAPH_MARKERS)
    ]
    article = ' '.join(kept)

    # Drop non-breaking spaces, backslashes and forward slashes.
    article = re.sub(r'\xa0|\\|/', '', article)
    # Flatten newlines/runs of whitespace first so that bracketed spans which
    # wrapped across lines are still matched by the non-greedy pattern below.
    article = re.sub(r'\s+', ' ', article)
    # Remove bracketed spans such as reference markers "[1]" or "[edit]".
    article = re.sub(r'\[.*?\]', '', article)
    for snippet in _BOILERPLATE_SNIPPETS:
        article = article.replace(snippet, '')

    # The removals above leave gaps behind; normalise whitespace last so the
    # result has single spaces only and no leading/trailing blanks.
    return re.sub(r'\s+', ' ', article).strip()


def get_text_from(source, language='en', max_attempts=100, timeout=10):
    '''
    Fetch a random article and return its paragraph text as one cleaned line.

    source: 0 - vikidia, 1 - wikipedia (any value other than 0 selects wikipedia)
    language: 'en', 'ru', 'ar', 'ca', 'de', 'el', 'es', 'eu', 'fr', 'hy', 'it', 'oc', 'pt'
    max_attempts: how many random pages to try before giving up
    timeout: seconds passed to requests.get so a stalled connection cannot hang forever

    Returns the cleaned, non-empty article text.
    Raises RuntimeError if no usable article was obtained within max_attempts
    (failed requests, empty pages and maintenance pages all count as attempts).
    '''
    site = 'vikidia' if source == 0 else 'wikipedia'
    url = f'https://{language}.{site}.org/wiki/Special:Random'

    last_error = None
    # A bounded loop replaces unbounded self-recursion, which would end in a
    # RecursionError if the site kept returning unusable pages.
    for _ in range(max_attempts):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Transient network/HTTP failure: remember it and try another page.
            last_error = exc
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        # Only <p> elements are collected; headings are not part of the text.
        article = _clean_article(element.text for element in soup.find_all(['p']))

        if article and _MAINTENANCE_MARKER not in article:
            return article

    raise RuntimeError(
        f'No usable article from {url} after {max_attempts} attempts'
    ) from last_error

In [2]:
import asyncio
import pandas as pd
from tqdm.notebook import tqdm

async def get_texts_async(num_texts=1000, source=0, language='en'):
    '''
    Fetch `num_texts` random articles by running get_text_from in worker threads.

    num_texts: number of get_text_from calls to schedule
    source, language: forwarded unchanged to every get_text_from call

    Returns a list with one article string per call.
    '''
    # get_running_loop() is the supported way to obtain the loop from inside a
    # coroutine; passing None as executor uses the loop's default thread pool.
    loop = asyncio.get_running_loop()
    pending = [
        loop.run_in_executor(None, get_text_from, source, language)
        for _ in range(num_texts)
    ]
    return await asyncio.gather(*pending)

async def main(language, num_texts=2000):
    '''
    Build a labelled dataset of random articles for one language.

    language: language code forwarded to get_texts_async
    num_texts: number of articles to fetch from EACH source (default 2000)

    Returns a DataFrame with columns 'text' and 'class', where class 0 marks
    Vikidia articles and class 1 marks Wikipedia articles; Vikidia rows come
    first and the index is renumbered from 0.
    '''
    # The two sources are fetched one after the other, not concurrently.
    texts_vikidia = await get_texts_async(num_texts, 0, language)
    texts_wikipedia = await get_texts_async(num_texts, 1, language)

    frames = [
        pd.DataFrame({'text': texts_vikidia, 'class': 0}),
        pd.DataFrame({'text': texts_wikipedia, 'class': 1}),
    ]
    return pd.concat(frames, ignore_index=True)


In [3]:
# Scrape every configured language and write one CSV file per language.
# Only 'en' is active; the previously used full list was:
# ['en', 'ru', 'ca', 'el', 'es', 'eu', 'fr', 'it', 'oc', 'pt']
for language in tqdm(['en'], desc="Languages"):

    # Top-level `await` is not valid in a plain Python script; this cell
    # expects an IPython/Jupyter kernel that supports awaiting at cell level.
    df = await main(language)

    # Saved relative to the working directory as data_<language>.csv, UTF-8, without the index column.
    df.to_csv(f"./data_{language}.csv", encoding="UTF-8", index=False)


Languages:   0%|          | 0/1 [00:00<?, ?it/s]