In [66]:
import requests
import re
import os
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time
from bs4 import BeautifulSoup

In [None]:
# Function to build the HTTP session shared by the crawler
def create_session():
    """Return a ``requests.Session`` whose HTTPS requests are retried automatically.

    The retry policy allows up to 5 retries with a backoff factor of 0.3 and is
    triggered by the status codes 500, 502, 503 and 504.  Only the ``https://``
    prefix is mounted with this policy, and no timeout is configured on the
    session itself.
    """
    retry_policy = Retry(
        total=5,  # Number of retries
        backoff_factor=0.3,  # Wait between retries
        status_forcelist=[500, 502, 503, 504],  # Retry on specific HTTP status codes
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    http_session.mount('https://', https_adapter)
    return http_session

# Function to get the HTML content of a webpage using the given session
def get_html(url, session, timeout=10):
    """Fetch ``url`` through ``session`` and return the response body as text.

    Args:
        url: Address of the page to download.
        session: Object exposing ``get(url, timeout=...)`` that returns a
            response with ``status_code`` and ``text`` attributes.
        timeout: Seconds passed to ``session.get`` as the request timeout.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
        Request exceptions and non-200 answers are reported on stdout instead
        of being raised.
    """
    try:
        response = session.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    if response.status_code != 200:
        # Say why the fetch failed instead of returning None without a trace
        print(f"Error fetching {url}: unexpected HTTP status {response.status_code}")
        return None

    return response.text

# Function to extract individual article links from the news page using regex
def extract_article_links(news_page_html):
    """Collect absolute article URLs from the HTML of a news listing page.

    Args:
        news_page_html: HTML source of the listing page.

    Returns:
        A list of ``https://www.vlr.gg/...`` URLs, one for each distinct
        relative link of the form ``/<digits>/<slug>``, in order of first
        appearance.  The list is empty when no such link is present.
    """
    # Matches href="/<numeric id>/<anything up to the closing quote>"
    pattern = re.compile(r'href="(/[\d]+/[^"]+)"')
    # dict.fromkeys drops repeated hrefs while keeping page order, so the same
    # article is not downloaded more than once per listing page.
    unique_links = dict.fromkeys(pattern.findall(news_page_html))
    return ['https://www.vlr.gg' + link for link in unique_links]

# Function to extract content from individual article pages using string manipulation
def extract_article_content(article_html):
    """Pull the title, date and body out of an article page by marker search.

    Each field is the stripped text between a fixed opening marker and the next
    occurrence of its closing tag:

    * title   - ``<h1 class="wf-title">`` ... ``</h1>``
    * date    - ``class="article-header__date">`` ... ``</div>``
    * content - ``<div class="article-content">`` ... ``</div>``

    Args:
        article_html: HTML source of the article page.

    Returns:
        A ``(title, date, content)`` tuple of strings.  A field is ``''`` when
        its opening marker or closing tag cannot be found.
    """
    def _between(opening, closing):
        # str.find returns -1 on a miss; slicing with that value would return
        # nearly the whole page, so bail out with an empty string instead.
        start = article_html.find(opening)
        if start == -1:
            return ''
        start += len(opening)
        end = article_html.find(closing, start)
        if end == -1:
            return ''
        return article_html[start:end].strip()

    title = _between('<h1 class="wf-title">', '</h1>')
    date = _between('class="article-header__date">', '</div>')
    # The body stops at the first </div> after the marker, so a nested <div>
    # inside the article would cut it short.
    content = _between('<div class="article-content">', '</div>')

    return title, date, content

# Function to make a string safe to use as a file name
def sanitize_filename(filename):
    """Return ``filename`` with every reserved character replaced by ``_``.

    The reserved characters are ``<``, ``>``, ``:``, ``"``, ``/``, the
    backslash, ``|``, ``?`` and ``*``; all other characters pass through
    unchanged.
    """
    reserved_chars = re.compile(r'[<>:"/\\|?*]')
    return reserved_chars.sub('_', filename)

# Function to save article content to a text file, ensuring valid data
def save_article_to_file(title, date, content, article_url, folder_path='articles', min_content_length=100):
    """Write one article to ``<folder_path>/<sanitized title>.txt``.

    Args:
        title: Article title; also used (sanitized and capped at 100
            characters) as the file name.
        date: Date string written to the ``Date:`` line.
        content: Article body.
        article_url: URL of the article, used only in the skip message.
        folder_path: Directory to write into; created when missing.
        min_content_length: Minimum number of characters ``content`` must have
            for the article to be saved.

    Returns:
        None.  When the title or content is empty, or the content is shorter
        than ``min_content_length``, a message is printed and nothing is
        written.  A file with the same name is overwritten.
    """
    if not title or not content or len(content) < min_content_length:
        print(f"Skipping saving article from {article_url} due to missing or incomplete content.")
        return

    # exist_ok=True creates the folder without a separate existence check
    os.makedirs(folder_path, exist_ok=True)

    # title is guaranteed non-empty at this point; cap the length to stay
    # within file-system limits.
    file_name = sanitize_filename(title)[:100]
    file_path = os.path.join(folder_path, file_name + '.txt')

    # Write the article content to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(f"Title: {title}\n")
        file.write(f"Date: {date}\n")
        file.write(f"\n{content}")
    print(f"Saved article: {file_name}")

# Main function to crawl multiple pages of the news site and save articles
def crawl_vlr_news(max_pages=5):
    """Crawl the news listing pages and save every article that can be parsed.

    Listing pages ``1`` to ``max_pages`` of ``https://www.vlr.gg/news/`` are
    fetched in order.  Every article link found on a page is downloaded,
    parsed and handed to ``save_article_to_file``, with a one-second pause
    after each article.

    Args:
        max_pages: Number of listing pages to visit.

    Returns:
        None.  Progress and failures are reported on stdout.  A listing page
        that cannot be fetched is skipped; a page without article links ends
        the crawl.
    """
    base_url = 'https://www.vlr.gg/news/?page='
    session = create_session()

    try:
        for page in range(1, max_pages + 1):
            news_page_url = base_url + str(page)
            news_page_html = get_html(news_page_url, session)

            if not news_page_html:
                print(f"Failed to retrieve page: {news_page_url}")
                continue

            article_links = extract_article_links(news_page_html)
            if not article_links:
                print(f"No more articles found on page {page}. Stopping.")
                break

            for article_url in article_links:
                article_html = get_html(article_url, session)
                if article_html:
                    title, date, content = extract_article_content(article_html)
                    # save_article_to_file validates title/content itself and
                    # prints a skip message, so unparseable articles are not
                    # dropped without a trace.
                    save_article_to_file(title, date, content, article_url)
                else:
                    print(f"Failed to retrieve article: {article_url}")

                time.sleep(1)  # Add a small delay between requests to avoid overwhelming the server
    finally:
        # Release the session's connections even if the crawl is interrupted
        session.close()

In [8]:
# Crawl listing pages 1-5; this performs live HTTP requests and writes the
# saved articles into the default 'articles' folder.
crawl_vlr_news(max_pages=5)

Saved article: Team Liquid announces kamo, LohaN, yaotziN
Saved article: DFM announce full 2025 roster
Saved article: JDG Esports signs sword9 and hypnotizing
Saved article: Wolves Esports signs Lysoar and Fayde
Saved article: Gen.G re-signs Munchkin
Saved article: T1 adds BuZz
Saved article: Chet suspended through Masters Bangkok
Saved article: BBL Esports bids farewell to QutionerX, Brave, and reazy
Saved article: T1 picks up Meteor
Saved article: LEVIATÁN summons Demon1
Saved article: Sentinels adds Reduxx as their sixth
Saved article: DRX signs HYUNMIN and freeing
Saved article: NRG completes roster, signs mada, Verno, and bonkar
Saved article: BBL Esports signs LewN
Saved article: RRQ round off roster with Kush and Warbirds
Saved article: JDG parts ways with Anaks and Billyo
Saved article: NRG releases Demon1
Saved article: T1 signs Sylvan, Indigo, and CheongGak
Saved article: BOOM Esports part ways with ZesBeeW and Meow
Saved article: Cloud9 signs Rossy and v1c
Saved article: 100

In [86]:

def clean_html_content(html_content):
    """Return ``html_content`` re-serialized without its hover-card spans.

    The markup is parsed with BeautifulSoup's ``html.parser``; every ``<span>``
    whose class matches ``\\bwf-hover-card\\b`` is removed together with its
    contents, and the remaining tree is converted back to a string.
    """
    hover_card_class = re.compile(r'\bwf-hover-card\b')

    document = BeautifulSoup(html_content, "html.parser")
    hover_cards = document.find_all("span", class_=hover_card_class)
    for node in hover_cards:
        node.decompose()

    return str(document)
    
# Function to extract title, author, date, and the text of all <p>/<li> tags from a saved article page
def extract_article_details(file_path):
    """Parse a saved article HTML file into ``(title, author, date, content)``.

    Spans whose class matches ``wf-hover-card`` and divs whose class matches
    ``post-container`` are removed from the parsed tree before any text is
    collected.

    Args:
        file_path: Path to a UTF-8 encoded HTML file.

    Returns:
        A 4-tuple of strings:

        * title   - stripped text of the ``<title>`` tag, or ``"Unknown Title"``
          when the tag is missing.
        * author  - stripped text of the first ``<a class="article-meta-author">``,
          or ``"Unknown Author"`` when it is missing.
        * date    - stripped ``title`` attribute of the first
          ``<span class="js-date-toggle">``, or ``"Unknown Date"`` when the
          element or the attribute is missing.
        * content - text of every remaining ``<p>`` and ``<li>`` element, with
          each run of whitespace collapsed to a single space.
    """
    # Read the content of the file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove unwanted elements first so their text never reaches the output
    unwanted = (
        ("span", re.compile(r'\bwf-hover-card\b')),
        ("div", re.compile(r'\bpost-container\b')),
    )
    for tag_name, class_pattern in unwanted:
        for node in soup.find_all(tag_name, class_=class_pattern):
            node.decompose()

    # soup.find returns None on a miss; fall back to placeholders so one
    # malformed file does not abort a whole batch with AttributeError.
    title_tag = soup.find('title')
    title = title_tag.text.strip() if title_tag is not None else "Unknown Title"

    author_tag = soup.find('a', class_='article-meta-author')
    author = author_tag.text.strip() if author_tag is not None else "Unknown Author"

    date_tag = soup.find('span', class_='js-date-toggle')
    date_value = date_tag.get('title') if date_tag is not None else None
    date = date_value.strip() if date_value is not None else "Unknown Date"

    content = "\n".join(p.text for p in soup.find_all(['p', 'li']))
    # Collapsing every whitespace run (newlines included) yields single-line text
    content = re.sub(r'\s+', ' ', content).strip()
    return title, author, date, content

# Function to format the extracted details as one text block
def display_article_details(title, author, date, content):
    """Return the article fields as a labelled, newline-terminated text block.

    The result consists of ``Title:``, ``Author:`` and ``Date:`` lines, a blank
    line, a ``Content:`` heading and finally ``content`` followed by a newline.
    """
    sections = (
        f"Title: {title}\n",
        f"Author: {author}\n",
        f"Date: {date}\n",
        f"\nContent:\n{content}\n",
    )
    return ''.join(sections)

def process_folder(input_folder, output_folder):
    """Convert every ``.txt`` file in ``input_folder`` into a formatted summary.

    Each matching file is passed to ``extract_article_details``; the result is
    rendered with ``display_article_details`` and written under the same file
    name in ``output_folder``, which is created when it does not exist.  One
    ``Processed: <name>`` line is printed per file.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        # Only .txt files are converted; everything else is ignored
        if not filename.endswith('.txt'):
            continue

        source_path = os.path.join(input_folder, filename)
        target_path = os.path.join(output_folder, filename)

        # Build the full text before opening the target for writing
        art_title, art_author, art_date, art_content = extract_article_details(source_path)
        rendered = display_article_details(art_title, art_author, art_date, art_content)

        with open(target_path, 'w', encoding='utf-8') as output_file:
            output_file.write(rendered)

        print(f'Processed: {filename}')

# Example usage: read the HTML dumps in 'articles/original' and write the
# extracted text summaries to 'articles/fixed'.
# NOTE(review): 'articles/original' is the output folder of the temper(...) cell
# that appears further down in this notebook, so that cell has to be run first
# on a fresh kernel.
input_folder = 'articles/original'
output_folder = 'articles/fixed'

process_folder(input_folder, output_folder)



Processed: 100 Thieves pick up zander.txt
Processed: 100 Thieves release bang.txt
Processed: 100T say goodbye to Mikes.txt
Processed: 2Game Esports lift Americas Ascension trophy.txt
Processed: Acend pull out of Valorant.txt
Processed: alecks addresses mistakes made against G2 in press conference.txt
Processed: alecks_ _I just let them ride the high_.txt
Processed: Apeks win VCT EMEA Ascension.txt
Processed: Ardiis enters restricted free agency.txt
Processed: Assistant coach DrewSpark departs Sentinels.txt
Processed: BBL allows pAura to explore options.txt
Processed: BBL Esports bids farewell to QutionerX, Brave, and reazy.txt
Processed: BBL Esports signs LewN.txt
Processed: benjyfishy_ _One of my goals was to have my family watch me at an international event_.txt
Processed: Bilibili Gaming looks back on 2024 season in press conference.txt
Processed: BLEED announce roster overhaul, bench four players.txt
Processed: Boaster revels on team qualities heading into Champions playoffs in pre

In [64]:

# Example usage: trim every .txt file in 'articles' and store the result in
# 'articles/original'.
# NOTE(review): this rebinds the module-level input_folder/output_folder names
# that the process_folder cell also assigns.
input_folder = 'articles'
output_folder = 'articles/original'

def temper(input_folder, output_folder):
    """Copy each ``.txt`` file to ``output_folder``, dropping everything before ``<head>``.

    Args:
        input_folder: Directory whose ``.txt`` files are read (UTF-8).
        output_folder: Directory the trimmed copies are written to under the
            same file names; created when it does not exist.

    Returns:
        None.  One ``Processed: <name>`` line is printed per file.  A file that
        contains no ``<head>`` marker is copied unchanged and a notice is
        printed for it.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so that files are handled in a reproducible order
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.txt'):
            continue

        input_file_path = os.path.join(input_folder, filename)
        output_file_path = os.path.join(output_folder, filename)

        with open(input_file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # find() instead of index(): a file without <head> must not raise
        # ValueError and abort the rest of the batch.
        head_start = html_content.find("<head>")
        if head_start == -1:
            print(f'No <head> found in {filename}; copying unchanged')
        else:
            html_content = html_content[head_start:]

        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(html_content)

        print(f'Processed: {filename}')
            
# Run the trimming pass with the folders assigned at the top of this cell
temper(input_folder, output_folder)

Processed: 100 Thieves pick up zander.txt
Processed: 100 Thieves release bang.txt
Processed: 100T say goodbye to Mikes.txt
Processed: 2Game Esports lift Americas Ascension trophy.txt
Processed: Acend pull out of Valorant.txt
Processed: alecks addresses mistakes made against G2 in press conference.txt
Processed: alecks_ _I just let them ride the high_.txt
Processed: Apeks win VCT EMEA Ascension.txt
Processed: Ardiis enters restricted free agency.txt
Processed: Assistant coach DrewSpark departs Sentinels.txt
Processed: BBL allows pAura to explore options.txt
Processed: BBL Esports bids farewell to QutionerX, Brave, and reazy.txt
Processed: BBL Esports signs LewN.txt
Processed: benjyfishy_ _One of my goals was to have my family watch me at an international event_.txt
Processed: Bilibili Gaming looks back on 2024 season in press conference.txt
Processed: BLEED announce roster overhaul, bench four players.txt
Processed: Boaster revels on team qualities heading into Champions playoffs in pre