In [1]:
import requests
import re
import os
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time
from bs4 import BeautifulSoup
import pandas as pd
import json
# Suppress SSL warnings if SSL verification is disabled
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
# Region codes passed to the `region` query parameter of the vlr.gg stats page.
regions = ["na", "eu", "ap", "sa", "jp", "oce", "mn", "kr", "gc", "cg"]

# One dict per scraped (player, region) row: name text, profile URL, region code.
players_data = []
for region in regions:

    # URL of the all-time stats page for this region
    stats_url = "https://www.vlr.gg/stats/?timespan=all&region=" + region

    # Fetch the page. verify=False means the server certificate is not checked.
    # The timeout stops a stalled connection from hanging the notebook, and the
    # except clause lets the remaining regions be scraped if one request fails.
    try:
        response = requests.get(stats_url, verify=False, timeout=60)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve the page for region {region}: {e}")
        continue

    # Check if the page was successfully fetched
    if response.status_code == 200:
        # Parse the content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Player links are matched by their exact inline style string, so this
        # selector returns nothing if the site changes that markup.
        player_links = soup.find_all('a', style="display: flex; align-items: center; padding-right: 0;")

        # Record the link text and the absolute profile URL of every player link
        for player in player_links:
            href = player.get('href')
            if href is None:
                # An anchor without a target cannot be turned into a profile URL.
                continue
            player_name = player.text.strip()
            player_url = "https://www.vlr.gg" + href
            players_data.append({"Player Name": player_name, "Profile URL": player_url, "Player Region": region})
        print(region)
        print(len(player_links))
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

# Convert the list to a DataFrame
players_df = pd.DataFrame(players_data)

# Save the DataFrame to a CSV file
players_df.to_csv('vlr_gg_players.csv', index=False)

print("Data has been saved to 'vlr_gg_players.csv'.")

na
1740


In [11]:
def clean_spaces(s):
    """Return ``s`` with every newline and tab removed, then stripped at both ends."""
    for unwanted in ('\n', '\t'):
        s = s.replace(unwanted, '')
    return s.strip()

def get_agent_stats(soup):
    """Extract the per-agent statistics rows from a parsed player page.

    Parameters
    ----------
    soup
        Parsed page exposing BeautifulSoup's ``find`` API. The first
        ``<table class="wf-table">`` is taken as the agent stats table.

    Returns
    -------
    list of dict
        One dict per data row, keyed by 'Agent' followed by the stat labels,
        with every value kept as stripped cell text. Returns
        ``[{'record': 'record not found'}]`` when the page has no such table.
        Rows that do not contain all 17 cells are skipped.
    """
    # Labels for the cells that follow the agent icon cell, in column order.
    stat_columns = (
        'Use %', 'Rounds Played', 'Rating', 'ACS', 'K/D', 'ADR', 'KAST %',
        'KPR', 'APR', 'FKPR', 'FDPR', 'Kills', 'Deaths', 'Assists',
        'First Bloods', 'First Deaths',
    )

    stats_table = soup.find('table', class_='wf-table')
    if stats_table is None:
        return [{'record': 'record not found'}]

    agent_stats = []
    for row in stats_table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) <= len(stat_columns):
            # Not a full data row; indexing it would raise IndexError.
            continue

        # The agent's name is in the alt attribute of the icon image; fall back
        # to the cell text when the image is missing.
        icon = cols[0].find('img')
        agent_name = icon.get('alt', '') if icon is not None else cols[0].text.strip()

        stats = {'Agent': agent_name}
        for offset, label in enumerate(stat_columns, start=1):
            stats[label] = cols[offset].text.strip()
        agent_stats.append(stats)
    return agent_stats
    

In [13]:
def read_player_html(file_path, file_path_all):
    """Parse two saved copies of a player's profile page into one record.

    Parameters
    ----------
    file_path : str
        Path to the saved HTML page. Its agent table is reported under
        '60 days Agent Stats'; the player ID, real name, teams, recent matches
        and event placements are also read from this page.
    file_path_all : str
        Path to the saved HTML page whose agent table is reported under
        'All Time Agent Stats'.

    Returns
    -------
    dict
        Keys: 'Player ID', 'Real Name', 'Current Team', 'Recent Matches',
        'Past Teams', 'Event Placements', 'All Time Agent Stats',
        '60 days Agent Stats'. The four section lists are empty when the
        corresponding section heading is absent from the page.

    Raises
    ------
    AttributeError
        If the page has no ``h1.wf-title`` or ``h2.player-real-name`` element.
    """
    # Load both saved pages
    with open(file_path, 'r', encoding='utf-8') as file:
        page_content = file.read()
    with open(file_path_all, 'r', encoding='utf-8') as file:
        page_content_all = file.read()

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(page_content, 'html.parser')
    soup_all = BeautifulSoup(page_content_all, 'html.parser')

    def section_heading(title):
        # Sections are located by the <h2> whose own text contains the title.
        return soup.find('h2', string=lambda text: text and title in text.strip())

    # Extract Player ID and Real Name
    player_id = soup.find('h1', class_="wf-title").text.strip()
    real_name = soup.find('h2', class_='player-real-name').text.strip()

    agent_stats_60 = get_agent_stats(soup)
    agent_stats_all = get_agent_stats(soup_all)

    # Extract Current Team
    current_team_heading = section_heading('Current Teams')
    if current_team_heading is None:
        current_team = {
            'Team Name': 'Team not found',
            'Joined Date': 'Team not found'
        }
    else:
        team_card = current_team_heading.find_next('div', class_='wf-card')
        team_name_div = team_card.find('div', style='font-weight: 500;')
        # The join text is taken from the second 'ge-text-light' div after the name.
        joined_div = team_name_div.find_next('div', class_='ge-text-light').find_next('div', class_='ge-text-light')
        current_team = {
            'Team Name': team_name_div.text.strip(),
            'Joined Date': joined_div.text.strip()
        }

    # Extract Recent Matches (guarded like the team sections so a page without
    # this section yields an empty list instead of an AttributeError)
    recent_matches = []
    match_cards = []
    recent_heading = section_heading('Recent Results')
    if recent_heading is not None:
        matches_container = recent_heading.find_next('div')
        if matches_container is not None:
            match_cards = matches_container.find_all('a', class_='wf-card')

    for match in match_cards:
        match_date = clean_spaces(match.find('div', class_='m-item-date').text.strip())
        event_name = clean_spaces(match.find('div', class_='m-item-event').text.strip())
        result = match.find('div', class_='m-item-result').text.strip().replace('\n', '')
        # The second team-name span is recorded as the opponent.
        opponent = match.find_all('span', class_='m-item-team-name')[1].text.strip()
        recent_matches.append({
            'Event': event_name,
            'Opponent': opponent,
            'Result': result,
            'Date': match_date
        })

    # Extract Past Teams
    past_teams = []
    team_items = []
    past_teams_heading = section_heading('Past Teams')
    if past_teams_heading is not None:
        past_teams_card = past_teams_heading.find_next('div', class_='wf-card')
        team_items = past_teams_card.find_all('a', class_='wf-module-item')

    for team in team_items:
        team_name_div = team.find('div', style='font-weight: 500;')
        duration_div = team_name_div.find_next('div', class_='ge-text-light').find_next('div', class_='ge-text-light')
        past_teams.append({
            'Team Name': team_name_div.text.strip(),
            'Duration': duration_div.text.strip()
        })

    # Extract Event Placements (guarded for the same reason as Recent Matches)
    event_placements = []
    event_items = []
    placements_heading = section_heading('Event Placements')
    if placements_heading is not None:
        placements_card = placements_heading.find_next('div', class_='wf-card')
        if placements_card is not None:
            event_items = placements_card.find_all('a', class_='wf-module-item')

    for event in event_items:
        event_name = event.find('div', class_='text-of').text.strip()
        placement = clean_spaces(event.find('span', class_='ge-text-light').text.strip())
        event_placements.append({
            'Event': event_name,
            'Placement': placement
        })

    # Assemble the record; nested values stay as Python dicts/lists.
    player_data = {
        'Player ID': player_id,
        'Real Name': real_name,
        'Current Team': current_team,
        'Recent Matches': recent_matches,
        'Past Teams': past_teams,
        'Event Placements': event_placements,
        'All Time Agent Stats': agent_stats_all,
        '60 days Agent Stats': agent_stats_60
    }

    return player_data

In [7]:


# Read the CSV file containing player names and profile URLs
csv_file = 'vlr_gg_players.csv'
players_df = pd.read_csv(csv_file)

# Create the directory that stores the raw HTML; exist_ok makes re-running the
# cell safe without a separate existence check.
# NOTE(review): this cell writes to 'player_profiles_html_30' while the later
# process_folder call reads 'player_profiles_html_60' and
# 'player_profiles_html_all' - confirm which folder/timespan is intended.
output_directory = 'player_profiles_html_30'
os.makedirs(output_directory, exist_ok=True)

# Loop through each row in the DataFrame
for index, row in players_df.iterrows():
    player_name = row['Player Name']
    player_url = row['Profile URL']

    # Keep only the first line of the scraped name, then replace spaces and
    # path separators so it can be used as a file name.
    player_name = player_name.split('\n')[0]
    safe_player_name = player_name.replace(" ", "_").replace("/", "_").replace("\\", "_")

    # Files already on disk act as a cache: those players are not fetched again.
    file_path = os.path.join(output_directory, f"{safe_player_name}.txt")
    if os.path.exists(file_path):
        print(f"{safe_player_name}.txt already exists. Skipping")
        continue

    # Fetch the profile page. verify=False means the server certificate is not
    # checked; the timeout turns a stalled connection into a RequestException,
    # which is reported below so the loop can move on.
    try:
        response = requests.get(player_url, verify=False, timeout=60)

        if response.status_code == 200:
            # Save the HTML content to a text file
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)

            print(f"Saved {player_name}'s profile to {file_path}")
        else:
            print(f"Failed to retrieve {player_name}'s profile. Status code: {response.status_code}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {player_name}'s profile: {e}")

print("All player profiles have been processed.")

keenc.txt already exists. Skipping
florescent.txt already exists. Skipping
woddy.txt already exists. Skipping
dodonut.txt already exists. Skipping
voriac.txt already exists. Skipping
WARDELL.txt already exists. Skipping
yay.txt already exists. Skipping
malibu.txt already exists. Skipping
kiara.txt already exists. Skipping
Verno.txt already exists. Skipping
StarBound.txt already exists. Skipping
kaho.txt already exists. Skipping
Kyu.txt already exists. Skipping
Smoke.txt already exists. Skipping
N4RRATE.txt already exists. Skipping
PLAYER.txt already exists. Skipping
Cryocells.txt already exists. Skipping
draystar.txt already exists. Skipping
sinatraa.txt already exists. Skipping
dicey.txt already exists. Skipping
happy.txt already exists. Skipping
Cheddy.txt already exists. Skipping
Demon1.txt already exists. Skipping
Spaz.txt already exists. Skipping
c4Lypso.txt already exists. Skipping
Sushie.txt already exists. Skipping
TenZ.txt already exists. Skipping
mimiFPS.txt already exists. S

In [17]:

import os
from tqdm import tqdm
def process_folder(input_folder, input_folder_all, output_path):
    """Parse every saved profile in ``input_folder`` and write one CSV.

    Parameters
    ----------
    input_folder : str
        Folder of saved profile pages (``.txt`` files), passed to
        ``read_player_html`` as ``file_path``.
    input_folder_all : str
        Folder holding a same-named file for each profile, passed to
        ``read_player_html`` as ``file_path_all``.
    output_path : str
        Destination CSV path; one row per parsed profile, no index column.

    Profiles with no same-named file in ``input_folder_all`` are reported and
    skipped instead of aborting the whole run.
    """
    all_data = []
    # os.listdir returns names in arbitrary order; sorting keeps the CSV row
    # order reproducible between runs.
    for filename in tqdm(sorted(os.listdir(input_folder))):
        if not filename.endswith('.txt'):
            continue

        input_file_path = os.path.join(input_folder, filename)
        input_file_all_path = os.path.join(input_folder_all, filename)
        if not os.path.exists(input_file_all_path):
            print(f"{filename} has no counterpart in {input_folder_all}. Skipping")
            continue

        # Parse both saved pages into a single record
        all_data.append(read_player_html(input_file_path, input_file_all_path))

    all_data = pd.DataFrame(all_data)
    all_data.to_csv(output_path, index=False)
# Example usage: parse the saved profile pages and write the combined CSV.
input_folder = 'player_profiles_html_60'
input_folder_all = 'player_profiles_html_all'
output_path = 'player_profiles.csv'

process_folder(
    input_folder=input_folder,
    input_folder_all=input_folder_all,
    output_path=output_path,
)

100%|██████████| 5777/5777 [06:55<00:00, 13.90it/s]


In [25]:
# Spot-check the parser on one saved profile and print the resulting record.
print(
    read_player_html(
        file_path="player_profiles_html_30/ZmjjKK.txt",
        file_path_all="player_profiles_html_all/ZmjjKK.txt",
    )
)

{'Player ID': 'ZmjjKK', 'Real Name': 'Zheng Yongkang (郑永康)', 'Current Team': {'Team Name': 'EDward Gaming', 'Joined Date': 'joined in September 2021'}, 'Recent Matches': [{'Event': 'VCT 2024: ChampionsPlayoffs ⋅GF', 'Opponent': 'Team Heretics', 'Result': '3:2', 'Date': '2024/08/252:00 am'}, {'Event': 'VCT 2024: ChampionsPlayoffs ⋅UBF', 'Opponent': 'LEVIATÁN', 'Result': '2:1', 'Date': '2024/08/232:00 am'}, {'Event': 'VCT 2024: ChampionsPlayoffs ⋅UBSF', 'Opponent': 'Sentinels', 'Result': '2:1', 'Date': '2024/08/173:00 am'}, {'Event': 'VCT 2024: ChampionsPlayoffs ⋅UBQF', 'Opponent': 'Trace Esports', 'Result': '2:0', 'Date': '2024/08/145:25 am'}, {'Event': 'VCT 2024: ChampionsGroup Stage ⋅Decider (D)', 'Opponent': 'Paper Rex', 'Result': '2:1', 'Date': '2024/08/115:15 am'}], 'Past Teams': [{'Team Name': 'Qing Jiu Club', 'Duration': ''}, {'Team Name': 'CBT Gaming', 'Duration': ''}, {'Team Name': 'Royal Never Give Up', 'Duration': 'joined in May 2020'}], 'Event Placements': [{'Event': 'Valora

In [45]:
# Load the parsed profiles (main) and the scraped player/region listing (minor).
main_df = pd.read_csv('player_profiles.csv')
minor_df = pd.read_csv('vlr_gg_players.csv')

# Keep only the first line of each scraped name so it can be matched against
# the 'Player ID' column of the profiles.
minor_df['Player Name'] = minor_df['Player Name'].apply(lambda x: x.partition('\n')[0])

# Collapse the listing to one row per name, collecting its regions into a list.
minor_df_grouped = (
    minor_df.groupby('Player Name')['Player Region']
    .apply(list)
    .reset_index()
)

# Left-join on 'Player ID' == 'Player Name' so every profile row is kept, then
# drop the now-redundant join key from the right-hand side.
merged_df = pd.merge(
    main_df,
    minor_df_grouped,
    left_on='Player ID',
    right_on='Player Name',
    how='left',
).drop(columns=['Player Name'])

# Move 'Player Region' to list index 5, i.e. make it the sixth column.
new_col = 'Player Region'
cols = list(merged_df.columns)
cols.remove(new_col)
cols.insert(5, new_col)

# Apply the new column order to the DataFrame
merged_df = merged_df[cols]

# Show the first rows, then save the result to a new CSV file
print(merged_df.head())
merged_df.to_csv('player_profiles_region.csv', index=False)

  Player ID           Real Name  \
0      0n1y     He Yiting (何奕霆)   
1      1001                 NaN   
2       10K         Théo Navaro   
3       10X  Choi Jin-woo (최진우)   
4       123                 NaN   

                                        Current Team  \
0  {'Team Name': 'Team not found', 'Joined Date':...   
1   {'Team Name': 'Panny Sitter', 'Joined Date': ''}   
2  {'Team Name': 'Team not found', 'Joined Date':...   
3  {'Team Name': 'Team not found', 'Joined Date':...   
4   {'Team Name': 'Team Kobolds', 'Joined Date': ''}   

                                      Recent Matches  \
0  [{'Event': 'VCT CN EVO A3: HeritabilityLR1', '...   
1  [{'Event': 'GC 24 SEA: Stage 2Group Stage ⋅Gro...   
2  [{'Event': 'VCL FR Revolution: Split 2Regular ...   
3  [{'Event': 'UTAGE S5Playoffs ⋅QF', 'Opponent':...   
4  [{'Event': 'MVT 3: S1Playoffs ⋅LBF', 'Opponent...   

                                          Past Teams Player Region  \
0  [{'Team Name': 'KeepBest Gaming', 'Duratio

In [37]:

# NOTE(review): this cell looks like an unfilled template. `df` is not defined
# in any visible cell, and 'column1' / 'column2' / 'new_column' do not match any
# column produced above; the recorded output of this cell is
# `KeyError: 'column1'`. Confirm whether it should be filled in or removed.
# Loop through each row and perform operations
for index, row in df.iterrows():
    # Perform some operation on each row
    # For example, let's calculate the difference between two columns
    result = row['column1'] - row['column2']
    
    # Store the result in the new column
    df.at[index, 'new_column'] = result

# Display the updated DataFrame
print(df.head())

# Save the updated DataFrame back to a CSV file (if needed)
df.to_csv('updated_file.csv', index=False)

KeyError: 'column1'