In [1]:
import requests
import time
import pandas as pd
from datetime import datetime
from random import choice
import threading
import os

In [2]:
# Configuration
DAILY_MATCHES_LIMIT = 3000  # number of pro matches requested per daily batch
TOTAL_DAYS = 20  # number of daily batches the main loop runs
API_DELAY = 1.2  # seconds slept after list/detail API calls
MAX_RETRIES = 3  # attempts per request before make_api_request gives up
PROXY_FILE = "proxy.txt"  # one proxy per line, formatted as ip:port:user:password
DATA_DIR = "collected_data"  # directory that receives the per-day CSV files

# Global state
proxies = []  # list of proxy dicts in the form passed to requests.get(proxies=...)
proxy_lock = threading.Lock()  # held while reading or modifying proxies / current_proxy
current_proxy = None  # proxy most recently returned by get_proxy(), or None

In [None]:
def load_proxies():
    """Load the proxy pool from PROXY_FILE into the global ``proxies`` list.

    Each usable line has the form ``ip:port:user:password``; lines with any
    other number of ``:``-separated fields are ignored.

    The pool is replaced rather than extended, so calling this function
    again (for example when re-running a notebook cell) does not duplicate
    entries. If the file cannot be read, the error is printed and the pool
    is left empty.
    """
    loaded = []
    try:
        with open(PROXY_FILE, 'r') as f:
            for line in f:
                parts = line.strip().split(':')
                if len(parts) != 4:
                    continue
                ip, port, user, pwd = parts
                # The same HTTP proxy endpoint is used for both schemes.
                endpoint = f'http://{user}:{pwd}@{ip}:{port}'
                loaded.append({'http': endpoint, 'https': endpoint})
    except (OSError, UnicodeError) as e:
        print(f"Ошибка загрузки прокси: {e}")
        # Discard anything parsed before the failure, as well as any
        # previously loaded pool.
        proxies.clear()
        return

    # Replace the contents in place so every reference to the list sees
    # the fresh pool.
    proxies[:] = loaded
    print(f"Загружено {len(proxies)} прокси")

def get_proxy():
    """Pick a random proxy from the pool while holding ``proxy_lock``.

    The chosen proxy is also stored in the global ``current_proxy``.
    Returns ``None`` (leaving ``current_proxy`` untouched) when the pool
    is empty.
    """
    global current_proxy
    with proxy_lock:
        if proxies:
            current_proxy = choice(proxies)
            return current_proxy
        return None

def rotate_proxy():
    """Drop the proxy stored in ``current_proxy`` from the pool.

    Runs under ``proxy_lock``. If the current proxy is still in the pool it
    is removed and a message is printed; in every case ``current_proxy`` is
    reset to ``None``.
    """
    global current_proxy
    with proxy_lock:
        failed = current_proxy
        current_proxy = None
        if failed in proxies:
            proxies.remove(failed)
            print(f"Удален проблемный прокси: {failed}")

def make_api_request(url, params=None, retry=0):
    """Perform a GET request through a random proxy and return parsed JSON.

    Args:
        url: Address to request.
        params: Optional query parameters passed to ``requests.get``.
        retry: Number of attempts already used; attempts continue from
            this value up to ``MAX_RETRIES``.

    Returns:
        The decoded JSON body, or ``None`` once all attempts have failed.

    A proxy is removed from the pool only when the failure looks like the
    proxy's fault (connection/proxy/timeout errors or an undecodable body).
    An HTTP error status means the request reached a server and got a reply,
    so the proxy is kept: discarding it for a 404 or a rate-limit response
    would needlessly drain the pool.
    """
    for attempt in range(retry, MAX_RETRIES):
        proxy = get_proxy()
        try:
            response = requests.get(
                url,
                params=params,
                proxies=proxy,
                timeout=10
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            # Error status from the server: retry, but keep the proxy.
            print(f"Ошибка запроса ({attempt+1}/{MAX_RETRIES}): {e}")
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers invalid JSON on requests versions where the
            # decode error is not a RequestException subclass.
            print(f"Ошибка запроса ({attempt+1}/{MAX_RETRIES}): {e}")
            if proxy:
                rotate_proxy()

        # Pause before the next attempt; no point waiting before giving up.
        if attempt + 1 < MAX_RETRIES:
            time.sleep(2)

    print(f"Достигнут максимум попыток для {url}")
    return None

def fetch_pro_matches(last_match_id=None, limit=100):
    """Fetch up to ``limit`` professional matches older than ``last_match_id``.

    Pages are requested from the OpenDota ``proMatches`` endpoint, each page
    continuing from the last match ID of the previous one, until ``limit``
    matches are collected or the API returns nothing more.

    Args:
        last_match_id: Upper bound (exclusive) for match IDs; ``None`` starts
            from the newest matches.
        limit: Maximum number of matches to return.

    Returns:
        ``(matches, last_match_id)`` where ``matches`` holds at most
        ``limit`` entries and ``last_match_id`` is the ID of the last match
        actually returned. If nothing was fetched, the incoming
        ``last_match_id`` is returned unchanged.
    """
    pro_matches_url = 'https://api.opendota.com/api/proMatches'
    matches = []

    while len(matches) < limit:
        params = {'less_than_match_id': last_match_id} if last_match_id else {}
        data = make_api_request(pro_matches_url, params)

        if not data:
            break

        # With a bound set, keep only matches with a strictly smaller ID.
        if last_match_id:
            data = [m for m in data if m['match_id'] < last_match_id]
            if not data:
                break

        matches.extend(data)
        last_match_id = data[-1]['match_id']
        print(f"Получено {len(matches)}/{limit} матчей (старее {last_match_id})")
        time.sleep(API_DELAY)

    matches = matches[:limit]
    if matches:
        # The last page may overshoot ``limit``. Report the ID of the last
        # match that is really returned, otherwise the next call would skip
        # the matches that were cut off here.
        last_match_id = matches[-1]['match_id']

    return matches, last_match_id

def fetch_match_details(match_id):
    """Request the detailed record of one match, then pause for API_DELAY.

    Returns whatever ``make_api_request`` returns for the match endpoint.
    """
    details = make_api_request(
        'https://api.opendota.com/api/matches/{}'.format(match_id)
    )
    # Throttle regardless of whether the request succeeded.
    time.sleep(API_DELAY)
    return details

def fetch_heroes_data():
    """Fetch hero statistics and index them by hero ``id``.

    Returns an empty dict when the request yields nothing.
    """
    stats = make_api_request('https://api.opendota.com/api/heroStats')
    if not stats:
        return {}

    by_id = {}
    for entry in stats:
        by_id[entry['id']] = entry
    return by_id

def fetch_patch_mapping():
    """Fetch the patch list and map each integer patch ``id`` to its name.

    Returns an empty dict when the request yields nothing.
    """
    entries = make_api_request(
        'https://raw.githubusercontent.com/odota/dotaconstants/master/build/patch.json'
    )
    if not entries:
        return {}

    mapping = {}
    for entry in entries:
        patch_id = int(entry['id'])
        mapping[patch_id] = entry['name']
    return mapping

def process_match(match_details, heroes_data, patch_mapping):
    """Flatten one match record into a single dict suitable for a CSV row.

    Args:
        match_details: Match record as returned by the match details
            endpoint; must contain a ``players`` key.
        heroes_data: Mapping of hero id to hero record.
        patch_mapping: Mapping of patch id to patch name.

    Returns:
        A dict with match-level fields plus, for each team (``radiant`` /
        ``dire``) and slot 1..5, the columns ``{team}_player_{i}_name`` and
        ``{team}_hero_{i}_{name,primary_attr,attack_type,roles,move_speed}``.
        Returns ``None`` when ``match_details`` is empty or has no
        ``players`` key.

    Notes:
        * The hero's ``localized_name`` is stored in ``{team}_hero_{i}_name``
          (the column initialised for it).
        * Players are assigned to a team strictly by ``player_slot``; extra
          players beyond five per team are ignored instead of spilling into
          the other team.
        * ``null`` values for ``league``, ``profile``, ``roles``, ``players``
          or ``radiant_win`` are tolerated; a ``null`` ``radiant_win`` yields
          ``rad_winner = None``.
    """
    if not match_details or 'players' not in match_details:
        return None

    start_time = match_details.get('start_time')
    league = match_details.get('league') or {}
    radiant_win = match_details.get('radiant_win', False)

    match_info = {
        'match_id': match_details.get('match_id'),
        'date': pd.to_datetime(start_time, unit='s') if start_time else None,
        'tournament': league.get('name'),
        'radiant_team': match_details.get('radiant_name'),
        'dire_team': match_details.get('dire_name'),
        'game_patch': patch_mapping.get(match_details.get('patch')),
        'rad_winner': None if radiant_win is None else int(radiant_win),
    }

    # Pre-create every per-player column so all rows share the same layout.
    for team in ['radiant', 'dire']:
        for i in range(1, 6):
            match_info[f'{team}_player_{i}_name'] = None
            match_info[f'{team}_hero_{i}_name'] = None
            match_info[f'{team}_hero_{i}_primary_attr'] = None
            match_info[f'{team}_hero_{i}_attack_type'] = None
            match_info[f'{team}_hero_{i}_roles'] = None
            match_info[f'{team}_hero_{i}_move_speed'] = None

    # (key in the hero record, suffix of the output column)
    hero_columns = (
        ('localized_name', 'name'),
        ('primary_attr', 'primary_attr'),
        ('attack_type', 'attack_type'),
        ('move_speed', 'move_speed'),
    )

    next_slot = {'radiant': 1, 'dire': 1}
    for player in match_details['players'] or []:
        # Slots below 128 belong to Radiant, the rest to Dire.
        team = 'radiant' if player['player_slot'] < 128 else 'dire'
        idx = next_slot[team]
        if idx > 5:
            continue
        next_slot[team] = idx + 1

        # Player name lookup is best-effort: failures leave the name as None.
        account_id = player.get('account_id')
        if account_id and account_id != 4294967295:
            try:
                player_url = f'https://api.opendota.com/api/players/{account_id}'
                player_data = make_api_request(player_url)
                if player_data:
                    profile = player_data.get('profile') or {}
                    match_info[f'{team}_player_{idx}_name'] = profile.get('personaname')
            except Exception as e:
                print(f"Ошибка получения имени игрока: {e}")

        # Hero attributes
        hero_data = heroes_data.get(player.get('hero_id')) or {}
        if hero_data:
            for source_key, column in hero_columns:
                match_info[f'{team}_hero_{idx}_{column}'] = hero_data.get(source_key)
            match_info[f'{team}_hero_{idx}_roles'] = ','.join(hero_data.get('roles') or [])

    return match_info

def collect_daily_matches(last_match_id=None):
    """Collect one batch of pro matches, process them and save them as CSV.

    Args:
        last_match_id: Upper bound (exclusive) for match IDs, or ``None`` to
            start from the newest matches.

    Returns:
        ``(count, new_last_id)``: the number of processed matches and the
        bound to use for the next batch. If no matches could be listed,
        returns ``(None, last_match_id)`` with the incoming ID unchanged.

    The CSV is written to ``DATA_DIR`` under a name containing today's date.
    If processing is interrupted (an exception or Ctrl+C), the matches
    processed so far are still written before the error propagates, so a
    long-running batch is not lost.
    """
    matches, new_last_id = fetch_pro_matches(last_match_id, DAILY_MATCHES_LIMIT)
    if not matches:
        return None, last_match_id  # keep the original ID when nothing was fetched

    heroes_data = fetch_heroes_data()
    patch_mapping = fetch_patch_mapping()
    processed_matches = []
    finished = False

    try:
        for match in matches:
            details = fetch_match_details(match['match_id'])
            if not details:
                continue

            processed = process_match(details, heroes_data, patch_mapping)
            if processed:
                processed_matches.append(processed)
                print(f"Обработан матч {match['match_id']}")
        finished = True
    finally:
        # On interruption, save only if there is something to save, so an
        # earlier file for the same date is not replaced by an empty one.
        if finished or processed_matches:
            os.makedirs(DATA_DIR, exist_ok=True)
            filename = os.path.join(
                DATA_DIR,
                f"matches_data_{datetime.now().strftime('%Y-%m-%d')}.csv"
            )
            pd.DataFrame(processed_matches).to_csv(filename, index=False)

    return len(processed_matches), new_last_id

def main():
    """Run the interactive multi-day collection loop.

    Loads the proxy pool, asks for the starting match ID (an empty or
    invalid answer means "start from the newest matches"), then runs
    ``TOTAL_DAYS`` batches. After each batch that processed at least one
    match, sleeps for the remainder of 24 hours before the next batch.
    """
    # Proxy pool initialisation
    load_proxies()

    # Ask the user for the starting match_id
    start_match_id = input("Введите начальный match_id (максимальный ID, с которого начинать сбор старых матчей): ")
    try:
        start_match_id = int(start_match_id) if start_match_id else None
    except ValueError:
        print("Некорректный match_id. Будет использован None (самые свежие матчи).")
        start_match_id = None

    current_match_id = start_match_id

    for day in range(1, TOTAL_DAYS + 1):
        start_time = datetime.now()
        print(f"\nДень {day}/{TOTAL_DAYS} - Начало сбора данных")
        print(f"Текущий верхний граничный match_id: {current_match_id}")

        count, current_match_id = collect_daily_matches(current_match_id)
        # collect_daily_matches reports None when no matches were found;
        # treat that as zero so the comparison below cannot raise TypeError.
        count = count or 0

        elapsed = (datetime.now() - start_time).total_seconds()
        print(f"День {day} завершен. Обработано матчей: {count}")
        print(f"Новый верхний граничный ID: {current_match_id}")
        print(f"Время выполнения: {elapsed:.2f} секунд")

        if day < TOTAL_DAYS and count > 0:
            # Wait out the rest of the 24-hour window (never a negative time).
            sleep_time = max(86400 - elapsed, 0)
            print(f"Ожидание следующего дня ({sleep_time/3600:.2f} часов)...")
            time.sleep(sleep_time)

# Start the collection loop only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()

Загружено 200 прокси

День 1/20 - Начало сбора данных
Текущий верхний граничный match_id: None
Получено 100/100 матчей (старее 8309841783)


KeyboardInterrupt: 