In [1]:
import requests
import pandas as pd
from typing import Optional, Dict, List, Any
import logging

In [2]:
# Настройка логирования
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:
class SportsDataProcessor41:
    BASE_URL = "https://st-fn-cdn001.akamaized.net/olimp/ru/Europe:Berlin/gismo"
    HEADERS = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br, zstd",
        "accept-language": "ru,en;q=0.9,en-GB;q=0.8,en-US;q=0.7",
        "cache-control": "no-cache",
        "origin": "https://st-cdn001.akamaized.net",
        "pragma": "no-cache",
        "referer": "https://st-cdn001.akamaized.net",
        "sec-ch-ua": '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "cross-site",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    def fetch_data(self, url_suffix: str) -> Optional[Dict]:
        """Perform a GET request and return JSON data."""
        try:
            response = self.session.get(f"{self.BASE_URL}{url_suffix}")
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            logging.error(f"Request error: {e}")
            return None

    def _get_nested_data(self, data: Dict, keys: List[str]) -> Any:
        """Recursively retrieve nested data from a dictionary."""
        try:
            for key in keys:
                if isinstance(data, dict) and key in data:
                    data = data[key]
                elif isinstance(data, list) and isinstance(key, int) and 0 <= key < len(data):
                    data = data[key]
                else:
                    return None
            return data
        except (KeyError, TypeError, IndexError):
            logging.error("Error retrieving nested data.")
            return None

    def create_dataframe(self, data: Dict, path: List[str], columns_to_drop: List[str] = None) -> Optional[pd.DataFrame]:
        """Create a DataFrame from JSON data at the specified path."""
        result = self._get_nested_data(data, ['doc', 0, 'data'] + path)
        if not result:
            return None
        df = pd.json_normalize(result)
        if columns_to_drop:
            df.drop(columns=columns_to_drop, errors='ignore', inplace=True)
        return df

    def process_sports_hierarchy(self) -> Optional[pd.DataFrame]:
        """Process sports hierarchy data."""
        data = self.fetch_data("/config_sports/41/0/")
        if not data:
            return None

        df = self.create_dataframe(data, path=[], columns_to_drop=['_sk', '_doc'])
        if df is not None:
            df['urls0'] = "/config_tree_mini/41/0/" + df['_sid'].astype(str)
        return df

    def process_second_level(self, url_suffix: str) -> Optional[pd.DataFrame]:
        """Process second-level data."""
        data = self.fetch_data(url_suffix)
        if not data:
            return None

        df = self.create_dataframe(
            data,
            path=[0, 'realcategories'],
            columns_to_drop=['_sk', '_doc', 'uniquetournaments', 'tournaments']
        )
        if df is not None:
            df['url1'] = "/category_get/" + df['_rcid'].astype(str)
            df['url2'] = f"{url_suffix}/" + df['_rcid'].astype(str)
            self._expand_cc_column(df)
            if '_isk' in df.columns:
                df = df[df['_isk'] != 1000]
        return df

    def process_third_level(self, url_suffix: str) -> Optional[pd.DataFrame]:
        """Process third-level data."""
        data = self.fetch_data(url_suffix)
        if not data:
            return None

        return self.create_dataframe(
            data,
            path=[0, 'realcategories', 0, 'tournaments'],
            columns_to_drop=['_sk', '_doc']
        )

    @staticmethod
    def _expand_cc_column(df: pd.DataFrame) -> None:
        """Expand the 'cc' column in a DataFrame."""
        if 'cc' in df.columns:
            expanded_cc = pd.json_normalize(df['cc']).add_prefix('cc_').fillna('None')
            df.drop(columns=['cc'], inplace=True)
            df = pd.concat([df, expanded_cc], axis=1)
            
    def full_pipeline(self) -> Dict[str, Any]:
        """Full pipeline to process all data."""
        results = {}

        frame0 = self.process_sports_hierarchy()
        if frame0 is None:
            return results
        results['Frame'] = frame0

        second = {}
        if not frame0.empty:
            for i, url in enumerate(frame0['urls0']):
                frame1_key = f"Frame1_{i}"
                frame1 = self.process_second_level(url)
                if frame1 is not None:
                    second[frame1_key] = frame1
            results['Frame1'] = second
            frame1 = results['Frame1']['Frame1_0']

            third = {}
            if not frame1.empty and 'url2' in frame1.columns:
                for i, url in enumerate(frame1['url2'][:11]):
                    key_f = f"{frame1['name'][i]}"
                    logging.info(f"Processing {key_f}")
                    frame2 = self.process_third_level(url)
                    if frame2 is not None:
                        quant = {}
                        if not frame2.empty and 'currentseason' in frame2.columns:
                            frame2 = frame2[~frame2['currentseason'].duplicated()].reset_index(drop=True)
                            third[key_f] = [frame2,]
                                
        return results, third

In [4]:
processor = SportsDataProcessor41()
results, th = processor.full_pipeline()

2025-04-07 11:39:54,125 - INFO - Processing Международный
2025-04-07 11:39:54,705 - INFO - Processing Международные матчи (клубы)
2025-04-07 11:39:55,197 - INFO - Processing Англия
2025-04-07 11:39:55,796 - INFO - Processing Германия
2025-04-07 11:39:56,338 - INFO - Processing Испания
2025-04-07 11:39:58,106 - INFO - Processing Италия
2025-04-07 11:39:58,573 - INFO - Processing Франция
2025-04-07 11:39:59,121 - INFO - Processing Австралия
2025-04-07 11:39:59,590 - INFO - Processing Австрия
2025-04-07 11:40:00,083 - INFO - Processing Австрия любители
2025-04-07 11:40:00,737 - INFO - Processing Азербайджан


## TEST

In [14]:
results['Frame1']['Frame1_0']

Unnamed: 0,_id,_sid,_rcid,name,cc._doc,cc._id,cc.a2,cc.name,cc.a3,cc.ioc,cc.continentid,cc.continent,cc.population,url1,url2
0,4,1,4,Международный,,,,,,,,,,/category_get/4,/config_tree_mini/41/0/1/4
1,393,1,393,Международные матчи (клубы),,,,,,,,,,/category_get/393,/config_tree_mini/41/0/1/393
2,1,1,1,Англия,countrycode,240.0,en,Англия,АНГ,ENG,1.0,Европа,51500000.0,/category_get/1,/config_tree_mini/41/0/1/1
3,30,1,30,Германия,countrycode,80.0,de,Германия,ГЕР,GER,1.0,Европа,82500000.0,/category_get/30,/config_tree_mini/41/0/1/30
4,32,1,32,Испания,countrycode,199.0,es,Испания,ИСП,ESP,1.0,Европа,46000000.0,/category_get/32,/config_tree_mini/41/0/1/32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,92,1,92,Эстония,countrycode,67.0,ee,Эстония,ЭСТ,EST,1.0,Европа,1315635.0,/category_get/92,/config_tree_mini/41/0/1/92
133,914,1,914,Эфиопия,countrycode,68.0,et,Эфиопия,ЭФИ,ETH,4.0,Африка,102403000.0,/category_get/914,/config_tree_mini/41/0/1/914
134,322,1,322,Южная Африка,countrycode,197.0,za,Южная Африка,ЗАФ,RSA,4.0,Африка,49300000.0,/category_get/322,/config_tree_mini/41/0/1/322
135,502,1,502,Ямайка,countrycode,106.0,jm,Ямайка,ЯМА,JAM,2.0,Северная Америка,2900000.0,/category_get/502,/config_tree_mini/41/0/1/502


In [20]:
th['Международный'][0]

Unnamed: 0,_id,_sid,_rcid,_isk,_tid,_utid,seasonid,currentseason,name,roundbyround
0,2307,1,4,1,2307,436,105587.0,105587,"Олимпийские игры, группа А",True
1,6996,1,4,5,6996,2515,,111974,Квалификация Олимпийсих игр (женщины),False
2,147109,1,4,5,147109,437,105589.0,105589,"Олимпийские игры, Женщины, Группа A",True
3,3948,1,4,10,3948,16,101177.0,101177,"Кубок Мира, Плей-офф",True
4,1757,1,4,20,1757,11,127075.0,127075,"Кубок Мира Квалификация, УЕФА, Группа А",True
5,6922,1,4,39,6922,308,108589.0,108589,"Кубок Мира Квалификация, АФК, Раунд 1",True
6,6692,1,4,51,6692,309,10874.0,121357,"Кубок Мира Квалификация, ОФК 1ый раунд",True
7,7271,1,4,62,7271,14,115355.0,115355,"Кубок Мира Квалификация, КОНКАКАФ 2ый раунд, Г...",True
8,1434,1,4,77,1434,295,109025.0,109025,"Кубок Мира Квалификация, КОНМЕБОЛ",True
9,9360,1,4,100,9360,13,112442.0,112442,"FIFA World Cup, Qualification CAF, Group A",False


## Second Part

In [6]:
class SportsDataProcessor(SportsDataProcessor41):
        def process_season_tables(self, season_id: int) -> Optional[pd.DataFrame]:
            """Process season tables."""
            data = self.fetch_data(f"/stats_season_tables/{season_id}")
            if not data:
                return None
    
            try:
                tablerows = self._get_nested_data(data, ['doc', 0, 'data', 'tables', 0, 'tablerows'])
                if tablerows is None:
                    df = pd.DataFrame()
                else:
                    df = pd.json_normalize(tablerows)
    
                for col in ['promotion', 'team']:
                    if col in df.columns:
                        expanded = pd.json_normalize(df[col]).add_prefix(f"{col}_")
                        df = pd.concat([df.drop(columns=[col]), expanded], axis=1)
                return df.drop(['_doc','promotion._id','promotion.cssclass', 'promotion.position', 'team._doc', 'team._sid','team.nickname', 'team.iscountry','changeTotal', 'changeHome', 'changeAway',  # Изменения позиций за сезон не так важны
                                'sortPositionTotal', 'sortPositionHome', 'sortPositionAway',  # Дублирует pos*
                                'team._id','team.virtual', 'team.haslogo',  # Технические ID, логотип не нужен
                                'team.mediumname', 'team.abbr',  # Название дублируется в `team.name`
                                'promotion._doc', 'promotion.shortname'  # Достаточно `promotion.name` 
                               ], axis=1, errors='ignore').infer_objects(copy=False).fillna(0)
            except (KeyError, TypeError) as e:
                print(f"Season tables error: {e}")
                return None
            
        def process_season_data(self, season_id: int) -> Dict[str, Optional[pd.DataFrame]]:
            """Process all season data."""
            logging.info(f"Processing season data for season ID: {season_id}")
             # Обработка таблиц
            tables = self.process_season_tables(season_id)
            if tables is None or tables.empty:
                logging.warning(f"Tables data is empty for season ID: {season_id}")
                tables = pd.DataFrame()  # Используем пустой DataFrame, если данные отсутствуют
            if len(tables)==0:
                tables = [1]
            return {
            'Table': tables,
            #'season_goals': combined1,
            #'over_under': self.process_over_under_stats(season_id),
            #'last_matches': self.process_last_matches(season_id),
            #'next_matches': self.process_next_matches(season_id),
            #'top_goals': self.process_top_goals(season_id),
            #'top_assists': self.process_top_assists(season_id),
            #'top_cards': self.process_top_cards(season_id),
            #'injuries': self.process_injuries(season_id),
            #'season_teamsstats': self.process_season_teamsstats(season_id),
            #'season_fixtures': self.process_season_fixtures(season_id),
            #'match_details': self.process_match_details(season_id),
            #'match_timelines': self.process_match_timelines(season_id),
            #'match_details_data': self.process_match_details_data(season_id),
            #'match_lineups': self.process_match_lineups(season_id),
            #'usual_lineup': self.process_usual_lineup(season_id),
            #'team_squad': self.process_team_squad(season_id),
        }
        def full_pipeline2(self, frame2) -> Dict[str, Any]:
            """Full pipeline to process all data."""
            results = {}
            if frame2 is not None:
                if not frame2.empty and 'currentseason' in frame2.columns:
                    frame2 = frame2[~frame2['currentseason'].duplicated()].reset_index(drop=True)
                    for i, season_id in enumerate(frame2['currentseason']):
                        leag_season = f"{season_id}_{frame2['name'][i]}"
                        print('proceding  next season: ',leag_season)
                        results[leag_season] = self.process_season_data(season_id)
    
            return results


In [8]:
processor1 = SportsDataProcessor()
resultsth = processor1.full_pipeline2(th['Англия'][0])

2025-04-07 13:20:56,274 - INFO - Processing season data for season ID: 118689


proceding  next season:  118689_Английская Премьер-Лига


2025-04-07 13:20:57,931 - INFO - Processing season data for season ID: 118699
2025-04-07 13:20:58,132 - INFO - Processing season data for season ID: 118703


proceding  next season:  118699_Чемпионшип
proceding  next season:  118703_Лига 1


2025-04-07 13:20:58,369 - INFO - Processing season data for season ID: 118701


proceding  next season:  118701_Лига 2


2025-04-07 13:20:58,584 - INFO - Processing season data for season ID: 118709


proceding  next season:  118709_Национальная Конференция


2025-04-07 13:20:58,797 - INFO - Processing season data for season ID: 118713


proceding  next season:  118713_Кубок ФА
proceding  next season:  118705_Кубок Футбольной лиги


2025-04-07 13:20:58,991 - INFO - Processing season data for season ID: 118705
2025-04-07 13:20:59,180 - INFO - Processing season data for season ID: 118711
2025-04-07 13:20:59,361 - INFO - Processing season data for season ID: 122727


proceding  next season:  118711_Суперкубок
proceding  next season:  122727_Кубок ФА, Квалификация


2025-04-07 13:20:59,553 - INFO - Processing season data for season ID: 120691


proceding  next season:  120691_Трофей Футбольной лиги


2025-04-07 13:20:59,756 - INFO - Processing season data for season ID: 126303


proceding  next season:  126303_National League Cup, Group A


In [7]:
th['Англия'][0]

Unnamed: 0,_id,_sid,_rcid,_isk,_tid,_utid,name,roundbyround,seasonid,currentseason
0,1,1,1,1,1,17,Английская Премьер-Лига,True,118689.0,118689
1,2,1,1,2,2,18,Чемпионшип,True,118699.0,118699
2,3,1,1,3,3,24,Лига 1,True,118703.0,118703
3,84,1,1,4,84,25,Лига 2,True,118701.0,118701
4,72,1,1,5,72,173,Национальная Конференция,True,118709.0,118709
5,16,1,1,6,16,19,Кубок ФА,True,118713.0,118713
6,17,1,1,7,17,21,Кубок Футбольной лиги,True,118705.0,118705
7,1307,1,1,8,1307,346,Суперкубок,True,118711.0,118711
8,34954,1,1,1000,34954,1696,"Кубок ФА, Квалификация",True,122727.0,122727
9,161,1,1,1000,161,334,Трофей Футбольной лиги,True,120691.0,120691


In [9]:
resultsth['118689_Английская Премьер-Лига']['Table']

Unnamed: 0,_id,drawTotal,drawHome,drawAway,goalDiffTotal,goalDiffHome,goalDiffAway,goalsAgainstTotal,goalsAgainstHome,goalsAgainstAway,...,pointsAway,pos,posHome,posAway,winTotal,winHome,winAway,promotion.name,team.uid,team.name
0,168162957,7,2,5,42,21,21,30,11,19,...,35,1,1,1,22,12,10,Лига чемпионов,44,Ливерпуль
1,168162969,11,4,7,30,18,12,26,12,14,...,28,2,2,2,17,10,7,Лига чемпионов,42,Арсенал
2,168162975,6,4,2,14,14,0,37,10,27,...,26,3,3,3,17,9,8,Лига чемпионов,14,Ноттингем Ф
3,168162985,8,4,4,17,13,4,37,15,22,...,22,4,4,8,15,9,6,Лига чемпионов,38,Челси
4,168162971,7,3,4,17,13,4,40,19,21,...,22,5,6,9,15,9,6,УЕФА Лига Европы,17,Манчестер Сити
5,168162983,9,7,2,0,8,-8,46,19,27,...,20,6,5,11,14,8,6,0,40,Астон Вилла
6,168162979,5,2,3,10,8,2,39,18,21,...,24,7,7,5,15,8,7,0,39,Ньюкасл
7,168162961,9,5,4,5,2,3,42,23,19,...,22,8,8,10,13,7,6,0,43,Фулхэм
8,168162953,11,6,5,2,2,0,47,19,28,...,23,9,10,7,12,6,6,0,30,Брайтон
9,168162989,9,3,6,11,5,6,40,14,26,...,24,10,12,4,12,6,6,0,60,Борнмут


In [37]:
processor1.process_season_tables(118713)

NotImplementedError: 

In [41]:
processor1._get_nested_data(processor1.fetch_data(f"/stats_season_tables/{118713}"), ['doc', 0, 'data', 'tables', 0, 'tablerows'])