In [1]:
#API
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

#ETL
import pandas as pd
from pandas import json_normalize
import time
import json
import numpy as np
from datetime import datetime, timezone
from pathlib import Path

#DB
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

#Other
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()
api_key = os.getenv('api_key')
db_password = os.getenv('db_password')
db_user = os.getenv('db_user')
db_host = os.getenv('db_host')
db_port = os.getenv('db_port')
db_name = os.getenv('db_name')

In [3]:
headers = {"Authorization": f"Bearer {api_key}"}

In [4]:
championship_map = {
    '786da1fa-a9b5-4d17-b40e-416588272db4' : 'Advanced',
    'bd7957b3-1114-435f-af43-22645a508cc0': 'Main',
    '4410aa73-9584-4500-a7ca-1504ed6da707': 'Intermediate',
    'e61be909-1727-4de3-8201-419d89893401':'Entry',
    'f62c4c75-f620-40a2-bef8-0be2cb85d2d3': 'Open10',
    '0ce757ef-9bad-4afa-bbce-58d647824222':'Open9',
    '3cd81a7e-ed93-43ba-be08-2a1ee7ad7237':'Open1-8'
}

## Call API pour r√©cup√©rer les donn√©es

In [5]:
data_dir = r"D:\Data project\Faceit dashboard\FaceitApi_Dataproject\data"
season = "Season 54"
region = "EU"
stage = "Regular"
start_season_unix = 1751328000

In [6]:
limit = 100
delay = 0.5
max_threads = 5

### List Teams and Team_leaders from championship api call

In [7]:
def fetch_incremental_matches(championship_map, headers,
                              season=season, region=region, stage=stage,
                              data_dir=data_dir, limit=100, delay=0.3):
    base_path = os.path.join(data_dir, season, region, stage)
    os.makedirs(base_path, exist_ok=True)

    # Load existing team_details.json
    team_file = os.path.join(base_path, "team_list.json")
    try:
        with open(team_file, "r") as f:
            team_list = json.load(f)
    except FileNotFoundError:
        team_list = []

    existing_team_ids = {team["team_id"] for team in team_list}

    # Load existing master.json
    master_file = os.path.join(base_path, "master.json")
    try:
        with open(master_file, "r") as f:
            master_matches = json.load(f)
    except FileNotFoundError:
        master_matches = []

    existing_match_ids = {match["match_id"] for match in master_matches}

    parameters = ["upcoming", "past", "all"]

    for champ_id, champ_name in championship_map.items():
        champ_folder = os.path.join(base_path, champ_name)
        os.makedirs(champ_folder, exist_ok=True)
        champ_file = os.path.join(champ_folder, "matches.json")

        # Load existing matches
        try:
            with open(champ_file, "r") as f:
                champ_matches = json.load(f)
        except FileNotFoundError:
            champ_matches = []

        champ_match_ids = {m["match_id"] for m in champ_matches}

        for param in parameters:
            offset = 0
            while True:
                url = f"https://open.faceit.com/data/v4/championships/{champ_id}/matches?region={region}&type={param}&limit={limit}&offset={offset}"
                response = requests.get(url, headers=headers)
                if response.status_code != 200:
                    print(f"Error fetching {champ_name} ({champ_id}) offset {offset} param={param}: {response.status_code}")
                    break

                data = response.json()
                items = data.get("items", [])
                if not items:
                    break

                # Filter new matches
                new_items = [m for m in items if m["match_id"] not in champ_match_ids]
                if new_items:
                    champ_matches.extend(new_items)
                    champ_match_ids.update([m["match_id"] for m in new_items])
                    print(f"Added {len(new_items)} new matches to {champ_file}")

                    # Update master.json
                    for m in new_items:
                        if m["match_id"] not in existing_match_ids:
                            master_matches.append(m)
                            existing_match_ids.add(m["match_id"])

                        # Extract teams and leaders
                        teams = m.get("teams", {})
                        for faction in teams.values():
                            if isinstance(faction, dict):
                                team_id = faction.get("faction_id")
                                if team_id and team_id not in existing_team_ids:
                                    team_info = {
                                        "team_id": team_id,
                                        "name": faction.get("name"),
                                        "avatar": faction.get("avatar"),
                                        "type": faction.get("type"),
                                        "leader_id": faction.get("leader")
                                    }
                                    team_list.append(team_info)
                                    existing_team_ids.add(team_id)

                offset += limit
                time.sleep(delay)

        # Save championship matches
        with open(champ_file, "w") as f:
            json.dump(champ_matches, f, indent=2)

    # Save master.json and team_list.json
    with open(master_file, "w") as f:
        json.dump(master_matches, f, indent=2)

    with open(team_file, "w") as f:
        json.dump(team_list, f, indent=2)

    print(f"Incremental fetch finished. Total matches in master: {len(master_matches)}")
    print(f"Total teams in team_list.json: {len(team_list)}")

    return master_matches, team_list

# -----------------------------
# RUN INCREMENTAL FETCH
# -----------------------------
master_matches, team_list = fetch_incremental_matches(championship_map, headers)

Error fetching Main (bd7957b3-1114-435f-af43-22645a508cc0) offset 600 param=past: 400
Error fetching Main (bd7957b3-1114-435f-af43-22645a508cc0) offset 600 param=all: 400
Error fetching Intermediate (4410aa73-9584-4500-a7ca-1504ed6da707) offset 600 param=past: 400
Error fetching Intermediate (4410aa73-9584-4500-a7ca-1504ed6da707) offset 600 param=all: 400
Error fetching Entry (e61be909-1727-4de3-8201-419d89893401) offset 600 param=past: 400
Error fetching Entry (e61be909-1727-4de3-8201-419d89893401) offset 600 param=all: 400
Error fetching Open10 (f62c4c75-f620-40a2-bef8-0be2cb85d2d3) offset 600 param=past: 400
Error fetching Open10 (f62c4c75-f620-40a2-bef8-0be2cb85d2d3) offset 600 param=all: 400
Error fetching Open9 (0ce757ef-9bad-4afa-bbce-58d647824222) offset 600 param=past: 400
Error fetching Open9 (0ce757ef-9bad-4afa-bbce-58d647824222) offset 600 param=all: 400
Error fetching Open1-8 (3cd81a7e-ed93-43ba-be08-2a1ee7ad7237) offset 600 param=past: 400
Error fetching Open1-8 (3cd81a7e

### List all recent matches from captains and filter those being ESEA related

In [8]:
# -----------------------------
# FILE PATHS
# -----------------------------
team_details_file = os.path.join(data_dir, season, region, stage, "team_list.json")
output_file = os.path.join(data_dir, season, region, stage, "all_captain_matches.json")
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# -----------------------------
# HELPER FUNCTION: FETCH ONE CAPTAIN
# -----------------------------
def fetch_captain_matches(player_id, existing_matches, from_timestamp, api_key, limit, delay):
    headers = {"Authorization": f"Bearer {api_key}"}
    match_info = existing_matches.copy()
    offset = 0

    while True:
        url = f"https://open.faceit.com/data/v4/players/{player_id}/history?game=cs2&from={from_timestamp}&limit={limit}&offset={offset}"
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Error for {player_id}: {response.status_code}")
            break

        data = response.json()
        matches = data.get("items", [])

        if not matches:
            break

        new_count = 0
        for match in matches:
            match_id = match["match_id"]
            if match_id not in match_info:
                competition_id = match.get("competition_id")
                started_at = match.get("started_at")
                faction1_team_id = match.get("teams", {}).get("faction1", {}).get("team_id")
                faction2_team_id = match.get("teams", {}).get("faction2", {}).get("team_id")
                winner = match.get("results", {}).get("winner")

                match_info[match_id] = {
                    "competition_id": competition_id,
                    "started_at": started_at,
                    "faction1_team_id": faction1_team_id,
                    "faction2_team_id": faction2_team_id,
                    "winner": winner
                }
                new_count += 1

        print(f"{player_id} offset {offset}: {len(matches)} fetched, {new_count} new matches added")
        offset += limit

        if len(matches) < limit:
            break

        time.sleep(delay)

    return player_id, match_info

# -----------------------------
# MAIN FUNCTION: INCREMENTAL FETCH WITH THREADS
# -----------------------------
def fetch_incremental_captain_matches(captains_ids, api_key, start_season_unix,
                                      filename=output_file, limit=100, delay=0.5, max_threads=5):

    # Load existing data
    if os.path.exists(filename):
        with open(filename, "r") as f:
            all_captain_matches = json.load(f)
        print(f"Loaded existing captain matches from {filename}")
    else:
        all_captain_matches = {}

    # Determine global latest timestamp among all existing captains
    latest_global_timestamp = 0
    for matches in all_captain_matches.values():
        for match in matches.values():
            ts = match.get("started_at", 0)
            if ts > latest_global_timestamp:
                latest_global_timestamp = ts

    print(f"Global latest started_at timestamp: {latest_global_timestamp}")

    # Counter for new matches across all captains
    total_new_matches = 0

    # Prepare tasks
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = {}
        for player_id in captains_ids:
            existing_matches = all_captain_matches.get(player_id, {})
            if existing_matches:
                from_timestamp = latest_global_timestamp + 1
            else:
                from_timestamp = int(start_season_unix)

            future = executor.submit(
                fetch_captain_matches,
                player_id,
                existing_matches,
                from_timestamp,
                api_key,
                limit,
                delay
            )
            futures[future] = player_id

        # Collect results
        for future in as_completed(futures):
            player_id, match_info = future.result()
            before = len(all_captain_matches.get(player_id, {}))
            after = len(match_info)
            new_for_player = after - before
            total_new_matches += new_for_player

            all_captain_matches[player_id] = match_info
            print(f"{player_id}: {new_for_player} new matches, total now {after}")

    # Save updated JSON
    with open(filename, "w") as f:
        json.dump(all_captain_matches, f, indent=2)

    print(f"‚úÖ All captain matches updated in {filename}")
    print(f"üéØ Total new matches fetched this run: {total_new_matches}")

    return all_captain_matches, total_new_matches

# -----------------------------
# EXECUTION
# -----------------------------
with open(team_details_file, "r") as f:
    teams = json.load(f)

captains_ids = list({team["leader_id"] for team in teams if "leader_id" in team})
print(f"Found {len(captains_ids)} captains to fetch.")

all_captain_matches, total_new_matches = fetch_incremental_captain_matches(
    captains_ids, api_key, start_season_unix, limit=limit, delay=delay, max_threads=max_threads
)


Found 1339 captains to fetch.
Loaded existing captain matches from D:\Data project\Faceit dashboard\FaceitApi_Dataproject\data\Season 54\EU\Regular\all_captain_matches.json
Global latest started_at timestamp: 1756101734
Error for : 404
: 0 new matches, total now 0
5c396e71-66ae-43b6-b9c0-d1f09fe0cfdc: 0 new matches, total now 192
61cd4ed7-81c1-45bc-b7a9-b6ea8940128b: 0 new matches, total now 5
ae8bd4b7-5321-43c5-aa48-c2b3f3a804d6: 0 new matches, total now 73
8f180385-bd39-4ea6-89f3-c524b423ea96 offset 0: 5 fetched, 5 new matches added
8f180385-bd39-4ea6-89f3-c524b423ea96: 5 new matches, total now 129
92fa44f8-f5b0-4b55-988f-96ec564da9bc offset 0: 6 fetched, 6 new matches added
92fa44f8-f5b0-4b55-988f-96ec564da9bc: 6 new matches, total now 183
ab0a0e7f-2951-4206-b046-7939a1636d29 offset 0: 3 fetched, 3 new matches added
ab0a0e7f-2951-4206-b046-7939a1636d29: 3 new matches, total now 378
0f730532-6bd0-4cec-be0b-5d3f949ae342 offset 0: 4 fetched, 4 new matches added
0f730532-6bd0-4cec-be0b-

In [9]:
# with open(r"data\Season 54\EU\Regular\all_captain_matches.json", "r", encoding="utf-8") as f:
#     all_captain_matches = json.load(f)

In [10]:
rows = []
for player_id, matches in all_captain_matches.items():
    for match_id, match_info in matches.items():
        rows.append({
            "match_id": match_id,
            "competition_id": match_info.get("competition_id"),
            "match_day": match_info.get("started_at"),
            "faction1_team_id" : match_info.get("faction1_team_id"),
            "faction2_team_id": match_info.get("faction2_team_id"),
            "winner": match_info.get("winner"),
        })

# Convert to DataFrame
all_captain_matches_df = pd.DataFrame(rows)

# Optional: convert started_at to datetime
all_captain_matches_df["match_day"] = pd.to_datetime(all_captain_matches_df["match_day"], unit="s")

In [11]:
filtered_captain_matches_df = all_captain_matches_df[all_captain_matches_df['competition_id'].isin(championship_map)]
filtered_captain_matches_df = filtered_captain_matches_df.drop_duplicates(subset=['match_id'], keep='first')
filtered_captain_matches_df['winner_id'] = np.where(
    filtered_captain_matches_df['winner'] == 'faction1',
    filtered_captain_matches_df['faction1_team_id'],
    filtered_captain_matches_df['faction2_team_id']
)
filtered_captain_matches_df['loser_id'] = np.where(
    filtered_captain_matches_df['winner'] == 'faction1',
    filtered_captain_matches_df['faction2_team_id'],
    filtered_captain_matches_df['faction1_team_id']
)

filtered_captain_matches_df = filtered_captain_matches_df.drop(columns={'faction1_team_id','faction2_team_id','winner'})
filtered_captain_matches_df = filtered_captain_matches_df.reset_index(drop=True)

In [12]:
filtered_captain_matches_df

Unnamed: 0,match_id,competition_id,match_day,winner_id,loser_id
0,1-f197f4c1-6c52-46f6-b66b-edd2f5830813,e61be909-1727-4de3-8201-419d89893401,2025-08-18 18:07:19,83756527-b019-449f-9660-adc29f01416f,cfdfa687-0075-4f23-8129-28eb01b37a2a
1,1-8d4a43ae-90a0-48bb-a575-c089699bca24,e61be909-1727-4de3-8201-419d89893401,2025-08-13 18:41:24,97f7d131-9ffe-4dde-a8e5-e2dcee2113ea,cfdfa687-0075-4f23-8129-28eb01b37a2a
2,1-89b9853e-781a-4ca6-bb34-5ee736e20a50,e61be909-1727-4de3-8201-419d89893401,2025-08-11 18:39:34,cfdfa687-0075-4f23-8129-28eb01b37a2a,db190109-606c-4845-a773-574dde647db3
3,1-d606f510-8000-4f18-a438-5712554f4d49,e61be909-1727-4de3-8201-419d89893401,2025-08-10 18:36:18,aeff8fa0-734c-45bb-8b9b-f18847d630f6,cfdfa687-0075-4f23-8129-28eb01b37a2a
4,1-5517b202-c958-4244-8688-d123d637e178,e61be909-1727-4de3-8201-419d89893401,2025-08-04 18:39:52,cfdfa687-0075-4f23-8129-28eb01b37a2a,dc0203c1-9604-4ad0-bdea-644b117609b3
...,...,...,...,...,...
7795,1-514ebf59-d0b6-442b-abbf-6a7251f7314f,786da1fa-a9b5-4d17-b40e-416588272db4,2025-07-31 05:18:18,5f2e9987-a2a3-4901-94cf-8fa9457f307b,85322397-fbb0-4cfc-a092-d9fd8a9edb1a
7796,1-4be15fc7-60f2-42e2-bc5a-da19504534cb,786da1fa-a9b5-4d17-b40e-416588272db4,2025-07-25 16:52:56,85322397-fbb0-4cfc-a092-d9fd8a9edb1a,bye
7797,1-40b28985-c35e-4124-9f2a-30a543d730dd,4410aa73-9584-4500-a7ca-1504ed6da707,2025-08-12 18:14:10,b3e21e6e-7e76-4cba-b3af-b65b91833d6c,df862c73-6fc9-4f84-8160-22dd06293fd3
7798,1-b202e7b4-9f28-433f-8043-bcdff80d4048,e61be909-1727-4de3-8201-419d89893401,2025-07-23 18:39:19,4269aee6-d662-4d1b-80c9-495904bd65ed,7895ce74-408e-421d-b83a-05337ac700d4


In [13]:
# Define the path to your stats file
stats_file = Path(r"data\Season 54\EU\Regular\detailed_match_stats.json")

# Load existing stats or start empty
if stats_file.exists():
    with stats_file.open("r", encoding="utf-8") as f:
        all_stats = json.load(f)
else:
    all_stats = {}

# Keep track of already fetched match IDs
existing_match_ids = set(all_stats.keys())

In [14]:
new_match_ids = filtered_captain_matches_df[
    ~filtered_captain_matches_df["match_id"].isin(existing_match_ids)
]["match_id"].tolist()

print(f"üîπ {len(new_match_ids)} new matches to fetch stats for.")


üîπ 1830 new matches to fetch stats for.


### Pull detailed stats of all ESEA matches found

In [15]:
def fetch_single_match(match_id):
    url = f"https://open.faceit.com/data/v4/matches/{match_id}/stats"
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 404:
            return match_id, None
        response.raise_for_status()
        time.sleep(delay)
        return match_id, response.json()
    except requests.exceptions.RequestException:
        return match_id, None

# --------------------------
# 6. Threaded fetching
# --------------------------
if new_match_ids:
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = {executor.submit(fetch_single_match, mid): mid for mid in new_match_ids}
        for i, future in enumerate(as_completed(futures), start=1):
            match_id, data = future.result()
            if data is not None:
                all_stats[match_id] = data
                print(f"[{i}/{len(new_match_ids)}] {match_id} fetched ‚úÖ")
            else:
                print(f"[{i}/{len(new_match_ids)}] {match_id} has no stats ‚ùå")
else:
    print("No new matches to fetch.")

# --------------------------
# 7. Save updated stats
# --------------------------
with stats_file.open("w", encoding="utf-8") as f:
    json.dump(all_stats, f, indent=2)

print(f"\n‚úÖ Done! Total stored matches in detailed_match_stats.json: {len(all_stats)}")

[1/1830] 1-07ea9bcc-4278-4ace-b358-bea44d0d5b69 has no stats ‚ùå
[2/1830] 1-1ad2265b-87df-422d-89a1-64085b55049f has no stats ‚ùå
[3/1830] 1-6ac0fa5a-8485-4263-9d91-4db12aebad94 has no stats ‚ùå
[4/1830] 1-42fb9f0a-bc5c-4012-9a07-b283fdbe1f9a has no stats ‚ùå
[5/1830] 1-697862c4-4cfa-4e46-96e2-1e251d0133c5 has no stats ‚ùå
[6/1830] 1-45bef5ed-c015-4b68-813f-8640e2d52931 has no stats ‚ùå
[7/1830] 1-1706fa4d-8768-4779-87c7-6d6e2205fd90 has no stats ‚ùå
[8/1830] 1-74a39db1-537d-4e42-bc65-9616cdda7b49 has no stats ‚ùå
[9/1830] 1-f85b2bd4-ceaa-43c3-90d6-13fa2dfb9297 has no stats ‚ùå
[10/1830] 1-26642931-ac9b-49ce-b9bc-bf2b594e6d08 has no stats ‚ùå
[11/1830] 1-b82f94d7-02c3-497a-b3fb-95ff20d896a9 has no stats ‚ùå
[12/1830] 1-73b2dbcb-5b00-469f-b7c9-2c7010479172 has no stats ‚ùå
[13/1830] 1-6cea49c9-c7fa-4a7b-a7cb-3d95770dbbdb has no stats ‚ùå
[14/1830] 1-57bed81b-2e88-480d-b9fa-ffcdd5baee9e has no stats ‚ùå
[15/1830] 1-b1dd2bf6-7a47-4e03-8dab-15da95abd044 has no stats ‚ùå
[16/1830] 1-23d51cb

In [16]:
team_ids = set()  # set pour √©viter les doublons

for match_id, match_data in all_stats.items():
    rounds = match_data.get('rounds', [])
    for round_data in rounds:
        teams = round_data.get('teams', [])
        for team in teams:
            team_id = team.get('team_id')
            if team_id:
                team_ids.add(team_id)

team_ids = list(team_ids)

### Pull details of each team involved in ESEA matches

In [17]:
team_ids_to_fetch = [team["team_id"] for team in team_list if team["team_id"]]

def fetch_team_details(team_ids, headers, team_file="team_details.json", delay=0.3, max_threads=5):
    team_path = Path(team_file)

    # Load existing data if present
    if team_path.exists():
        with team_path.open("r", encoding="utf-8") as f:
            all_teams = json.load(f)
    else:
        all_teams = {}

    # Filter team_ids that haven't been fetched yet
    to_fetch = [tid for tid in team_ids if tid not in all_teams]
    print(f"üîç {len(to_fetch)} new teams to fetch among {len(team_ids)} total.\n")

    def fetch_single_team(team_id):
        url = f"https://open.faceit.com/data/v4/teams/{team_id}"
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 404:
                print(f"{team_id} ‚Üí Team not found (404) ‚ùå")
                return team_id, None

            response.raise_for_status()
            time.sleep(delay)
            return team_id, response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {team_id} ‚ùå : {e}")
            return team_id, None

    # Threaded fetching
    if to_fetch:
        with ThreadPoolExecutor(max_workers=max_threads) as executor:
            futures = {executor.submit(fetch_single_team, tid): tid for tid in to_fetch}
            for i, future in enumerate(as_completed(futures), start=1):
                team_id, data = future.result()
                all_teams[team_id] = data
                status = "fetched ‚úÖ" if data else "failed ‚ùå"
                print(f"[{i}/{len(to_fetch)}] {team_id} {status}")
    else:
        print("No new teams to fetch.")

    # Save updated teams
    with team_path.open("w", encoding="utf-8") as f:
        json.dump(all_teams, f, indent=2)

    print(f"\n‚úÖ Done! Total teams stored: {len(all_teams)}")
    return all_teams

all_teams_data = fetch_team_details(team_ids_to_fetch, headers, team_file="team_details.json", delay=delay)

üîç 0 new teams to fetch among 1339 total.

No new teams to fetch.

‚úÖ Done! Total teams stored: 1340


## Tables building

### dim_championship table

In [18]:
championship_details_list = []

for data in master_matches:  # juste it√©rer sur la liste
    if data is None:
        continue  # skip entries with no data

    row = {
        'competition_id': data.get('competition_id'),
        'competition_name': data.get('competition_name'),
        'competition_type': data.get('competition_type'),
        'region': data.get('region'),
        'started_at': data.get('finished_at'), #on utilise la valeur de finished_at comme jour de match car si le match a √©t√© dodge il n'y a pas de start_date
    }
    championship_details_list.append(row)

dim_championships = pd.DataFrame(championship_details_list)

In [19]:
dim_championships['started_at'] = pd.to_datetime(dim_championships['started_at'], unit='s', utc=True).dt.date


In [20]:
dim_championships[['organizer','season','region','division','sub_region','state','state2','state3']] = dim_championships['competition_name'].str.split(' ',expand=True)
dim_championships['state'] = dim_championships['state2'] + ' ' + dim_championships['state3']
dim_championships.drop(columns=['state2', 'state3'], inplace=True)

In [21]:
dim_championships = dim_championships.groupby(["competition_id"]).agg({
    "organizer": "first", 
    "competition_type": "first", 
    'region':'first',
    'sub_region':'first',
    "season": "first",       
    "division": "first",               
    "state": "first",
    "started_at" :"first",
}).reset_index()

In [22]:
dim_championships

Unnamed: 0,competition_id,organizer,competition_type,region,sub_region,season,division,state,started_at
0,0ce757ef-9bad-4afa-bbce-58d647824222,ESEA,championship,EU,Central,S54,Open9,Regular Season,2025-08-08
1,3cd81a7e-ed93-43ba-be08-2a1ee7ad7237,ESEA,championship,EU,Central,S54,Open1-8,Regular Season,2025-08-10
2,4410aa73-9584-4500-a7ca-1504ed6da707,ESEA,championship,EU,Central,S54,Intermediate,Regular Season,2025-08-11
3,786da1fa-a9b5-4d17-b40e-416588272db4,ESEA,championship,EU,Central,S54,Advanced,Regular Season,2025-08-11
4,bd7957b3-1114-435f-af43-22645a508cc0,ESEA,championship,EU,Central,S54,Main,Regular Season,2025-08-08
5,e61be909-1727-4de3-8201-419d89893401,ESEA,championship,EU,Central,S54,Entry,Regular Season,2025-08-08
6,f62c4c75-f620-40a2-bef8-0be2cb85d2d3,ESEA,championship,EU,Central,S54,Open10,Regular Season,2025-08-08


### dim_teams and dim_players tables

In [23]:
team_details_list = []

for team_ids, team_data in all_teams_data.items():
    if team_data is None:
        continue  # üëà Skip entries with no data

    row = {
        'team_id': team_data.get('team_id'),
        'team_nickname': team_data.get('nickname'),
        'team_name': team_data.get('name'),
        'team_avatar': team_data.get('avatar'),
        'team_faceit_url': team_data.get('faceit_url'),
    }
    team_details_list.append(row)

dim_teams = pd.DataFrame(team_details_list)
dim_teams = dim_teams.drop_duplicates(subset=['team_id'], keep='last')
dim_teams['team_faceit_url'] = dim_teams['team_faceit_url'].str.replace('/{lang}','')

In [24]:
#get lis of teams & players from championship request
players_details_list = []

for match_id, match_data in all_stats.items():
    for round_data in match_data['rounds']:
        for team in round_data['teams']:
            team_id = team.get("team_id")
            for player in team["players"]:
                player_id = player["player_id"]
                nickname = player["nickname"]

                row = {
                    "player_id": player_id,
                    "team_id": team_id,
                    "player_name": nickname
                }

                row.update()
                players_details_list.append(row)

players_matches = pd.DataFrame(players_details_list)
players_matches = players_matches.drop_duplicates(subset=['player_id'], keep='last')


In [25]:
#enrich with detail of players from team_detail request
players_details_list = []

for team_id, team_data in all_teams_data.items():
    if team_data is None or 'members' not in team_data:
        continue 

    for member in team_data['members']:
        row = {
            'player_id': member.get('user_id'),
            'player_country': member.get('country'),
            'player_faceit_url': member.get('faceit_url'),
            'player_avatar' : member.get('avatar')
        }
        players_details_list.append(row)

# Create DataFrame
players_team_details = pd.DataFrame(players_details_list)
players_team_details = players_team_details.drop_duplicates(subset=['player_id'], keep='last')

In [26]:
dim_players = pd.merge(players_matches, players_team_details, on='player_id',how='left')
dim_players['player_faceit_url'] = dim_players['player_faceit_url'].str.replace('/{lang}','')

In [27]:
dim_players

Unnamed: 0,player_id,team_id,player_name,player_country,player_faceit_url,player_avatar
0,b89a3885-c9be-4dae-a593-1ad03bd48784,934a2bd0-fa66-4c02-ac92-00f742e7ec55,wolterz,ru,https://www.faceit.com/players/wolterz,https://distribution.faceit-cdn.net/images/93f...
1,117cb564-1753-47c0-8a3c-7f00f2e17534,1b637421-544a-4c74-9b70-e5ee5cf541f9,-FaaT,pl,https://www.faceit.com/players/-FaaT,https://distribution.faceit-cdn.net/images/1a9...
2,d2a9afea-2411-49c2-9d7b-6283dcccef40,df862c73-6fc9-4f84-8160-22dd06293fd3,shepzor,ru,https://www.faceit.com/players/shepzor,https://distribution.faceit-cdn.net/images/62c...
3,b065a172-1af3-4ce8-9b1b-6e6a4e0297cc,45f3397b-4dae-426c-b7ec-5c743ab13373,Dante12,,,
4,9f95b1c2-cbd7-472a-9189-e0adfea5097a,b7d71a56-69f7-409b-ac41-d1113f659153,johny01,hu,https://www.faceit.com/players/johny01,https://distribution.faceit-cdn.net/images/12a...
...,...,...,...,...,...,...
8295,11022d06-2cf4-4221-a0fb-083971be2ab6,f69fbdff-83f7-4ab2-aa99-a1c8742340ec,Flompy,tr,https://www.faceit.com/players/Flompy,https://distribution.faceit-cdn.net/images/938...
8296,ea228795-e3d9-4fb0-b2c4-48ab3aa34976,f69fbdff-83f7-4ab2-aa99-a1c8742340ec,singvayn,TR,https://www.faceit.com/players/singvayn,https://distribution.faceit-cdn.net/images/ce9...
8297,3af5d1af-75db-4cc2-aa8b-76b1f1b76af8,f69fbdff-83f7-4ab2-aa99-a1c8742340ec,xEternaLxx,tr,https://www.faceit.com/players/xEternaLxx,https://distribution.faceit-cdn.net/images/6b2...
8298,492a0844-1694-43a1-b318-a79898a498e5,f69fbdff-83f7-4ab2-aa99-a1c8742340ec,-D1KUBA,pl,https://www.faceit.com/players/-D1KUBA,https://distribution.faceit-cdn.net/images/91b...


### dim_matches table

The idea is to have a composite key with the combination of match_id and match_round being unique : 

- match id being the whole match (either its a Bo1, Bo3 or Bo5)
- match_round being the id of the map played (1 if a single map is played and then increase up to 5 for Bo5)

In [28]:
dim_match_list = []

for match_id, match_data in all_stats.items():
    for round_data in match_data['rounds']:
        for team in round_data['teams']:

            # Get both teams once per match
            teams = round_data['teams']  # Safe because you've pre-filtered

            for round_data in match_data['rounds']:
                round_stats = round_data['round_stats']
                winner_id = round_stats["Winner"]
                
                dim_match_list.append({
                    "match_id": match_id,
                    "match_round": round_data["match_round"],
                    "format": f"bo{round_data['best_of']}",
                    "map": round_stats["Map"],
                    "total_rounds_played": round_stats["Rounds"]
                })

dim_matches_matchquery = pd.DataFrame(dim_match_list)

In [29]:
dim_matches_matchquery

Unnamed: 0,match_id,match_round,format,map,total_rounds_played
0,1-74860e59-e961-4a1c-868c-55ef7d3215b8,1,bo1,de_dust2,12
1,1-74860e59-e961-4a1c-868c-55ef7d3215b8,1,bo1,de_dust2,12
2,1-14e4f8cb-ca01-47f8-aee1-19136f852ea4,1,bo1,de_nuke,23
3,1-14e4f8cb-ca01-47f8-aee1-19136f852ea4,1,bo1,de_nuke,23
4,1-0b1fb845-4c11-42af-8026-743757ba84ea,1,bo1,de_anubis,30
...,...,...,...,...,...
12560,1-c1613561-333c-4f82-aa91-08537895675a,1,bo1,de_nuke,22
12561,1-8d62e81b-58d0-420a-8e2b-a4273e99423b,1,bo1,de_ancient,29
12562,1-8d62e81b-58d0-420a-8e2b-a4273e99423b,1,bo1,de_ancient,29
12563,1-5fd2c46e-07b9-40b0-a86d-0f7e0bcf8898,1,bo1,de_ancient,17


In [30]:
# dim_matches = pd.merge(dim_matches_champ_query[['competition_id','match_id','match_day']], dim_matches_matchquery, on='match_id')
dim_matches = pd.merge(filtered_captain_matches_df, dim_matches_matchquery, on='match_id',how='left')
dim_matches = dim_matches.drop_duplicates(subset=["match_id", "competition_id", "match_day"])

# define format and match_round (working with bo1 and bo3)
dim_matches["round_count"] = dim_matches.groupby(["match_id", "competition_id"])["match_id"].transform("count")
dim_matches["match_round"] = dim_matches.groupby(["match_id", "competition_id"])["match_day"].rank(method="first").astype(int).astype(str)
dim_matches["format"] = dim_matches["round_count"].apply(lambda x: "bo1" if x == 1 else "bo3")
dim_matches = dim_matches.drop(columns=["round_count"])

In [31]:
dim_matches

Unnamed: 0,match_id,competition_id,match_day,winner_id,loser_id,match_round,format,map,total_rounds_played
0,1-f197f4c1-6c52-46f6-b66b-edd2f5830813,e61be909-1727-4de3-8201-419d89893401,2025-08-18 18:07:19,83756527-b019-449f-9660-adc29f01416f,cfdfa687-0075-4f23-8129-28eb01b37a2a,1,bo1,de_ancient,15
2,1-8d4a43ae-90a0-48bb-a575-c089699bca24,e61be909-1727-4de3-8201-419d89893401,2025-08-13 18:41:24,97f7d131-9ffe-4dde-a8e5-e2dcee2113ea,cfdfa687-0075-4f23-8129-28eb01b37a2a,1,bo1,de_overpass,16
4,1-89b9853e-781a-4ca6-bb34-5ee736e20a50,e61be909-1727-4de3-8201-419d89893401,2025-08-11 18:39:34,cfdfa687-0075-4f23-8129-28eb01b37a2a,db190109-606c-4845-a773-574dde647db3,1,bo1,de_mirage,19
6,1-d606f510-8000-4f18-a438-5712554f4d49,e61be909-1727-4de3-8201-419d89893401,2025-08-10 18:36:18,aeff8fa0-734c-45bb-8b9b-f18847d630f6,cfdfa687-0075-4f23-8129-28eb01b37a2a,1,bo1,de_mirage,42
8,1-5517b202-c958-4244-8688-d123d637e178,e61be909-1727-4de3-8201-419d89893401,2025-08-04 18:39:52,cfdfa687-0075-4f23-8129-28eb01b37a2a,dc0203c1-9604-4ad0-bdea-644b117609b3,1,bo1,de_train,30
...,...,...,...,...,...,...,...,...,...
14080,1-514ebf59-d0b6-442b-abbf-6a7251f7314f,786da1fa-a9b5-4d17-b40e-416588272db4,2025-07-31 05:18:18,5f2e9987-a2a3-4901-94cf-8fa9457f307b,85322397-fbb0-4cfc-a092-d9fd8a9edb1a,1,bo1,,
14081,1-4be15fc7-60f2-42e2-bc5a-da19504534cb,786da1fa-a9b5-4d17-b40e-416588272db4,2025-07-25 16:52:56,85322397-fbb0-4cfc-a092-d9fd8a9edb1a,bye,1,bo1,,
14082,1-40b28985-c35e-4124-9f2a-30a543d730dd,4410aa73-9584-4500-a7ca-1504ed6da707,2025-08-12 18:14:10,b3e21e6e-7e76-4cba-b3af-b65b91833d6c,df862c73-6fc9-4f84-8160-22dd06293fd3,1,bo1,de_nuke,22
14084,1-b202e7b4-9f28-433f-8043-bcdff80d4048,e61be909-1727-4de3-8201-419d89893401,2025-07-23 18:39:19,4269aee6-d662-4d1b-80c9-495904bd65ed,7895ce74-408e-421d-b83a-05337ac700d4,1,bo1,de_ancient,22


In [32]:
column_order=['match_id','match_round','competition_id','match_day','format','map','total_rounds_played','winner_id','loser_id']
dim_matches = dim_matches[column_order]
dim_matches = dim_matches.drop_duplicates(subset=['match_id','match_round'], keep='first')
dim_matches = dim_matches.reset_index(drop=True)

In [33]:
dim_matches

Unnamed: 0,match_id,match_round,competition_id,match_day,format,map,total_rounds_played,winner_id,loser_id
0,1-f197f4c1-6c52-46f6-b66b-edd2f5830813,1,e61be909-1727-4de3-8201-419d89893401,2025-08-18 18:07:19,bo1,de_ancient,15,83756527-b019-449f-9660-adc29f01416f,cfdfa687-0075-4f23-8129-28eb01b37a2a
1,1-8d4a43ae-90a0-48bb-a575-c089699bca24,1,e61be909-1727-4de3-8201-419d89893401,2025-08-13 18:41:24,bo1,de_overpass,16,97f7d131-9ffe-4dde-a8e5-e2dcee2113ea,cfdfa687-0075-4f23-8129-28eb01b37a2a
2,1-89b9853e-781a-4ca6-bb34-5ee736e20a50,1,e61be909-1727-4de3-8201-419d89893401,2025-08-11 18:39:34,bo1,de_mirage,19,cfdfa687-0075-4f23-8129-28eb01b37a2a,db190109-606c-4845-a773-574dde647db3
3,1-d606f510-8000-4f18-a438-5712554f4d49,1,e61be909-1727-4de3-8201-419d89893401,2025-08-10 18:36:18,bo1,de_mirage,42,aeff8fa0-734c-45bb-8b9b-f18847d630f6,cfdfa687-0075-4f23-8129-28eb01b37a2a
4,1-5517b202-c958-4244-8688-d123d637e178,1,e61be909-1727-4de3-8201-419d89893401,2025-08-04 18:39:52,bo1,de_train,30,cfdfa687-0075-4f23-8129-28eb01b37a2a,dc0203c1-9604-4ad0-bdea-644b117609b3
...,...,...,...,...,...,...,...,...,...
7795,1-514ebf59-d0b6-442b-abbf-6a7251f7314f,1,786da1fa-a9b5-4d17-b40e-416588272db4,2025-07-31 05:18:18,bo1,,,5f2e9987-a2a3-4901-94cf-8fa9457f307b,85322397-fbb0-4cfc-a092-d9fd8a9edb1a
7796,1-4be15fc7-60f2-42e2-bc5a-da19504534cb,1,786da1fa-a9b5-4d17-b40e-416588272db4,2025-07-25 16:52:56,bo1,,,85322397-fbb0-4cfc-a092-d9fd8a9edb1a,bye
7797,1-40b28985-c35e-4124-9f2a-30a543d730dd,1,4410aa73-9584-4500-a7ca-1504ed6da707,2025-08-12 18:14:10,bo1,de_nuke,22,b3e21e6e-7e76-4cba-b3af-b65b91833d6c,df862c73-6fc9-4f84-8160-22dd06293fd3
7798,1-b202e7b4-9f28-433f-8043-bcdff80d4048,1,e61be909-1727-4de3-8201-419d89893401,2025-07-23 18:39:19,bo1,de_ancient,22,4269aee6-d662-4d1b-80c9-495904bd65ed,7895ce74-408e-421d-b83a-05337ac700d4


In [34]:
dupes = dim_matches[dim_matches.duplicated(subset=["match_id", "match_round"], keep=False)]
print(dupes)

Empty DataFrame
Columns: [match_id, match_round, competition_id, match_day, format, map, total_rounds_played, winner_id, loser_id]
Index: []


In [35]:
dim_matches.dtypes

match_id                       object
match_round                    object
competition_id                 object
match_day              datetime64[ns]
format                         object
map                            object
total_rounds_played            object
winner_id                      object
loser_id                       object
dtype: object

### fact_players_stats table 

In [78]:
player_stats_list = []

for match_id, match_data in all_stats.items():
    for round_data in match_data['rounds']:
        map_name = round_data['round_stats'].get('Map')
        winner_id = round_data['round_stats'].get('Winner')

        for team in round_data['teams']:
            team_id = team.get("team_id")
            team_name = team["team_stats"].get("Team")
            team_win = team["team_stats"].get("Team Win")

            for player in team["players"]:
                player_id = player["player_id"]
                nickname = player["nickname"]
                stats = player["player_stats"]

                row = {
                    "match_id": match_id,
                    "match_round": round_data.get("match_round"),
                    "player_id": player_id,
                    "team_id": team_id,
                }

                row.update(stats)
                player_stats_list.append(row)

fact_players_stats = pd.DataFrame(player_stats_list)


In [79]:
#enrich with competition_id and match_day
fact_players_stats = pd.merge(fact_players_stats, dim_matches[['match_id','competition_id','match_day']], on=['match_id'])

In [80]:
#correction d'un cas ou les donn√©es d'une game sont en doublons, on garde les lignes qui ont les stats les plus importantes, exemple :https://www.faceit.com/fr/cs2/room/1-e4410b46-a6ac-4037-ad35-d7f91f8bf55f/scoreboard
players_stats_sorted = fact_players_stats.sort_values(['match_id', 'player_id', 'Kills'], ascending=[True, True, False])
fact_players_stats = players_stats_sorted.drop_duplicates(subset=['match_id', 'player_id'], keep='first')
#correction d'un bug ou un joueur sur la fiche d'un match mais n'ayant pas jou√© se retrouve avec des stats a 0 (nouvelle logique de r√©cup√©ration des matchs via captains)
fact_players_stats = fact_players_stats[fact_players_stats['Damage']!= 0]

In [43]:
fact_players_stats = fact_players_stats.drop(columns={'Result','Headshots %','K/R Ratio','K/D Ratio','Match Entry Rate','Match 1v1 Win Rate','Flash Success Rate per Match','Sniper Kill Rate per Round','Utility Usage per Round','Flashes per Round in a Match','Sniper Kill Rate per Match','Utility Damage per Round in a Match','Utility Damage Success Rate per Match','Utility Successes','Enemies Flashed per Round in a Match','Utility Success Rate per Match','Match Entry Success Rate','Match 1v2 Win Rate','Utility Enemies'})

In [82]:
fact_players_stats = fact_players_stats.rename(columns={
    'Zeus Kills':'zeus_kills',
    'Utility Count':'utility_count',
    'Double Kills':'double_kills',
    'Flash Successes':'flash_successes',
    'Quadro Kills':'quadro_kills',
    'Entry Count':'entry_count',
    'First Kills':'first_kills',
    'Flash Count':'flash_count',
    'Sniper Kills':'sniper_kills',
    'Damage':'damage',
    'Utility Damage':'utility_damage',
    'Assists':'assists',
    '1v1Count':'count_1v1',
    'Enemies Flashed':'enemies_flashed',
    'Clutch Kills':'clutch_kills',
    'Penta Kills':'ace',
    'MVPs':'mvps',
    'Deaths':'deaths',
    'Entry Wins':'entry_wins',
    'Kills':'kills',
    '1v2Wins':'wins_1v2',
    '1v1Wins':'wins_1v1',
    'Pistol Kills':'pistol_kills',
    'Knife Kills':'knife_kills',
    'ADR':'adr',
    '1v2Count':'count_1v2',
    'Triple Kills':'triple_kills',
    'Headshots':'headshots',
})

In [83]:
column_order=['match_id', 'match_round','player_id', 'competition_id','team_id','zeus_kills',
'utility_count',
'double_kills',
'flash_successes',
'quadro_kills',
'entry_count',
'first_kills',
'flash_count',
'sniper_kills',
'damage',
'utility_damage',
'assists',
'count_1v1',
'enemies_flashed',
'clutch_kills',
'ace',
'mvps',
'deaths',
'entry_wins',
'kills',
'wins_1v2',
'wins_1v1',
'pistol_kills',
'knife_kills',
'adr',
'count_1v2',
'triple_kills',
'headshots', 
'match_day']
fact_players_stats = fact_players_stats[column_order]

In [84]:
fact_players_stats.columns

Index(['match_id', 'match_round', 'player_id', 'competition_id', 'team_id',
       'zeus_kills', 'utility_count', 'double_kills', 'flash_successes',
       'quadro_kills', 'entry_count', 'first_kills', 'flash_count',
       'sniper_kills', 'damage', 'utility_damage', 'assists', 'count_1v1',
       'enemies_flashed', 'clutch_kills', 'ace', 'mvps', 'deaths',
       'entry_wins', 'kills', 'wins_1v2', 'wins_1v1', 'pistol_kills',
       'knife_kills', 'adr', 'count_1v2', 'triple_kills', 'headshots',
       'match_day'],
      dtype='object')

In [85]:
exclude_cols = ['match_id','match_round','player_id','competition_id','team_id','match_day']

for col in fact_players_stats.columns:
    if col not in exclude_cols:
        fact_players_stats[col] = pd.to_numeric(fact_players_stats[col], errors="ignore")


  fact_players_stats[col] = pd.to_numeric(fact_players_stats[col], errors="ignore")


In [86]:
fact_players_stats.dtypes

match_id                   object
match_round                object
player_id                  object
competition_id             object
team_id                    object
zeus_kills                  int64
utility_count               int64
double_kills                int64
flash_successes             int64
quadro_kills                int64
entry_count                 int64
first_kills                 int64
flash_count                 int64
sniper_kills                int64
damage                      int64
utility_damage              int64
assists                     int64
count_1v1                   int64
enemies_flashed             int64
clutch_kills                int64
ace                         int64
mvps                        int64
deaths                      int64
entry_wins                  int64
kills                       int64
wins_1v2                    int64
wins_1v1                    int64
pistol_kills                int64
knife_kills                 int64
adr           

In [87]:
dupes = fact_players_stats[fact_players_stats.duplicated(subset=["match_id", "match_round","player_id"], keep=False)]
print(dupes)

Empty DataFrame
Columns: [match_id, match_round, player_id, competition_id, team_id, zeus_kills, utility_count, double_kills, flash_successes, quadro_kills, entry_count, first_kills, flash_count, sniper_kills, damage, utility_damage, assists, count_1v1, enemies_flashed, clutch_kills, ace, mvps, deaths, entry_wins, kills, wins_1v2, wins_1v1, pistol_kills, knife_kills, adr, count_1v2, triple_kills, headshots, match_day]
Index: []

[0 rows x 34 columns]


### fact_teams_stats table 

In [88]:
team_stats_list = []

for match_id, match_data in all_stats.items():
    for round_data in match_data['rounds']:
        total_rounds_played = round_data['round_stats'].get('Rounds')

        for team in round_data['teams']:
            team_id = team.get("team_id")
            team_name = team["team_stats"].get("Team")
            team_win = team["team_stats"].get("Team Win")

            row = {
                "match_id": match_id,
                "match_round": round_data.get("match_round"),
                "team_id": team_id,
                "team_win": team_win,
                "total_rounds_played": total_rounds_played
            }

            row.update(team["team_stats"])
            team_stats_list.append(row)

fact_teams_stats = pd.DataFrame(team_stats_list)


In [89]:
fact_teams_stats.head(2)

Unnamed: 0,match_id,match_round,team_id,team_win,total_rounds_played,First Half Score,Team,Team Win,Final Score,Team Headshots,Second Half Score,Overtime score
0,1-74860e59-e961-4a1c-868c-55ef7d3215b8,1,30191a9c-ceb2-4da0-a146-ec375782ac45,0,12,1,officers,0,1,3.2,0,0
1,1-74860e59-e961-4a1c-868c-55ef7d3215b8,1,f8ac5861-2bb7-4dd1-8378-afa7a9257920,1,12,11,The Toxics,1,11,4.6,0,0


In [90]:
# Enrich with competition_id and match_day - only merge on match_id because comp_id and match_day doesnt differ depending on match_round
fact_teams_stats = pd.merge(fact_teams_stats, dim_matches[['match_id','competition_id','match_day']], on='match_id')

In [91]:
fact_teams_stats

Unnamed: 0,match_id,match_round,team_id,team_win,total_rounds_played,First Half Score,Team,Team Win,Final Score,Team Headshots,Second Half Score,Overtime score,competition_id,match_day
0,1-74860e59-e961-4a1c-868c-55ef7d3215b8,1,30191a9c-ceb2-4da0-a146-ec375782ac45,0,12,1,officers,0,1,3.2,0,0,3cd81a7e-ed93-43ba-be08-2a1ee7ad7237,2025-07-30 18:40:44
1,1-74860e59-e961-4a1c-868c-55ef7d3215b8,1,f8ac5861-2bb7-4dd1-8378-afa7a9257920,1,12,11,The Toxics,1,11,4.6,0,0,3cd81a7e-ed93-43ba-be08-2a1ee7ad7237,2025-07-30 18:40:44
2,1-14e4f8cb-ca01-47f8-aee1-19136f852ea4,1,6d406cd9-c6a2-4c0d-aa9d-45dfa5812460,0,23,7,5PellMans,0,10,8.6,3,0,4410aa73-9584-4500-a7ca-1504ed6da707,2025-08-19 18:13:11
3,1-14e4f8cb-ca01-47f8-aee1-19136f852ea4,1,36ef04e5-fdbe-43bc-b204-3c6d771dca4e,1,23,5,WHYNOT,1,13,8,8,0,4410aa73-9584-4500-a7ca-1504ed6da707,2025-08-19 18:13:11
4,1-0b1fb845-4c11-42af-8026-743757ba84ea,1,aa598c46-986e-4f63-b06e-b01099a05c37,1,30,6,Tiim,1,16,9,6,4,f62c4c75-f620-40a2-bef8-0be2cb85d2d3,2025-07-16 18:37:21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12552,1-c1613561-333c-4f82-aa91-08537895675a,1,3829366a-38cd-4153-beb6-9b5aef1c3fe7,1,22,4,FAJNE CHLOPAKI,1,13,7.4,9,0,0ce757ef-9bad-4afa-bbce-58d647824222,2025-08-25 20:11:48
12553,1-8d62e81b-58d0-420a-8e2b-a4273e99423b,1,5e89b75e-d947-42fa-ba7f-88b17106f947,0,29,7,Prestige EC,0,13,12.6,5,1,bd7957b3-1114-435f-af43-22645a508cc0,2025-08-25 18:11:52
12554,1-8d62e81b-58d0-420a-8e2b-a4273e99423b,1,974141f2-78a9-46e8-bb93-94d46f5d5045,1,29,5,ZennoX,1,16,13.8,7,4,bd7957b3-1114-435f-af43-22645a508cc0,2025-08-25 18:11:52
12555,1-5fd2c46e-07b9-40b0-a86d-0f7e0bcf8898,1,8197aa39-3f39-4920-8116-fb908ba388f7,0,17,2,flyys,0,4,5.2,2,0,bd7957b3-1114-435f-af43-22645a508cc0,2025-08-25 18:13:18


In [92]:
fact_teams_stats = fact_teams_stats.drop(columns={'Team','Team Win','Team Headshots'})
fact_teams_stats = fact_teams_stats.rename(columns={'Final Score':'final_score','Overtime score':'overtime_score','First Half Score':'first_half_score','Second Half Score':'second_half_score'})

In [93]:
fact_teams_stats.head(2)

Unnamed: 0,match_id,match_round,team_id,team_win,total_rounds_played,first_half_score,final_score,second_half_score,overtime_score,competition_id,match_day
0,1-74860e59-e961-4a1c-868c-55ef7d3215b8,1,30191a9c-ceb2-4da0-a146-ec375782ac45,0,12,1,1,0,0,3cd81a7e-ed93-43ba-be08-2a1ee7ad7237,2025-07-30 18:40:44
1,1-74860e59-e961-4a1c-868c-55ef7d3215b8,1,f8ac5861-2bb7-4dd1-8378-afa7a9257920,1,12,11,11,0,0,3cd81a7e-ed93-43ba-be08-2a1ee7ad7237,2025-07-30 18:40:44


In [94]:
#correction d'un cas ou les donn√©es d'une game sont en doublons, on garde les lignes qui ont les stats les plus importantes, exemple :https://www.faceit.com/fr/cs2/room/1-e4410b46-a6ac-4037-ad35-d7f91f8bf55f/scoreboard
team_stats_sorted = fact_teams_stats.sort_values(['match_id', 'team_id', 'final_score'], ascending=[True, True, False])
fact_teams_stats = team_stats_sorted.drop_duplicates(subset=['match_id', 'team_id'], keep='first')

column_order=['match_id','match_round','team_id','competition_id','team_win','total_rounds_played','first_half_score','second_half_score','overtime_score','final_score']
fact_teams_stats = fact_teams_stats[column_order]

In [95]:
fact_teams_stats.columns

Index(['match_id', 'match_round', 'team_id', 'competition_id', 'team_win',
       'total_rounds_played', 'first_half_score', 'second_half_score',
       'overtime_score', 'final_score'],
      dtype='object')

In [96]:
fact_teams_stats.dtypes

match_id               object
match_round            object
team_id                object
competition_id         object
team_win               object
total_rounds_played    object
first_half_score       object
second_half_score      object
overtime_score         object
final_score            object
dtype: object

## Summary of tables & columns

In [97]:
dim_championships.columns

Index(['competition_id', 'organizer', 'competition_type', 'region',
       'sub_region', 'season', 'division', 'state', 'started_at'],
      dtype='object')

In [98]:
dim_teams.columns

Index(['team_id', 'team_nickname', 'team_name', 'team_avatar',
       'team_faceit_url'],
      dtype='object')

In [99]:
dim_players.columns

Index(['player_id', 'team_id', 'player_name', 'player_country',
       'player_faceit_url', 'player_avatar'],
      dtype='object')

In [100]:
dim_matches.columns

Index(['match_id', 'match_round', 'competition_id', 'match_day', 'format',
       'map', 'total_rounds_played', 'winner_id', 'loser_id'],
      dtype='object')

In [101]:
fact_players_stats.columns

Index(['match_id', 'match_round', 'player_id', 'competition_id', 'team_id',
       'zeus_kills', 'utility_count', 'double_kills', 'flash_successes',
       'quadro_kills', 'entry_count', 'first_kills', 'flash_count',
       'sniper_kills', 'damage', 'utility_damage', 'assists', 'count_1v1',
       'enemies_flashed', 'clutch_kills', 'ace', 'mvps', 'deaths',
       'entry_wins', 'kills', 'wins_1v2', 'wins_1v1', 'pistol_kills',
       'knife_kills', 'adr', 'count_1v2', 'triple_kills', 'headshots',
       'match_day'],
      dtype='object')

In [102]:
fact_teams_stats.columns

Index(['match_id', 'match_round', 'team_id', 'competition_id', 'team_win',
       'total_rounds_played', 'first_half_score', 'second_half_score',
       'overtime_score', 'final_score'],
      dtype='object')

In [103]:
# Find IDs in fact that are missing in dim
missing_in_dim = fact_players_stats.loc[~fact_players_stats["player_id"].isin(dim_players["player_id"]), "player_id"].unique()

# Find IDs in dim that are missing in fact
missing_in_fact = dim_players.loc[~dim_players["player_id"].isin(fact_players_stats["player_id"]), "player_id"].unique()

print(f"IDs in fact but not in dim: {missing_in_dim}")
print(f"IDs in dim but not in fact: {missing_in_fact}")

# Optional: get the actual fact rows with missing IDs
fact_missing_rows = fact_players_stats[fact_players_stats["player_id"].isin(missing_in_dim)]


IDs in fact but not in dim: []
IDs in dim but not in fact: []


## Data ingestion on PostgreSQL DB

In [104]:
engine = create_engine(
    f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
)

In [105]:
primary_keys_map = {
    "dim_championships": ["competition_id"],
    "dim_teams": ["team_id"],
    "dim_players": ["player_id"],
    "dim_matches": ["match_id", "match_round"],
    "fact_teams_stats": ["match_id", "match_round", "team_id"],
    "fact_players_stats": ["match_id", "match_round", "player_id"]
}

def insert_new_records(df, table_name, pk_cols, chunk_size=5000):
    try:
        pk_cols_str = ', '.join(pk_cols)
        query = f"SELECT {pk_cols_str} FROM {table_name}"
        
        existing_pks_df = pd.read_sql(query, engine)

        # Normalize keys to string to avoid datatype mismatches
        for col in pk_cols:
            df[col] = df[col].astype(str)
            existing_pks_df[col] = existing_pks_df[col].astype(str)

        if len(pk_cols) == 1:
            col = pk_cols[0]
            mask = ~df[col].isin(existing_pks_df[col])
        else:
            df_keys = df[pk_cols].apply(tuple, axis=1)
            existing_keys = existing_pks_df.apply(tuple, axis=1)
            mask = ~df_keys.isin(existing_keys)
        
        new_records_df = df[mask]
        
        if not new_records_df.empty:
            new_records_df.to_sql(
                table_name,
                con=engine,
                if_exists='append',
                index=False,
                method='multi',
                chunksize=chunk_size  # <-- added chunking
            )
            print(f"Inserted {len(new_records_df)} new records into {table_name}")
        else:
            print(f"No new records to insert for {table_name}")
    except SQLAlchemyError as e:
        print(f"Database error while inserting into {table_name}: {e}")
    except Exception as e:
        print(f"Unexpected error while inserting into {table_name}: {e}")

def main():
    try:
        insert_new_records(dim_championships, "dim_championships", primary_keys_map["dim_championships"])
        insert_new_records(dim_teams, "dim_teams", primary_keys_map["dim_teams"])
        insert_new_records(dim_players, "dim_players", primary_keys_map["dim_players"])
        insert_new_records(dim_matches, "dim_matches", primary_keys_map["dim_matches"])
        insert_new_records(fact_teams_stats, "fact_teams_stats", primary_keys_map["fact_teams_stats"])
        insert_new_records(fact_players_stats, "fact_players_stats", primary_keys_map["fact_players_stats"])
    except Exception as e:
        print(f"Error in main loading process: {e}")

if __name__ == "__main__":
    main()


No new records to insert for dim_championships
No new records to insert for dim_teams
No new records to insert for dim_players
No new records to insert for dim_matches
No new records to insert for fact_teams_stats
Inserted 9 new records into fact_players_stats


In [106]:
print(len(master_matches))
print(len(existing_match_ids))
print(len(dim_matches_matchquery))          # before merge
print(len(filtered_captain_matches_df))     # the DF you merge on
print(len(dim_matches))     

6907
5970
12565
7800
7800
