# Match Data Collection via API

This notebook retrieves detailed football match data from the API-Football platform using the RapidAPI interface. The script queries five major European leagues (Premier League, La Liga, Bundesliga, Serie A, Ligue 1) across selected seasons in the 2019–2023 range (by default 2019, 2020, 2021, and 2023; 2022 can be added to the season list), fetching both fixture metadata and comprehensive match statistics.

Key operations:
- API requests to retrieve match schedules, results, and in-game statistics
- Extraction of attributes such as team names, goals, possession, shots, cards, passes, and referee details
- Handling missing data and request errors gracefully
- Exporting the aggregated dataset to a CSV file named `all_matches_proper_format.csv` for downstream processing

This script ensures that raw match data is gathered in a structured format suitable for further cleaning and analysis in subsequent stages.


In [None]:
import os
import time

import pandas as pd
import requests
from google.colab import files

# API credentials.
# NOTE(review): a key committed to source is readable by anyone who can open
# this notebook. Supply the key through the RAPIDAPI_KEY environment variable
# instead; the literal below is kept only as a backward-compatible fallback
# and should be replaced with your own key (and rotated if it is a real one).
API_KEY = os.environ.get('RAPIDAPI_KEY') or '335e07b7a1msh1006cb7deaeca97p1d57bejsn6b0be7b88928'

# Endpoints: fixture listing per league/season, and per-fixture statistics.
BASE_URL = 'https://api-football-v1.p.rapidapi.com/v3/fixtures'
STAT_URL = 'https://api-football-v1.p.rapidapi.com/v3/fixtures/statistics'

# Headers sent with every request to the API.
headers = {
    'x-rapidapi-host': 'api-football-v1.p.rapidapi.com',
    'x-rapidapi-key': API_KEY,
}

# Leagues to collect: display name -> league id passed as the `league` query parameter.
leagues = {
    'Premier League': 39,
    'La Liga': 140,
    'Bundesliga': 78,
    'Serie A': 135,
    'Ligue 1': 61,
}

# Seasons to collect. 2022 is not in the default list;
# use [2019, 2020, 2021, 2022, 2023] for the full range.
seasons = [2019, 2020, 2021, 2023]

# Storage for collected data: one dict per fixture, appended by the collection loop.
all_matches = []

# Seconds to wait for each HTTP response. Without a timeout a stalled
# connection would block the whole collection run indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Pause between consecutive fixtures. 0 keeps the original no-delay behaviour;
# raise it if the API plan enforces a request rate limit.
REQUEST_DELAY_SECONDS = 0

# Output column suffix -> statistic "type" label looked up in the statistics
# response. Insertion order defines the order in which columns are added.
STAT_COLUMNS = {
    'shots_on_target': 'Shots on Goal',
    'shots_total': 'Total Shots',
    'possession': 'Ball Possession',
    'yellow_cards': 'Yellow Cards',
    'red_cards': 'Red Cards',
    'passes': 'Total passes',
    'pass_accuracy': 'Passes %',
}


def _stat_value(team_stats, name):
    """Return the value of the entry whose 'type' equals *name* (case-insensitive), or None."""
    wanted = name.lower()
    for entry in team_stats:
        if entry['type'].lower() == wanted:
            return entry['value']
    return None


def _team_stat_columns(stats_response):
    """Build the home_*/away_* statistic columns from a statistics response list.

    Raises IndexError, KeyError, TypeError or AttributeError when the response
    does not contain two team entries of the expected shape; nothing is
    returned in that case, so a row is never partially filled.
    """
    # NOTE(review): assumes entry 0 is the home team and entry 1 the away
    # team, as the original code did — confirm against the API documentation.
    home_stats = stats_response[0]['statistics']
    away_stats = stats_response[1]['statistics']

    columns = {}
    for suffix, stat_type in STAT_COLUMNS.items():
        columns[f'home_{suffix}'] = _stat_value(home_stats, stat_type)
        columns[f'away_{suffix}'] = _stat_value(away_stats, stat_type)
    return columns


# Walk every league/season pair, list its fixtures, then enrich each fixture
# with per-team statistics. Failures are reported and skipped so that one bad
# request does not discard the rows collected so far.
for league_name, league_id in leagues.items():
    for season in seasons:
        print(f"Fetching: {league_name} - {season}")
        try:
            res = requests.get(
                BASE_URL,
                params={'league': league_id, 'season': season},
                headers=headers,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
        except requests.RequestException as exc:
            # Connection errors and timeouts: skip this league/season.
            print(f"Error: {exc} {league_name} {season}")
            continue
        if res.status_code != 200:
            print(f"Error: {res.status_code} {league_name} {season}")
            continue

        fixtures = res.json()['response']
        print(f"{league_name} {season} - {len(fixtures)} matches found")

        for f in fixtures:
            fixture_id = f['fixture']['id']
            data = {
                'league': league_name,
                'season': season,
                'fixture_id': fixture_id,
                'date': f['fixture']['date'],
                'venue': f['fixture']['venue']['name'],
                'referee': f['fixture']['referee'],
                'home_team': f['teams']['home']['name'],
                'away_team': f['teams']['away']['name'],
                'home_goals': f['goals']['home'],
                'away_goals': f['goals']['away'],
                'status': f['fixture']['status']['short']
            }

            # Fetch additional match statistics. The fixture row is kept even
            # when statistics are unavailable; its stat columns are then absent.
            try:
                stats = requests.get(
                    STAT_URL,
                    params={'fixture': fixture_id},
                    headers=headers,
                    timeout=REQUEST_TIMEOUT_SECONDS,
                )
            except requests.RequestException:
                stats = None

            if stats is not None and stats.status_code == 200:
                try:
                    data.update(_team_stat_columns(stats.json()['response']))
                except (ValueError, KeyError, IndexError, TypeError, AttributeError):
                    # Invalid JSON or a response without two well-formed team
                    # entries. Deliberately narrower than a bare `except` so
                    # KeyboardInterrupt still stops the run.
                    print(f"Statistics could not be fetched: {fixture_id}")
            else:
                print(f"Statistics could not be fetched: {fixture_id}")

            all_matches.append(data)
            time.sleep(REQUEST_DELAY_SECONDS)

        print(f"{league_name} - {season} completed.")

# Turn the collected fixture rows into a table, write it to a CSV file in the
# working directory, then hand that file to the Colab download helper.
output_csv = "all_matches_proper_format.csv"
df = pd.DataFrame(all_matches)
df.to_csv(output_csv, index=False)
files.download(output_csv)