**Fetching**

In [14]:
# Installing Dependencies
!pip install pandas requests matplotlib seaborn




In [15]:
# Importing Necessary Libraries
import os

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

# API Configuration
# The token is read from the FOOTBALL_DATA_API_KEY environment variable so the
# secret is never stored in the notebook or its version-control history.
# NOTE(review): any token that was previously committed in this cell should be
# revoked and replaced.
API_KEY = os.environ.get("FOOTBALL_DATA_API_KEY", "")
if not API_KEY:
    # Do not abort the notebook here; just make the likely cause of later
    # request failures visible up front.
    print("Warning: FOOTBALL_DATA_API_KEY is not set; API requests will likely be rejected.")

# The token is sent with every request in the X-Auth-Token header
HEADERS = {"X-Auth-Token": API_KEY}

# Base URL of the Football-Data API
BASE_URL = "https://api.football-data.org/v4"


In [16]:
# PL = Premier League, PD = La Liga, BL1 = Bundesliga, SA = Serie A
# Following Function fetches match data of the given football competitions
def fetch_leagues_current_season(competitions=("PL", "PD", "BL1", "SA"), season=2023, timeout=30):
    """Fetch match data for the given competitions and return it as a DataFrame.

    One GET request per competition code is sent to
    ``{BASE_URL}/competitions/{comp}/matches?season={season}`` using the
    module-level ``HEADERS``. A competition whose request fails (network
    error, timeout, or a non-200 status code) is reported with ``print`` and
    skipped, so the result may cover fewer competitions than requested.

    Parameters
    ----------
    competitions : iterable of str
        Competition codes to fetch. Defaults to ("PL", "PD", "BL1", "SA").
    season : int or str
        Value of the ``season`` query parameter. Defaults to 2023.
    timeout : float
        Seconds to wait on each HTTP request before giving up. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``competition``, ``date``,
        ``home_team``, ``away_team``, ``status``, ``score_home`` and
        ``score_away``. The columns are present even when no match was
        fetched, so downstream column-based steps keep working.
    """
    match_columns = [
        "competition",
        "date",
        "home_team",
        "away_team",
        "status",
        "score_home",
        "score_away",
    ]
    all_matches = []

    for comp in competitions:
        print(f"Fetching {comp} season {season}...")

        # Build API endpoint URL for the competition
        url = f"{BASE_URL}/competitions/{comp}/matches?season={season}"
        try:
            # Without a timeout a stalled connection would block the cell forever
            response = requests.get(url, headers=HEADERS, timeout=timeout)
        except requests.RequestException as exc:
            # Treat transport errors like a bad status code: report and move on
            print(f"Failed to fetch {comp}: {exc}")
            continue

        # Skip if request fails
        if response.status_code != 200:
            print(f"Failed to fetch {comp}: {response.status_code}")
            continue

        # Extract match data and store relevant fields
        matches = response.json().get("matches", [])
        for match in matches:
            all_matches.append({
                "competition": comp,
                "date": match["utcDate"],
                "home_team": match["homeTeam"]["name"],
                "away_team": match["awayTeam"]["name"],
                "status": match["status"],
                "score_home": match["score"]["fullTime"]["home"],
                "score_away": match["score"]["fullTime"]["away"]
            })

    # Converts the collected data into a pandas DataFrame; passing the column
    # list keeps the schema stable when nothing could be fetched
    return pd.DataFrame(all_matches, columns=match_columns)


In [17]:
# Pull the match data for the default set of leagues from the API
df_raw = fetch_leagues_current_season()
print("Total matches fetched:", len(df_raw))

# Persist the untouched API result, then continue from the on-disk copy
# (the reload is redundant while df_raw is still in memory)
raw_csv_path = "raw_data.csv"
df_raw.to_csv(raw_csv_path, index=False)
df = pd.read_csv(raw_csv_path)


Fetching PL season 2023...
Fetching PD season 2023...
Fetching BL1 season 2023...
Fetching SA season 2023...
Total matches fetched: 1446


In [18]:
# Report the (rows, columns) dimensions of the loaded data
n_rows, n_cols = df.shape
print("Original shape:", (n_rows, n_cols))

Original shape: (1446, 7)


**Cleaning**

In [19]:
# Count the rows flagged as duplicates before touching the data
duplicate_mask = df.duplicated()
num_duplicates_before = duplicate_mask.sum()
print(f"Duplicate rows before dropping: {num_duplicates_before}")

# Remove the duplicated rows from df itself and report the new dimensions
df.drop_duplicates(inplace=True)
print("After removing duplicates:", df.shape)


Duplicate rows before dropping: 0
After removing duplicates: (1446, 7)


In [20]:
# A row is only kept if every one of these fields is populated
required_fields = ["score_home", "score_away", "home_team", "away_team", "date"]
df.dropna(subset=required_fields, inplace=True)
print("After dropping missing values:", df.shape)

After dropping missing values: (1446, 7)


In [21]:
# Parse the date column into pandas datetime values
df["date"] = pd.to_datetime(df["date"])

# Order the rows chronologically (earliest first), modifying df itself
df.sort_values("date", ascending=True, inplace=True)

In [22]:
# Replace the current index with a fresh one; the old index is discarded
df.reset_index(inplace=True, drop=True)

# Write the cleaned data to disk without the index column
df.to_csv("cleaned_data.csv", index=False)
print("✅ Cleaned dataset saved as cleaned_data.csv")

# Preview the first rows as the cell output
df.head()

✅ Cleaned dataset saved as cleaned_data.csv


Unnamed: 0,competition,date,home_team,away_team,status,score_home,score_away
0,PD,2023-08-11 17:30:00+00:00,UD Almería,Rayo Vallecano de Madrid,FINISHED,0,2
1,PL,2023-08-11 19:00:00+00:00,Burnley FC,Manchester City FC,FINISHED,0,3
2,PD,2023-08-11 20:00:00+00:00,Sevilla FC,Valencia CF,FINISHED,1,2
3,PL,2023-08-12 12:00:00+00:00,Arsenal FC,Nottingham Forest FC,FINISHED,2,1
4,PL,2023-08-12 14:00:00+00:00,AFC Bournemouth,West Ham United FC,FINISHED,1,1
