In [44]:
import pandas as pd
import numpy as np
import sqlite3
import schedule
import time
import requests
from bs4 import BeautifulSoup
from datetime import date, datetime

In [45]:
# Download the fixtures page and parse it.
# The timeout stops a stalled connection from hanging the kernel forever, and
# raise_for_status() turns an HTTP error (4xx/5xx) into an exception here
# instead of letting an error page be parsed as if it held fixtures.
req = requests.get('https://playerstats.football/', timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')
soup.prettify()

'<!DOCTYPE html>\n<html class="h-full" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <meta content="width=device-width, initial-scale=1" name="viewport"/>\n  <meta content="23ZbifMPCRC7s1xRlEdhxb0ERiRKQ2KfnWCEvSzC" name="csrf-token"/>\n  <link href="https://playerstats.football" hreflang="en" rel="alternate"/>\n  <title>\n   Football Stats | Player Stats | Shots on Target, Passes, Tackles &amp; Fouls\n  </title>\n  <meta content="Individual player stats and team football stats per fixture including; shots, shots on target, passes, tackles, yellow cards, fouls, crosses and more." name="description"/>\n  <style>\n   .no-show {\n            display: none;\n        }\n\n        .show {\n            display: block !important;\n        }\n  </style>\n  <!-- Fonts -->\n  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&amp;display=swap" rel="stylesheet"/>\n  <!-- Styles -->\n  <link href="https://d2fnfth59nr9s8.cloudfront.net/cb8a0897-fe14-4ef5-ab3d-9ed449cb417d

In [46]:
# Collect every <div> whose class attribute is 'border-b border-purple-100/80'.
# The parsing cell that follows iterates over these elements and reads an <h2>
# from each one as the cup (competition) name.
data=soup.find_all('div', class_='border-b border-purple-100/80')

In [47]:
def is_time_format(time_str, time_format="%H:%M"):
    """Return True when ``time_str`` parses under ``time_format``.

    Args:
        time_str: Candidate text, e.g. ``"17:45"`` or a score such as ``"1-1"``.
        time_format: ``datetime.strptime`` format to test against.

    Returns:
        True if ``datetime.strptime(time_str, time_format)`` succeeds, False
        otherwise. Non-string input (for example ``None`` from a failed
        lookup) also returns False rather than raising.
    """
    try:
        datetime.strptime(time_str, time_format)
    except (ValueError, TypeError):
        # ValueError: the text does not match the format.
        # TypeError: the input was not a string at all (e.g. None).
        return False
    return True

# Class strings the parser looks for inside each competition section.
FIRST_TEAM_LINK_CLASS = 'flex items-center whitespace-wrap mb-2 text-sm'
SECOND_TEAM_LINK_CLASS = 'flex items-center text-sm'
STATUS_CELL_CLASS = 'pr-3 sm:pr-6'
STATUS_TEXT_CLASS = 'flex items-center text-grey-500'
TEAM_NAME_CLASS = (
    'whitespace-nowrap truncate underline decoration-slate-300 '
    'decoration-dotted underline-offset-4 hover:opacity-60'
)


def parse_fixtures(sections):
    """Group the matches found in ``sections`` by competition name.

    Args:
        sections: Iterable of BeautifulSoup tags, one per competition block
            (the elements selected into ``data``).

    Returns:
        dict mapping the section's <h2> text to a list of
        ``[team 1, team 2, time, score]`` lists, where ``time`` is the
        kick-off text, ``"Ended"`` or ``'on play'``, and ``score`` is the
        status text or ``"Not started Yet"``.

    Sections without an <h2>, and rows missing a team name or status element,
    are skipped instead of raising AttributeError.
    """
    fixtures = {}

    for section in sections:
        heading = section.find('h2')
        if heading is None:
            continue
        cup = heading.text.strip()

        first_links = section.find_all('a', class_=FIRST_TEAM_LINK_CLASS)
        second_links = section.find_all('a', class_=SECOND_TEAM_LINK_CLASS)
        status_cells = section.find_all('div', class_=STATUS_CELL_CLASS)

        # NOTE(review): the three lists are paired purely by position; if the
        # page ever omits one element for a row, later rows will be misaligned.
        for first_link, second_link, status_cell in zip(first_links, second_links, status_cells):
            first_team = first_link.find('div', class_=TEAM_NAME_CLASS)
            second_team = second_link.find('div', class_=TEAM_NAME_CLASS)
            # Deliberately not named `time`: that would rebind the imported
            # `time` module and break `time.sleep` in the scheduler cell.
            status_tag = status_cell.find('div', class_=STATUS_TEXT_CLASS)

            # Validate before touching `.text` so a missing element is skipped.
            if first_team is None or second_team is None or status_tag is None:
                continue

            status_text = status_tag.text.strip()

            # A HH:MM value is a kick-off time; anything else is treated as a score.
            if is_time_format(status_text, "%H:%M"):
                valid_time = status_text
                score = "Not started Yet"
            else:
                valid_time = "Ended"
                score = status_text

            # A link to /inplay inside the status cell marks a live match.
            if status_cell.find('a', href="/inplay") is not None:
                valid_time = 'on play'

            # An empty status text carries no usable score; drop the row.
            if not score:
                continue

            fixtures.setdefault(cup, []).append([
                first_team.text.strip(),
                second_team.text.strip(),
                valid_time,
                score,
            ])

    return fixtures


def scrape_data(url='https://playerstats.football/'):
    """Fetch the fixtures page and return its matches grouped by competition.

    Callable with no arguments so it can be registered as the job in the
    scheduler cell at the end of the notebook.

    NOTE(review): a scheduled run only returns the dict; nothing is stored.
    Add persistence here if the daily result needs to be kept.

    Args:
        url: Page to download.

    Returns:
        The dict produced by ``parse_fixtures``.

    Raises:
        requests.RequestException: on connection failure, timeout or an HTTP
            error status.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')
    sections = page.find_all('div', class_='border-b border-purple-100/80')
    return parse_fixtures(sections)


# Parse the sections already downloaded above (no second request is made).
cups = parse_fixtures(data)

# Print the organized data
print("Scraped Data (Dictionary):")
print(cups)



Scraped Data (Dictionary):
{'Champions League': [['Shakhtar Donetsk', 'Brest', '17:45', 'Not started Yet'], ['RB Leipzig', 'Sporting CP', '17:45', 'Not started Yet'], ['Celtic', 'Young Boys', '20:00', 'Not started Yet'], ['Feyenoord', 'FC Bayern München', '20:00', 'Not started Yet'], ['Milan', 'Girona', '20:00', 'Not started Yet'], ['Paris Saint Germain', 'Manchester City', '20:00', 'Not started Yet'], ['Sparta Praha', 'Inter', '20:00', 'Not started Yet'], ['Arsenal', 'Dinamo Zagreb', '20:00', 'Not started Yet'], ['Real Madrid', 'Salzburg', '20:00', 'Not started Yet']], 'Europa League': [['Beşiktaş', 'Athletic Club', 'on play', '1-1']], 'Championship': [['Sheffield Wednesday', 'Bristol City', '19:45', 'Not started Yet'], ['Portsmouth', 'Stoke City', '19:45', 'Not started Yet'], ['Leeds United', 'Norwich City', '19:45', 'Not started Yet'], ['Plymouth Argyle', 'Burnley', '20:00', 'Not started Yet']], 'A-League Men': [['Adelaide United', 'Auckland', 'Ended', '2-2']], 'Saudi Pro League': [

In [48]:
# Convert the dictionary into a structured DataFrame

# Fixed column order; passing it to the constructor keeps the schema intact
# even when no matches were scraped (otherwise the frame has no columns).
COLUMNS = ['Date', 'Cup', 'Team 1', 'Team 2', 'Time', 'Score']

# Format today's date once (DD/MM/YYYY) rather than once per cup.
today = date.today().strftime("%d/%m/%Y")

# One record per match; each match is [team 1, team 2, time, score].
rows = [
    {
        'Date': today,
        'Cup': cup,
        'Team 1': match[0],
        'Team 2': match[1],
        'Time': match[2],
        'Score': match[3],
    }
    for cup, matches in cups.items()
    for match in matches
]

df = pd.DataFrame(rows, columns=COLUMNS)

# Print the DataFrame
print("\nScraped Data (DataFrame):")
print(df)


Scraped Data (DataFrame):
          Date               Cup               Team 1             Team 2  \
0   22/01/2025  Champions League     Shakhtar Donetsk              Brest   
1   22/01/2025  Champions League           RB Leipzig        Sporting CP   
2   22/01/2025  Champions League               Celtic         Young Boys   
3   22/01/2025  Champions League            Feyenoord  FC Bayern München   
4   22/01/2025  Champions League                Milan             Girona   
5   22/01/2025  Champions League  Paris Saint Germain    Manchester City   
6   22/01/2025  Champions League         Sparta Praha              Inter   
7   22/01/2025  Champions League              Arsenal      Dinamo Zagreb   
8   22/01/2025  Champions League          Real Madrid           Salzburg   
9   22/01/2025     Europa League             Beşiktaş      Athletic Club   
10  22/01/2025      Championship  Sheffield Wednesday       Bristol City   
11  22/01/2025      Championship           Portsmouth        

In [None]:
# Schedule the scraping to run every day at a specific time.
# Jobs live on schedule's module-level default scheduler, so re-running this
# cell would register the same job again and make the scrape fire more than
# once. Tag the job and clear that tag first so there is only ever one copy.
SCRAPE_JOB_TAG = "daily-scrape"
schedule.clear(SCRAPE_JOB_TAG)
schedule.every().day.at("10:00").do(scrape_data).tag(SCRAPE_JOB_TAG)

# NOTE(review): this loop blocks the kernel until it is interrupted. It needs
# `scrape_data` to be a zero-argument callable defined before this cell runs,
# and `time` to still refer to the standard-library module.
try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop; unregister the job
    # and exit quietly instead of showing a traceback.
    schedule.clear(SCRAPE_JOB_TAG)
    print("Scheduler stopped.")