In [20]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm
import pandas as pd
import os
import pathlib
class Scraper:
    """Scrape results, upcoming fixtures and ELO ratings from besoccer.com.

    One instance covers a single league and season. Constructing it creates
    the output directories ``Data/Results/<league>`` and
    ``Data/To_Predict/<league>`` and downloads the season's scores page to
    read the matchday number shown in its panel title.

    Attributes:
        league: League slug used in the site's URLs (e.g. ``'serie_a'``).
        url: Base competition URL.
        year: Season identifier used in the site's URLs.
        matchday: First integer found in the scores page panel title.
    """

    # Upper bound (seconds) on every HTTP request so that a stalled
    # connection cannot hang the scrape indefinitely.
    REQUEST_TIMEOUT = 30

    # CSV column order, kept identical to the historical output files.
    _RESULT_COLUMNS = ['Home_Team', 'Away_Team', 'Result', 'Link', 'Season',
                       'Round', 'League', 'Home_ELO', 'Away_ELO']
    _FIXTURE_COLUMNS = ['Home_Team', 'Away_Team', 'Link', 'Season',
                        'Round', 'League', 'Home_ELO', 'Away_ELO']

    def __init__(self, league:str, url:str='https://www.besoccer.com/competition', year: int=2022) -> None:
        """Create the output folders and detect the current matchday.

        Args:
            league: League slug as it appears in besoccer URLs.
            url: Base competition URL.
            year: Season identifier as it appears in besoccer URLs.

        Raises:
            AttributeError: The scores page has no ``panel-title`` element.
            IndexError: The panel title contains no integer token.
        """
        pathlib.Path(f'Data/Results/{league}').mkdir(parents=True, exist_ok=True)
        pathlib.Path(f'Data/To_Predict/{league}').mkdir(parents=True, exist_ok=True)
        self.league = league
        self.url = url
        self.year = year
        scores_url = f"{self.url}/scores/{self.league}/{self.year}"
        soup = self._get_soup(scores_url, delay=1)
        title = soup.find('div', {'class': 'panel-title'})
        if title is None:
            # Same exception type the bare ``.text`` access used to raise,
            # but with a message that says what is actually wrong.
            raise AttributeError(
                f"no 'panel-title' element found at {scores_url}; "
                "cannot determine the current matchday"
            )
        self.matchday = self._parse_matchday(title.text)

    @staticmethod
    def _parse_matchday(title: str) -> int:
        """Return the first whitespace-separated all-digit token of *title*.

        Raises:
            IndexError: *title* contains no such token.
        """
        numbers = [int(token) for token in title.split() if token.isdigit()]
        if not numbers:
            raise IndexError(f"no matchday number found in {title!r}")
        return numbers[0]

    def _get_soup(self, page_url: str, delay: float):
        """Download *page_url*, pause *delay* seconds, and return parsed HTML."""
        response = requests.get(page_url, timeout=self.REQUEST_TIMEOUT)
        # Pause after every request to stay polite towards the site.
        time.sleep(delay)
        return bs(response.content, 'html.parser')

    @staticmethod
    def _team_name(match, side_class: str) -> str:
        """Return the stripped team name inside the *side_class* div of *match*.

        Raises:
            AttributeError: The expected elements are missing.
        """
        return match.find('div', {'class': side_class}).find('div', {'class': 'name'}).text.strip()

    def _round_rows(self, round_number: int, with_result: bool) -> list:
        """Scrape one round page into a list of row dicts.

        Matches whose markup lacks an expected element are skipped. With
        ``with_result=True`` that includes matches that show no score. A
        page without a match list yields an empty list.

        Args:
            round_number: Round whose page is fetched; stored as ``Round``.
            with_result: Also extract the score into a ``Result`` entry.
        """
        soup = self._get_soup(
            f"{self.url}/scores/{self.league}/{self.year}/round{round_number}",
            delay=1,
        )
        matches_box = soup.find('div', {'class': 'panel-body p0 match-list-new'})
        if matches_box is None:
            return []

        rows = []
        for match in matches_box.find_all('a', {'class': 'match-link'}):
            try:
                row = {
                    'Home_Team': self._team_name(match, 'team-name ta-r team_left'),
                    'Away_Team': self._team_name(match, 'team-name ta-l team_right'),
                }
                if with_result:
                    marker = match.find('div', {'class': 'marker'})
                    home_score = marker.find('span', {'class': 'r1'}).text.strip()
                    away_score = marker.find('span', {'class': 'r2'}).text.strip()
                    row['Result'] = f'{home_score}-{away_score}'
            except AttributeError:
                # Entry does not have the expected markup; best effort, skip.
                continue
            row['Link'] = match.get('href')
            row['Season'] = self.year
            row['Round'] = round_number
            row['League'] = self.league
            rows.append(row)
        return rows

    def _first_populated_round(self, rounds) -> list:
        """Return the fixture rows of the first round in *rounds* that has any.

        Returns an empty list when none of the rounds yields a fixture.
        """
        for round_number in rounds:
            rows = self._round_rows(round_number, with_result=False)
            if rows:
                return rows
        return []

    @staticmethod
    def _rating(soup, cell_class: str):
        """Return the rating text inside the *cell_class* cell, or ``None``."""
        cell = soup.find('td', {'class': cell_class})
        span = cell.find('span', {'class': 'rating'}) if cell is not None else None
        return span.text.strip() if span is not None else None

    def _elo_ratings(self, link):
        """Return ``(home_elo, away_elo)`` scraped from ``<link>/analysis``.

        Either value is ``None`` when the page does not show it, so a single
        incomplete analysis page cannot abort a long scrape.
        """
        if not link:
            # Anchor without an href: there is no analysis page to fetch.
            return None, None
        soup = self._get_soup(f'{link}/analysis', delay=0.2)
        return self._rating(soup, 'team1-c'), self._rating(soup, 'team2-c')

    def _add_elo(self, rows: list) -> None:
        """Fill ``Home_ELO`` and ``Away_ELO`` on every row in place."""
        for row in tqdm(rows):
            row['Home_ELO'], row['Away_ELO'] = self._elo_ratings(row['Link'])

    def get_previous_matches(self):
        """Scrape rounds ``1 .. matchday - 1`` and save them as a CSV.

        Writes ``Data/Results/<league>/Results_<year>_<league>.csv`` with one
        row per match that shows a score. ELO cells are empty where the
        analysis page does not provide a rating.
        """
        rows = []
        # NOTE(review): the round numbered ``self.matchday`` itself is not
        # scraped, as before; confirm that this is intended.
        for matchday in tqdm(range(1, self.matchday)):
            rows.extend(self._round_rows(matchday, with_result=True))

        self._add_elo(rows)

        df = pd.DataFrame(rows, columns=self._RESULT_COLUMNS)
        df.to_csv(f'Data/Results/{self.league}/Results_{self.year}_{self.league}.csv')

    def get_next_match(self):
        """Scrape the upcoming round's fixtures and save them as a CSV.

        Tries round ``matchday + 1`` and falls back to ``matchday + 2`` when
        the former lists no usable fixtures. The ``Round`` column holds the
        round that was actually scraped. Writes
        ``Data/To_Predict/<league>/Results_<year>_<league>.csv``; the file
        contains only the header when neither round lists fixtures.
        """
        rows = self._first_populated_round((self.matchday + 1, self.matchday + 2))

        self._add_elo(rows)

        df = pd.DataFrame(rows, columns=self._FIXTURE_COLUMNS)
        df.to_csv(f'Data/To_Predict/{self.league}/Results_{self.year}_{self.league}.csv')

                


In [22]:
# League slugs from the original, currently disabled, full list:
#   '2_liga', 'bundesliga', 'championship', 'eerste_divisie', 'eredivisie',
#   'ligue_1', 'ligue_2', 'premier_league', 'primeira_liga',
#   'primera_division', 'segunda_division', 'segunda_liga', 'serie_a',
#   'serie_b'
# Only the two leagues below are scraped by this cell.
leagues = ['serie_a', 'serie_b']

for league in leagues:
    division = Scraper(league)
    # Past results first, then the upcoming fixtures of the same league.
    for scrape in (division.get_previous_matches, division.get_next_match):
        scrape()

100%|██████████| 30/30 [00:47<00:00,  1.57s/it]
100%|██████████| 80/80 [01:12<00:00,  1.10it/s]
100%|██████████| 10/10 [00:08<00:00,  1.23it/s]
100%|██████████| 32/32 [00:45<00:00,  1.42s/it]
100%|██████████| 103/103 [01:21<00:00,  1.27it/s]
100%|██████████| 10/10 [00:06<00:00,  1.46it/s]
