In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from itemloaders.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader
import pandas as pd
from openpyxl import Workbook, load_workbook
import os
import re
import logging
import csv

---
# This project is to scrape the 2024-2025 Champions league group stage table

---

In [2]:
class ChampionsLeagueScrapeSpider(scrapy.Spider):
    name = 'championspider'

    # Website spider sends requests to
    start_urls = ['https://www.espn.com/soccer/table/_/league/uefa.champions']
    
    # This will help to save the data generated in CSV format
    custom_settings = {
        'FEEDS': {
            'champions_league_table.json': {
                'format': 'csv',
                'overwrite': True
            }
        },
        'LOG_LEVEL': 'WARNING',  # Set the logging level to WARNING
        #'LOG_FILE': 'scrapy_log.txt'  # Optional: Save log messages to a file
    }

    def parse(self, response):
        print('[ OUR RESPONSE ]')
        
        # Initialize lists to store header and data
        results = {}
        football_club = {}
        data_2 = []

        # Select all rows in the table
        table_rows = response.xpath('//tr')
        data_rows = response.xpath("//tr/td")
        
        for row_table, row_data in zip(table_rows, data_rows):
            club = row_table.xpath('//a[@class = "AnchorLink"]/text()').getall()
            l = row_data.xpath("//span[@class = 'stat-cell']")
        
        club = club[:-8]
        data = [i.xpath("text()").get() for i in l]
        #print(len(data))
        
        # Group the values into chunks of 8
        grouped_values = [data[i:i+8] for i in range(0, len(data), 8)]    
        #print(len(grouped_values))
        #print(len(club))
            
        # Ensure that both the club name and at least 8 stats are available
        for i, j in zip(club, grouped_values):
            results[i] = {
                'Games Played' :     j[0], 
                'Win' :              j[1],  
                'Draw' :             j[2], 
                'Lose' :             j[3], 
                'Goals For' :        j[4],  
                'Goals Against' :    j[5],  
                'Goals Difference' : j[6], 
                'Points' :           j[7],  
            }
        

        
        # Save the results to a CSV file
        csv_file = "champions_league_table.csv"
        with open(csv_file, mode='w', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=['Club', 'Games Played', 'Win', 
                                                      'Draw', 'Lose', 'Goals For', 'Goals Against', 
                                                      'Goals Difference', 'Points'])
            writer.writeheader()
            
            # Write each club's data into the CSV
            for club_name, stats in results.items():
                row = {'Club': club_name}
                row.update(stats)
                writer.writerow(row)


        
        # Yield the results for Scrapy's pipeline or further use
        yield results
                
                
def run_spider():
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(ChampionsLeagueScrapeSpider)
    process.start()

if __name__ == '__main__':
    # Configure logging
    logging.getLogger('scrapy').setLevel(logging.WARNING)  
    run_spider()

2024-09-13 13:56:12 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-09-13 13:56:12 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.10.13 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:24:38) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Windows-10-10.0.22631-SP0


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)



[ OUR RESPONSE ]
