# ML with Pyspark: Linear Regresion to predict expected goals in the next n matches by Player based on player's stats per 90 minutes

In [1]:
# pyuspark session

from pyspark.sql import SparkSession

spark = SparkSession.builder \
               .appName('ml') \
               .getOrCreate()

In [3]:
import os
from pyspark.sql.functions import lit

# Path to the folder where the CSVs are stored
folder_path = "../data/football-players-DB/"

# List of all CSV files
csv_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".csv")]

# Initialize an empty DataFrame to store the final concatenated result
final_df = None
schema = None

# Iterate over each CSV file
for i, file in enumerate(csv_files):
    file_name = os.path.basename(file)  # Get the file name without the path
    print(f"Processing file {i+1}/{len(csv_files)}: {file_name}")
    
    # Extract the season from the last 4 or 5 characters of the file name (without the '.csv')
    season = file_name[-9:-4] if "-" in file_name[-9:-4] else file_name[-8:-4]
    print(f"Detected season: {season}")
    
    # Read each CSV file with schema inference
    df = spark.read.csv(file, header=True, inferSchema=True)
    
    # Add a new column 'season' with the extracted season value using lit()
    df = df.withColumn("season", lit(season))
    
    # If it's the first file, use its schema as the base schema
    if schema is None:
        schema = df.schema
        final_df = df
    else:
        # Check if the current file has the same schema as the base schema
        if df.schema == schema:
            # Concatenate the DataFrame if the schema matches
            final_df = final_df.unionByName(df)
        else:
            print(f"Schema mismatch found in file: {file_name}")
            # Optionally handle the mismatch by adjusting the schema

Processing file 1/888: 1. HNL 20-21.csv
Detected season: 20-21
Processing file 2/888: 1. HNL 21-22.csv
Detected season: 21-22
Schema mismatch found in file: 1. HNL 21-22.csv
Processing file 3/888: 1. HNL 22-23.csv
Detected season: 22-23
Processing file 4/888: 1. HNL 23-24.csv
Detected season: 23-24
Schema mismatch found in file: 1. HNL 23-24.csv
Processing file 5/888: 1. HNL 24-25.csv
Detected season: 24-25
Schema mismatch found in file: 1. HNL 24-25.csv
Processing file 6/888: 2. Bundesliga 20-21.csv
Detected season: 20-21
Processing file 7/888: 2. Bundesliga 21-22.csv
Detected season: 21-22
Processing file 8/888: 2. Bundesliga 22-23.csv
Detected season: 22-23
Schema mismatch found in file: 2. Bundesliga 22-23.csv
Processing file 9/888: 2. Bundesliga 23-24.csv
Detected season: 23-24
Schema mismatch found in file: 2. Bundesliga 23-24.csv
Processing file 10/888: 2. Bundesliga 24-25.csv
Detected season: 24-25
Schema mismatch found in file: 2. Bundesliga 24-25.csv
Processing file 11/888: 2

In [4]:
df.printSchema()

root
 |-- Player: string (nullable = true)
 |-- Full name: string (nullable = true)
 |-- Wyscout id: integer (nullable = true)
 |-- Team: string (nullable = true)
 |-- Team within selected timeframe: string (nullable = true)
 |-- Team logo: string (nullable = true)
 |-- Competition: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Primary position: string (nullable = true)
 |-- Primary position, %: integer (nullable = true)
 |-- Secondary position: string (nullable = true)
 |-- Secondary position, %: integer (nullable = true)
 |-- Third position: string (nullable = true)
 |-- Third position, %: integer (nullable = true)
 |-- Age: double (nullable = true)
 |-- Birthday: date (nullable = true)
 |-- Market value: integer (nullable = true)
 |-- Contract expires: date (nullable = true)
 |-- Matches played: integer (nullable = true)
 |-- Minutes played: integer (nullable = true)
 |-- Goals: integer (nullable = true)
 |-- xG: double (nullable = true)
 |-- Assists: integer

In [6]:
df.show()

+----------------+--------------------+----------+------------+------------------------------+--------------------+-----------+----------------+----------------+-------------------+------------------+---------------------+--------------+-----------------+----+----------+------------+----------------+--------------+--------------+-----+------------------+-------+----+------------------+------------+-------------+--------------------+-------+------+------+-------+-----------------------------------+----------------------+----------------------+---------------------+-------------------+----------------------+--------------------+--------------------+--------------------+------------------+------------------+------------+-------------------+---------+----------------+-----------------------------------+------------+-----------------+------------------------+---------+----------+-----------------+-----+------------------+------------------+------------------+--------------+-----------------

In [8]:
# Order by highest 'Goals per 90' and show the top 10 players
df.orderBy(df["Goals per 90"].desc()).show(10)

+-------------+--------------------+----------+------------+------------------------------+--------------------+-----------+-----------------+----------------+-------------------+------------------+---------------------+--------------+-----------------+----+----------+------------+----------------+--------------+--------------+-----+----+-------+-------------------+------------+------------+-------------+----------------+-------+------+------+-------+-----------------------------------+----------------------+----------------------+---------------------+-------------------+----------------------+--------------------+--------------------+--------------------+------------------+------------------+------------+-------------------+---------+----------------+-----------------------------------+------------------+-----------------+------------------------+---------+----------+-----------------+-----+------------+------------------+------------------+-------------------+-------------------+---