In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from src.analyzer.el_analyzer import ElAnalyzer
import matplotlib.pyplot as plt

In [None]:
# Create (or reuse) the Spark session for this notebook
spark = SparkSession.builder.appName("EuroleagueAnalysis").getOrCreate()

# Enable Arrow-based columnar data transfers between Spark and pandas.
# Spark 3.0 renamed this option to "spark.sql.execution.arrow.pyspark.enabled" and
# deprecated the old key, so pick the name that matches the running Spark version.
spark_major_version = int(spark.version.split(".")[0])
arrow_conf_key = (
    "spark.sql.execution.arrow.pyspark.enabled"
    if spark_major_version >= 3
    else "spark.sql.execution.arrow.enabled"
)
spark.conf.set(arrow_conf_key, "true")

# Read the CSV file into a DataFrame: the first row is the header, column types are inferred
data_path = "data/games.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Keep only the columns needed for the analysis
analysis_columns = ['season', 'team', 'PTS', 'OP_PTS', '3P%', 'OP_3P%', '2P%', 'OP_2P%', 'FTR', 'OP_FTR', 'OREB%', 'DREB%', 'ASTR', 'OP_ASTR', 
                    'TOVR', 'OP_TOVR', 'STLR', 'OP_STLR', 'BLKR', 'OP_BLKR', 'TS%', 'OP_TS%', 'OP_team', 'win']
el_df = df.select(*analysis_columns)

In [None]:
# Build the analyzer over the Euroleague DataFrame and compute its averages
el_analyzer = ElAnalyzer(spark, el_df)
avg_df = el_analyzer.calculate_averages()

# Convert the Spark result to a pandas DataFrame so matplotlib can plot it
top_teams_pd = avg_df.toPandas()

# Bar chart of AVG_PTS_DIFF per team, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_teams_pd["team"], top_teams_pd["AVG_PTS_DIFF"], color='blue')
ax.set(
    xlabel='Teams',
    ylabel='Average Points Difference',
    title='Top Teams based on Average Points Difference',
)
# Slant the team names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Stop the Spark session
spark.stop()