In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F

from src.analyzer.euroleague_analyzer import EuroLeagueAnalyzer
from src.plotter.euroleague_plotter import EuroLeaguePlotter

In [None]:
# Create (or reuse) the Spark session for this notebook
spark = SparkSession.builder.appName("EuroLeagueAnalysis").getOrCreate()
# Enable Arrow-based columnar data transfers
# NOTE(review): Spark 3.x documents "spark.sql.execution.arrow.pyspark.enabled" as the
# replacement for this key — confirm the target Spark version before switching.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Read the CSV file into a DataFrame and infer schema
data_path = "data/games.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Columns needed for analysis
analysis_columns = ['season', 'team', 'PTS', 'OP_PTS', '3P%', 'OP_3P%', '2P%', 'OP_2P%', 'FTR', 'OP_FTR', 'OREB%', 'DREB%', 'ASTR', 'OP_ASTR', 
                    'TOVR', 'OP_TOVR', 'STLR', 'OP_STLR', 'BLKR', 'OP_BLKR', 'TS%', 'OP_TS%', 'OP_team', 'win']

# Number the rows 1..N in read order.
# monotonically_increasing_id() is only guaranteed to be unique and increasing, NOT
# consecutive (the partition index is encoded in the upper bits), so taking it modulo
# the row count yields wrong / duplicated numbers as soon as the data spans more than
# one partition. Ranking the rows by that id with row_number() gives a gap-free
# sequence and avoids the extra df.count() scan.
# The un-partitioned window pulls all rows into a single partition, which is acceptable
# here because the data is converted to pandas later anyway.
el_df = (
    df.select(*analysis_columns)
    .withColumn("_row_order", F.monotonically_increasing_id())
    .withColumn("game_no", F.row_number().over(Window.orderBy("_row_order")).cast("long"))  # keep bigint type
    .drop("_row_order")
)

# create EuroLeaguePlotter object to draw matplotlib graphs
el_plotter = EuroLeaguePlotter()

In [None]:
# Build the analyzer on top of the prepared game DataFrame and ask it for
# the per-team averages.
el_analyzer = EuroLeagueAnalyzer(spark, el_df)
avg_df = el_analyzer.calculate_team_averages()

# matplotlib needs local data, so collect the Spark result into pandas.
top_teams_pd = avg_df.toPandas()

# Chart of AVG_PTS_DIFF per team.
el_plotter.plot_basic_graph(
    top_teams_pd,
    "team",
    "AVG_PTS_DIFF",
    "Teams",
    "Average Points Difference",
    "Top Teams based on Average Points Difference",
)

In [None]:
# Compute the win streaks via the analyzer and preview the first 10 rows.
streak_df = el_analyzer.calculate_team_win_streaks()
streak_df.show(10)

# Collect the full streak result into pandas for plotting.
top_win_streaks_pd = streak_df.toPandas()

# Chart of streak_length per team, drawn in green.
el_plotter.plot_basic_graph(
    top_win_streaks_pd,
    "team",
    "streak_length",
    "Teams",
    "Most consecutive wins",
    "Top Teams based on Consecutive Wins",
    "green",
)

# Release the Spark session now that the analysis is finished.
spark.stop()