# Loader Utility
The dataset changes over time, so to always stay up to date we fetch it with the Python Requests package.

We also use Git to synchronize our progress, so saving and tracking the dataset in the repository is unnecessary.

In [None]:
from python.defs import get_path
from python.utils.loader import Loader

# Resolve the dataset location once; the same value is handed to CSV_Reader
# in the cells below.
file_path = get_path()
# NOTE(review): presumably fetches the dataset into file_path with an HTTP GET
# (the intro text mentions the Requests package) — confirm in Loader.
Loader.send_get_request(file_path)

---
# Reader utility
Here we create a reader instance that encapsulates the dataset-handling logic.

In [None]:
from python.utils.csv_reader import CSV_Reader
# Build the reader from the location returned by get_path() and the file name.
# NOTE(review): presumably file_path is a directory and 'nba.csv' a file
# inside it — confirm in CSV_Reader.
reader = CSV_Reader(file_path, 'nba.csv')

Some examples of its functionality:

In [None]:
# Preview the dataset (presumably its first rows — confirm in CSV_Reader).
reader.get_dataset_head()

In [None]:
# Inspect the column types (presumably one dtype per column — confirm in CSV_Reader).
reader.get_types()

In [None]:
# Summary statistics for the dataset (presumably per column — confirm in CSV_Reader).
reader.get_dataset_statistic()

In [None]:
# Look at the contents of the "date_game" column.
# NOTE(review): whether this returns all values or only distinct ones is not
# visible here — confirm in CSV_Reader.
reader.get_column_values("date_game")

In [None]:
# Look at the contents of the "team_id" column.
# NOTE(review): whether this returns all values or only distinct ones is not
# visible here — confirm in CSV_Reader.
reader.get_column_values("team_id")

In [None]:
# Missing-value overview for the whole dataset
# (presumably a count per column — confirm in CSV_Reader).
reader.get_nulls_count()

In [None]:
# Missing-value count for "notes"; this column is deleted a few cells below.
reader.get_nulls_count_in_column("notes")

In [None]:
# Missing-value count for "team_id", for comparison with "notes" above.
reader.get_nulls_count_in_column("team_id")

Let's delete unused and null/NaN columns:

In [None]:
# Drop the "notes" column, then show the dataset head to check the result.
# NOTE(review): the return value is discarded, so this relies on delete_column
# modifying `reader` in place — confirm in CSV_Reader.
reader.delete_column("notes")
reader.get_dataset_head()

Rename some columns to clarify their purpose

In [None]:
# Terse source column names mapped to descriptive ones (old name -> new name).
# A dict keeps insertion order, so the renames run in the order listed.
column_renames = {
    "seasongame": "season_game",
    "pts": "points",
    "_iscopy": "is_copy",
    "lg_id": "league_id",
    "fran_id": "franchise_id",
}
for old_name, new_name in column_renames.items():
    reader.rename_column(old_name, new_name)

# Show the dataset head so the new column names can be checked.
reader.get_dataset_head()

And let's apply some other cleaning and data preparation methods:

In [None]:
# Convert "is_copy" (renamed from "_iscopy" in the previous cell) to "bool",
# then inspect the column types again to verify the change.
reader.change_data_type("is_copy", "bool")
reader.get_types()

In [None]:
# Compare the table shape before and after de-duplication to report whether
# any rows were removed.
# NOTE(review): relies on delete_duplicates modifying `reader` in place, since
# its return value is discarded — confirm in CSV_Reader.
old_shape = reader.get_table_shape()
reader.delete_duplicates()
# Fetch the new shape once and reuse it for both the comparison and the report.
new_shape = reader.get_table_shape()

if old_shape == new_shape:
    print("There are no duplicates in the table")
else:
    print("Duplicates successfully deleted")
print("Old shape:", old_shape)
print("Current shape:", new_shape)

In [None]:
# Check the "points" column for outliers before they are removed in the next cell.
# NOTE(review): the outlier criterion is defined in CSV_Reader and is not
# visible here.
reader.check_for_outliers_in_column("points")

In [None]:
# Remove the outliers of the "points" column, then show the dataset head.
# NOTE(review): presumably drops whole rows whose "points" value is an
# outlier — confirm in CSV_Reader.
reader.delete_outliers('points')
reader.get_dataset_head()

Finally, let's save the processed data to a file:

In [None]:
# Write the cleaned data to 'cleaned_dataset.csv'.
# NOTE(review): presumably saved under file_path, since the next cell reopens
# it with CSV_Reader(file_path, 'cleaned_dataset.csv') — confirm.
reader.save_to_file('cleaned_dataset.csv')

In [None]:
# Re-open the file written by save_to_file; `reader` now refers to the cleaned
# data, which the plotting cells below use.
reader = CSV_Reader(file_path, 'cleaned_dataset.csv')
reader.get_dataset_head()

---
# Data visualization

After processing the data, let's see how it looks.

In [None]:
# Underlying table of the reader (used as a pandas DataFrame from here on).
nba = reader.get_dataset()

# Number of rows per franchise; value_counts() sorts by frequency, so head(20)
# keeps the 20 most frequent franchises for the bar chart.
franchise_counts = nba["franchise_id"].value_counts()
franchise_counts.head(20).plot(kind="bar")

In [None]:
# Sum of "points" per "year_id" for the Lakers, drawn with the default
# (line) plot.
nba[nba["franchise_id"] == "Lakers"].groupby("year_id")["points"].sum().plot()
# The same plot for other franchises, left disabled here; the next cell draws
# all four franchises together.
# nba[nba["franchise_id"] == "Celtics"].groupby("year_id")["points"].sum().plot()
# nba[nba["franchise_id"] == "Warriors"].groupby("year_id")["points"].sum().plot()
# nba[nba["franchise_id"] == "Nets"].groupby("year_id")["points"].sum().plot()

In [None]:
import matplotlib.pyplot as plt

# Sum of "points" per "year_id" for four franchises, one line plot each.
for franchise in ("Lakers", "Celtics", "Warriors", "Nets"):
    franchise_games = nba[nba["franchise_id"] == franchise]
    franchise_games.groupby("year_id")["points"].sum().plot()

# Point totals for every franchise: one row per "year_id", one column per
# "franchise_id".
points_by_year_and_franchise = nba.groupby(["year_id", "franchise_id"])["points"].sum()
all_teams_data = points_by_year_and_franchise.unstack()

# Per-year mean and median across franchises.
mean_points = all_teams_data.mean(axis=1)
median_points = all_teams_data.median(axis=1)

# Horizontal reference lines: median of the yearly medians (red, dashed) and
# mean of the yearly means (green, solid).
plt.axhline(median_points.median(), linestyle='--', color='red')
plt.axhline(mean_points.mean(), linestyle='-', color='green')

In [None]:
import matplotlib.pyplot as plt

year_id = 1999


def _games_in_year(franchise):
    """Return the rows of `nba` for `franchise` whose "year_id" equals `year_id`."""
    is_franchise = nba["franchise_id"] == franchise
    is_year = nba["year_id"] == year_id
    return nba[is_franchise & is_year]


def _result_pie(slot, games, franchise):
    """Draw the "game_result" counts of `games` as a pie in cell `slot` of a 2x2 grid."""
    plt.subplot(2, 2, slot)
    games["game_result"].value_counts().plot(kind="pie")
    return plt.title(f"{franchise} Game Result")


# Select the rows of each franchise first, then draw one pie per franchise.
lakers_data = _games_in_year("Lakers")
celtics_data = _games_in_year("Celtics")
warriors_data = _games_in_year("Warriors")
nets_data = _games_in_year("Nets")

_result_pie(1, lakers_data, "Lakers")
_result_pie(2, celtics_data, "Celtics")
_result_pie(3, warriors_data, "Warriors")
_result_pie(4, nets_data, "Nets")

Now let's use the seaborn library to create other plots.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Only the first 50000 rows are used for this plot.
first_rows = nba.head(50000)

# Median "points" for every (game_location, game_result) combination.
heatmap_data = first_rows.pivot_table(
    values='points',
    index='game_location',
    columns='game_result',
    aggfunc='median',
)

# Annotate each cell with its value, formatted to one decimal place.
sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap='coolwarm')

In [None]:
# Columns compared in the pairplot.
columns_to_include = ['points', 'win_equiv']

# Restrict the DataFrame to just those columns.
subset_df = nba[columns_to_include]

# Pairwise plots with circle markers; histograms on the diagonal.
sns.pairplot(subset_df, markers='o', diag_kind='hist')

In [None]:
# Distribution of "points" per franchise, using only the first 100 rows.
first_games = nba.head(100)
sns.boxplot(data=first_games, x='franchise_id', y='points')

# Axis labels and title for the current axes.
plt.xlabel('Franchise')
plt.ylabel('Points')
plt.title('Boxplot of Points by Franchise')