In [5]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Define paths
DATA_PATH = "./data/spreadspoke_scores.csv"
OUTPUT_DIR = "./output"
PLOTS_DIR = "./plots"

# Ensure output directories exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

def remove_nan(df):
    """Drops rows with NaN values."""
    return df.dropna()

def replace_nan(df):
    """Replaces NaN values: mean for numerical columns, mode for categorical."""
    output = df.copy()
    for col in output.columns:
        if output[col].dtype in ['float64', 'int64']:
            output[col].fillna(output[col].mean(), inplace=True)
        else:
            output[col].fillna(output[col].mode()[0], inplace=True)
    return output

def compile(path):
    """Cleans data and generates summary statistics."""
    df = pd.read_csv(path)
    df_cleaned = replace_nan(df)
    summary = {"numerical": df_cleaned.describe().to_dict()}

    categorical_summary = {}
    for col in df_cleaned.select_dtypes(include=['object']).columns:
        categorical_summary[col] = {
            "unique": df_cleaned[col].unique().tolist(),
            "counts": df_cleaned[col].value_counts().to_dict()
        }
    summary["categorical"] = categorical_summary

    return df_cleaned, summary

# Process data
df, summary_stats = compile(DATA_PATH)

# Save cleaned data
cleaned_path = f"{OUTPUT_DIR}/spreadspoke_scores_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print(f"Cleaned data saved to {cleaned_path}")

# Read cleaned data for visualization
df = pd.read_csv(cleaned_path)
df = df.apply(pd.to_numeric, errors='ignore')

# Generate Histograms for Numerical Columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_columns:
    plt.figure()
    sns.histplot(df[col], bins=30, kde=True)
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.title(f"Histogram for {col}")
    plt.savefig(f"{PLOTS_DIR}/histogram_{col}.png")
    plt.close()

print("All visualizations saved in the 'plots' directory.")


Cleaned data saved to ./output/spreadspoke_scores_cleaned.csv
All visualizations saved in the 'plots' directory.


In [6]:
df = pd.read_csv(cleaned_path)
df = df.apply(pd.to_numeric, errors='ignore')

plot_dir = "plots"
os.makedirs(plot_dir, exist_ok=True)

numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_columns:
    plt.figure()
    sns.histplot(df[col], bins=30, kde=True)
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.title(f"Histogram for {col}")
    plt.savefig(f"{plot_dir}/histogram_{col}.png")
    plt.close()

categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    plt.figure()
    df[col].value_counts().plot(kind='bar', color='skyblue')
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.title(f"Bar for {col}")
    plt.xticks(rotation=45)
    plt.savefig(f"{plot_dir}/barplot_{col}.png")
    plt.close()

violin_pairs = [("score_home", "team_home"), ("score_away", "team_away"), ("weather_temperature", "stadium")]
for num_col, cat_col in violin_pairs:
    plt.figure(figsize=(12, 6))
    sns.violinplot(x=df[cat_col], y=df[num_col])
    plt.xlabel(cat_col)
    plt.ylabel(num_col)
    plt.title(f"Violin for {num_col} by {cat_col}")
    plt.xticks(rotation=90)
    plt.savefig(f"{plot_dir}/violin_{num_col}_by_{cat_col}.png")
    plt.close()

scatter_pairs = [("score_home", "score_away"), ("weather_temperature", "weather_wind_mph"), ("spread_favorite", "over_under_line")]
for x_col, y_col in scatter_pairs:
    plt.figure()
    sns.scatterplot(x=df[x_col], y=df[y_col])
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f"Scatter for {x_col} vs {y_col}")
    plt.savefig(f"{plot_dir}/scatter_{x_col}_vs_{y_col}.png")
    plt.close()

plt.figure(figsize=(10, 8))
corr = df.select_dtypes(include=['int64', 'float64']).corr()
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix")
plt.savefig(f"{plot_dir}/correlation_matrix.png")
plt.close()

plt.figure()
sns.boxplot(x=df["schedule_week"], y=df["score_home"])
plt.xlabel("Schedule Week")
plt.ylabel("Home Team Score")
plt.title("Box Plot of Home Score by Week")
plt.savefig(f"{plot_dir}/boxplot_schedule_week_score_home.png")
plt.close()

plt.figure()
sns.lineplot(x=df["schedule_season"], y=df["score_home"], ci=None)
plt.xlabel("Season")
plt.ylabel("Home Team Score")
plt.title("Home Scores Over Seasons")
plt.savefig(f"{plot_dir}/lineplot_season_score_home.png")
plt.close()


The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.lineplot(x=df["schedule_season"], y=df["score_home"], ci=None)


In [7]:
print(summary_stats)

{'numerical': {'schedule_season': {'count': 14086.0, 'mean': 1997.1923896067017, 'std': 16.71193134299691, 'min': 1966.0, '25%': 1983.0, '50%': 1998.0, '75%': 2012.0, 'max': 2024.0}, 'score_home': {'count': 14086.0, 'mean': 22.51370154763595, 'std': 10.519008794254821, 'min': 0.0, '25%': 15.0, '50%': 22.0, '75%': 30.0, 'max': 72.0}, 'score_away': {'count': 14086.0, 'mean': 19.839698991906857, 'std': 10.141589177683171, 'min': 0.0, '25%': 13.0, '50%': 20.0, '75%': 27.0, 'max': 62.0}, 'spread_favorite': {'count': 14086.0, 'mean': -5.360644438700784, 'std': 3.11322077339942, 'min': -26.5, '25%': -6.5, '50%': -5.360644438700784, '75%': -3.0, 'max': 0.0}, 'weather_temperature': {'count': 14086.0, 'mean': 59.00406374501992, 'std': 14.664562712750417, 'min': -6.0, '25%': 50.0, '50%': 59.00406374501992, '75%': 72.0, 'max': 97.0}, 'weather_wind_mph': {'count': 14086.0, 'mean': 7.56135311951492, 'std': 5.404492796002771, 'min': 0.0, '25%': 4.0, '50%': 7.561353119514919, '75%': 11.0, 'max': 40.0}