<a href="https://colab.research.google.com/github/ChrisMoc07/Love-Data-Week-Project/blob/main/LoveDataWeek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install pandas plotly pillow scikit-learn

import io
import os
import re

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from google.colab import files
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Ask for the dataset and the logo images in one Colab upload dialog.
# files.upload() returns a mapping of {filename: file contents as bytes}.
uploaded = files.upload()

# Sort the uploaded names into "the CSV" and "logo images" by extension.
csv_name = None
logo_files = []
for k in uploaded:
    lowered = k.lower()
    if lowered.endswith(".csv"):
        csv_name = k  # if several CSVs are uploaded, the last one wins
    elif lowered.endswith((".png", ".jpg", ".jpeg", ".webp")):
        logo_files.append(k)

if csv_name is None:
    raise ValueError("Upload fight-songs.csv AND your PNG logos together.")

# Parse the uploaded bytes rather than re-opening the file by name: a
# re-uploaded file may be saved on disk under a de-duplicated name such as
# "fight-songs (1).csv", so a path lookup can miss it or hit a stale copy.
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
df.columns = df.columns.str.strip()  # tolerate padded header cells

def yn01(s):
    """Convert a yes/no text Series into 0/1 integers.

    Values are compared after stringifying, trimming whitespace and
    lower-casing. "yes" becomes 1, "no" becomes 0, and anything else
    (including missing values) becomes 0.
    """
    cleaned = s.astype(str).str.strip().str.lower()
    flags = cleaned.map({"yes": 1, "no": 0})
    # Unrecognised answers come back from map() as NaN; count them as "no".
    return flags.fillna(0).astype(int)

# "school" and "conference" are the grouping keys for everything that follows,
# so stop with a clear message if the CSV does not provide them.
missing_cols = [c for c in ("school", "conference") if c not in df.columns]
if missing_cols:
    raise ValueError(
        "CSV is missing required column(s): " + ", ".join(missing_cols)
    )

for c in ("school", "conference"):
    raw = df[c]
    # astype(str) would turn a missing value into the literal text "nan",
    # which would slip past the empty-string filter below; blank it instead.
    df[c] = raw.astype(str).str.strip().where(raw.notna(), "")

# Numeric inputs: coerce unparseable cells to NaN, and create the column as
# all-NaN when the CSV does not have it.
for c in ["number_fights", "trope_count", "bpm", "sec_duration"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    else:
        df[c] = np.nan

# Yes/No inputs: encode as 0/1, defaulting to 0 when the column is absent.
yn_cols = ["rah", "spelling", "opponents", "men", "colors", "nonsense", "victory", "win_won"]
for c in yn_cols:
    if c in df.columns:
        df[c] = yn01(df[c])
    else:
        df[c] = 0

# Composite per-song scores built from the cleaned columns.
df["crowd_hype"] = df["rah"] + df["spelling"] + df["opponents"] + df["men"] + df["colors"]
df["aggression"] = df["number_fights"].fillna(0) + (df["victory"] + df["win_won"])
df["tropes"] = df["trope_count"].fillna(0)

# Keep only rows that identify both a school and a conference.
df = df[(df["school"] != "") & (df["conference"] != "")].copy()

# Collapse the song-level rows to one row per (school, conference): the mean
# of every score plus the number of songs that went into it.
named_aggs = {
    "crowd_hype": ("crowd_hype", "mean"),
    "aggression": ("aggression", "mean"),
    "tropes": ("tropes", "mean"),
    "nonsense": ("nonsense", "mean"),
    "bpm": ("bpm", "mean"),
    "sec_duration": ("sec_duration", "mean"),
    "n_songs": ("school", "size"),
}
school = df.groupby(["school", "conference"], as_index=False).agg(**named_aggs)

# Schools without a tempo or a duration cannot be placed on the map.
school = school.dropna(subset=["bpm", "sec_duration"]).copy()

# Standardise the features, then project them onto two principal components.
features = ["crowd_hype", "aggression", "tropes", "nonsense", "bpm", "sec_duration"]
X = StandardScaler().fit_transform(school[features].to_numpy(dtype=float))

pca = PCA(n_components=2, random_state=0)
PC = pca.fit_transform(X)

school["pc1"], school["pc2"] = PC[:, 0], PC[:, 1]

def norm_key(s):
    """Return a lookup key for *s*: lower-case, underscore-separated.

    Every run of characters other than a-z and 0-9 becomes one underscore,
    and underscores at either end are removed, e.g. "Ohio State" ->
    "ohio_state".
    """
    lowered = s.lower().strip()
    underscored = re.sub(r"[^a-z0-9]+", "_", lowered)
    collapsed = re.sub(r"_+", "_", underscored)
    return collapsed.strip("_")

# Index the uploaded images by the normalised form of their file stem, so
# "Ohio State.png" is found under the key "ohio_state".
logo_map = {
    norm_key(os.path.splitext(os.path.basename(lf))[0]): lf
    for lf in logo_files
}

# Schools that may be drawn as a logo: normalised key -> name used in the CSV.
logo_targets = {
    "illinois": "Illinois",
    "ohio_state": "Ohio State",
    "michigan": "Michigan",
    "alabama": "Alabama",
    "notre_dame": "Notre Dame"
}

# A school gets a logo only when it is in the aggregated data AND a matching
# image was uploaded.
known_schools = school["school"].values
targets = [
    sname
    for k, sname in logo_targets.items()
    if sname in known_schools and k in logo_map
]

fig = go.Figure()

# Split the schools: those in `targets` go into `logos` (kept out of the
# marker traces), everything else is plotted as markers.
mask_logo = school["school"].isin(targets)
base = school[~mask_logo].copy()
logos = school[mask_logo].copy()

# Columns exposed to the hover template, in customdata order.
hover_fields = [
    "crowd_hype", "aggression", "tropes", "nonsense",
    "bpm", "sec_duration", "n_songs",
]

# One marker trace per conference, so each conference gets its own legend
# entry. Each school is drawn exactly once: an additional all-schools trace
# would stack a second marker (and a second hover target) on every point.
# Grouping `base` (not `school`) also avoids empty legend entries for
# conferences whose only schools are shown as logos.
for conf, d in base.groupby("conference", sort=False):
    fig.add_trace(go.Scatter(
        x=d["pc1"],
        y=d["pc2"],
        mode="markers",
        marker=dict(
            # Marker size grows with the square root of the song count,
            # clamped to the 6-30 range.
            size=np.clip(6 + np.sqrt(d["n_songs"].to_numpy()) * 3, 6, 30),
            opacity=0.75
        ),
        name=conf,
        text=d["school"],
        customdata=d[hover_fields].to_numpy(dtype=float),
        hovertemplate=(
            "<b>%{text}</b><br>"
            f"Conference: {conf}<br>"
            "Crowd hype: %{customdata[0]:.2f}<br>"
            "Aggression: %{customdata[1]:.2f}<br>"
            "Tropes: %{customdata[2]:.2f}<br>"
            "Nonsense: %{customdata[3]:.2f}<br>"
            "BPM: %{customdata[4]:.1f}<br>"
            "Duration (sec): %{customdata[5]:.1f}<br>"
            "Songs in dataset: %{customdata[6]}<extra></extra>"
        ),
        showlegend=True
    ))

# Axis limits: the extent of the PCA scores plus a margin on each side
# (8% of the span horizontally, 10% vertically).
x_lo, x_hi = school["pc1"].min(), school["pc1"].max()
y_lo, y_hi = school["pc2"].min(), school["pc2"].max()
x_pad = (x_hi - x_lo) * 0.08
y_pad = (y_hi - y_lo) * 0.10
x_range = [x_lo - x_pad, x_hi + x_pad]
y_range = [y_lo - y_pad, y_hi + y_pad]

# Longest edge, in pixels, of a logo after resizing.
max_side = 110
# Footprint of every logo in data units: a fixed fraction of the axis spans.
logo_w = (x_range[1] - x_range[0]) * 0.04
logo_h = (y_range[1] - y_range[0]) * 0.08

# Draw each logo school as an image centred on its PCA position, with an
# arrow annotation carrying the school name.
for _, r in logos.iterrows():
    k = norm_key(r["school"])
    if k not in logo_map:
        continue

    # Decode straight from the uploaded bytes (logo_map values are keys of
    # `uploaded`) and close the source handle as soon as the pixels are
    # copied by convert().
    with Image.open(io.BytesIO(uploaded[logo_map[k]])) as raw:
        img = raw.convert("RGBA")

    w, h = img.size
    scale = max_side / max(w, h)
    img = img.resize((max(1, int(w * scale)), max(1, int(h * scale))))

    # The PIL image is handed to Plotly directly; no temporary file is
    # written, so the script does not depend on any scratch directory.
    fig.add_layout_image(dict(
        source=img,
        xref="x",
        yref="y",
        x=float(r["pc1"]),
        y=float(r["pc2"]),
        sizex=logo_w,
        sizey=logo_h,
        xanchor="center",
        yanchor="middle",
        layer="above"
    ))
    fig.add_annotation(
        x=float(r["pc1"]),
        y=float(r["pc2"]),
        text=r["school"],
        showarrow=True,
        arrowhead=2,
        ax=30,
        ay=-30
    )

# Draw one labelled line per feature from the origin to its (PC1, PC2)
# loading. `scale` stretches the lines by a constant factor.
loadings = pca.components_.T
scale = 2.6
for label, (tip_x, tip_y) in zip(features, loadings[:, :2] * scale):
    vector = go.Scatter(
        x=[0, tip_x],
        y=[0, tip_y],
        mode="lines+text",
        text=["", label],  # label only the far end of the line
        textposition="top center",
        showlegend=False
    )
    fig.add_trace(vector)

# Axis titles report how much variance each principal component explains.
pc1_pct = pca.explained_variance_ratio_[0] * 100
pc2_pct = pca.explained_variance_ratio_[1] * 100

layout_options = dict(
    title="Fight Song Style Map (PCA): What lyrical/tempo features cluster together?",
    xaxis_title=f"PC1 ({pc1_pct:.1f}% variance)",
    yaxis_title=f"PC2 ({pc2_pct:.1f}% variance)",
    height=850,
    margin=dict(l=60, r=40, t=80, b=60),
)
fig.update_layout(**layout_options)

# Pin both axes to the padded ranges and show the zero lines.
fig.update_xaxes(range=x_range, zeroline=True)
fig.update_yaxes(range=y_range, zeroline=True)

# Export a standalone page (plotly.js loaded from the CDN) and hand it to
# the browser as a download.
fig.write_html("index.html", include_plotlyjs="cdn")
files.download("index.html")

Saving fight-songs.csv to fight-songs (1).csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>