In [5]:
# --- 0. imports -------------------------------------------------------------
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Toggle: when True, the analysis is limited to timeline chunks 4 through 8.
ONLY_CHUNKS_4_TO_8 = True

# --- 1. load CSV ------------------------------------------------------------
# Location of the input table; update if the file lives elsewhere.
PATH = "All_Chunked_Presence_in_All_Movies_DELUXE.csv"
df = pd.read_csv(PATH)

# --- 2. create boolean rival flag ------------------------------------------
# Case-insensitive comparison on the Role column: True only where the
# lower-cased role equals "rival".
role_lower = df["Role"].str.lower()
df["is_rival"] = role_lower == "rival"

# --- 3. collapse to rival / other counts per film × clip -------------------
# Sum Value within every (Title, Timeline, is_rival) cell, then pivot the
# boolean flag into two count columns named "other" and "rival".
agg = (df.groupby(["Title", "Timeline", "is_rival"])["Value"]
         .sum()
         .unstack(fill_value=0)                 # columns = {False, True}
         .rename(columns={False: "other", True: "rival"})
         # Guarantee both count columns exist even when the data holds no
         # rival rows (or only rival rows); otherwise the later
         # agg[["rival", "other"]] lookup raises KeyError.
         .reindex(columns=["other", "rival"], fill_value=0)
         .reset_index())

# A row with zero total count is a binomial observation with zero trials:
# it carries no information, and the successes/trials proportion computed by
# the Binomial family would be 0/0 = NaN, which breaks the fit. Drop such rows.
agg = agg[(agg["rival"] + agg["other"]) > 0]

if ONLY_CHUNKS_4_TO_8:
    agg = agg[agg["Timeline"].between(4, 8)]   # <-- restrict to chunks 4-8

# --- 4. corpus-level GLM with cluster-robust SEs ---------------------------
# Two-column response: rival counts are the successes, other counts the failures.
y = agg[["rival", "other"]]
# Design matrix: a constant plus a linear term in Timeline.
X = sm.add_constant(agg["Timeline"])

# Binomial-family GLM; the covariance is clustered on film title.
corpus_model = sm.GLM(y, X, family=sm.families.Binomial())
cluster_opts = {"groups": agg["Title"]}
glm_corpus = corpus_model.fit(cov_type="cluster", cov_kwds=cluster_opts)

print("CORPUS-LEVEL MODEL")
print(glm_corpus.summary())
print()

# --- 5. per-film slopes -----------------------------------------------------
# Fit the same binomial trend model separately for each film and collect the
# Timeline coefficient together with its p-value.
results = []
for film, sub in agg.groupby("Title"):
    if sub["Timeline"].nunique() < 2:
        # With a single distinct Timeline value the slope is not identifiable.
        # Worse, sm.add_constant (has_constant="skip" by default) silently
        # omits the intercept for a constant column, so the "Timeline"
        # coefficient would not be a slope at all. Record missing values
        # instead of a misleading number.
        slope, p_value = np.nan, np.nan
    else:
        y_f = sub[["rival", "other"]]
        X_f = sm.add_constant(sub["Timeline"])
        fit = sm.GLM(y_f, X_f, family=sm.families.Binomial()).fit()
        slope, p_value = fit.params["Timeline"], fit.pvalues["Timeline"]
    results.append({
        "Film": film,
        "Slope (β)": slope,
        "p-value": p_value,
    })

# Explicit columns keep the table well-formed (and sortable by "p-value")
# even when no film was processed. NaN p-values sort last.
per_film = (pd.DataFrame(results, columns=["Film", "Slope (β)", "p-value"])
              .sort_values("p-value")
              .reset_index(drop=True))
print("PER-FILM SLOPES")
print(per_film)

# (optional) save the table
per_film.to_csv("per_film_rival_slopes.csv", index=False)

# Keep only the rows whose slope p-value is below the .05 threshold.
# NOTE(review): the filter looks at the p-value only, not the sign of the
# slope, so a significantly positive slope would also be listed under the
# "fade-out" heading below -- confirm this is intended.
is_significant = per_film["p-value"].lt(0.05)
sig = per_film.loc[is_significant]

print("\n" * 3)
print("FILMS WITH SIGNIFICANT RIVAL FADE-OUT (p < .05)")
print(f"Filter other chunks not within 4-8: {ONLY_CHUNKS_4_TO_8}")
print(sig.to_string(index=False, float_format="%.4g"))


CORPUS-LEVEL MODEL
                 Generalized Linear Model Regression Results                  
Dep. Variable:     ['rival', 'other']   No. Observations:                   80
Model:                            GLM   Df Residuals:                       78
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -843.61
Date:                Thu, 15 May 2025   Deviance:                       1384.4
Time:                        21:15:55   Pearson chi2:                 1.37e+03
No. Iterations:                     5   Pseudo R-squ. (CS):             0.4633
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.4421      0.569 