In [29]:
# --- 0. imports -------------------------------------------------------------
import pandas as pd
import numpy as np
import statsmodels.api as sm

START_CHUNK = 7
END_CHUNK = 9

# --- 1. load CSV ------------------------------------------------------------
PATH = "All_Chunked_Presence_in_All_Movies_DELUXE.csv"   # <-- update if needed
df   = pd.read_csv(PATH)

# --- 2. create boolean rival flag ------------------------------------------
df["is_rival"] = df["Role"].str.lower().eq("rival")

# --- 3. collapse to rival / other counts per film × clip -------------------
agg = (df.groupby(["Title", "Timeline", "is_rival"])["Value"]
         .sum()
         .unstack(fill_value=0)                 # columns = {False:'other', True:'rival'}
         .rename(columns={False: "other", True: "rival"})
         .reset_index())


agg = agg[agg["Timeline"].between(START_CHUNK, END_CHUNK)]   # <-- restrict to chunks 4-8

# --- 4. corpus-level GLM with cluster-robust SEs ---------------------------
y = agg[["rival", "other"]]                      # successes / failures
X = sm.add_constant(agg["Timeline"])             # linear trend  (clip 1-9)
glm_corpus = sm.GLM(y, X, family=sm.families.Binomial()).fit(
                 cov_type="cluster",
                 cov_kwds={"groups": agg["Title"]})

print("CORPUS-LEVEL MODEL")
print(glm_corpus.summary())
print()

# --- 5. per-film slopes -----------------------------------------------------
results = []
for film, sub in agg.groupby("Title"):
    y_f = sub[["rival", "other"]]
    X_f = sm.add_constant(sub["Timeline"])
    fit  = sm.GLM(y_f, X_f, family=sm.families.Binomial()).fit()
    results.append({
        "Film": film,
        "Slope (β)":   fit.params["Timeline"],
        "p-value":     fit.pvalues["Timeline"]
    })


per_film = (pd.DataFrame(results)
              .sort_values("p-value")
              .reset_index(drop=True))
print("PER-FILM SLOPES")
print(per_film)

# (optional) save the table
per_film.to_csv("per_film_rival_slopes.csv", index=False)

# show only the films whose rival-slope p-value is below .05
sig = per_film[per_film["p-value"] < 0.05]

print("\n\n\n")
print("FILMS WITH SIGNIFICANT RIVAL FADE-OUT (p < .05)")
print("Filter other chunks not in " + f"range {START_CHUNK}-{END_CHUNK}")
print(sig.to_string(index=False, float_format="%.4g"))


CORPUS-LEVEL MODEL
                 Generalized Linear Model Regression Results                  
Dep. Variable:     ['rival', 'other']   No. Observations:                   48
Model:                            GLM   Df Residuals:                       46
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -558.42
Date:                Tue, 20 May 2025   Deviance:                       955.15
Time:                        17:43:12   Pearson chi2:                     916.
No. Iterations:                     5   Pseudo R-squ. (CS):            0.06078
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1100      1.676 

In [None]:
# gmm_simple.py
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture

# ---------- Settings ----------
csv_path     = "rival_end.csv"      # CSV file
value_column = "Value"              # Numeric column to cluster
n_components = 3                    # Number of mixture components
random_seed  = 42                   # Reproducible centroids
# --------------------------------

# 1. Load CSV
df = pd.read_csv(csv_path)

# 2. If your numeric column is literally named "Sum(8-9)", rename safely:
df = df.rename(columns={"Sum(8-9)": "Value"})
value_column = "Value"

# 3. Ensure numeric
df[value_column] = pd.to_numeric(df[value_column], errors="raise")

# 4. Fit GMM
gmm = GaussianMixture(
    n_components=n_components,
    random_state=random_seed,
    n_init=10
).fit(df[[value_column]])

df["GMM_"] = gmm.predict(df[[value_column]])

# 5. Report
centers = np.sort(gmm.means_.flatten())
print("GMM cluster centers:", np.round(centers, 3))
print(df.to_string(index=False))

# Optional: save
# df.to_csv("rival_end_with_clusters.csv", index=False)

GMM cluster centers: [0.201 0.575 1.229]
                     Entity    Value  GMM_
     Cliff:::Professor Yuan 0.023810     1
             Gilda:::Ballin 0.046875     1
            Sabrina:::David 0.166667     1
    Waves of Life:::Bo Kang 0.233150     1
        Human Desire:::Carl 0.246571     1
             Oklahoma:::Jud 0.295582     1
 Notorious:::Alex Sebastian 0.327958     1
        Jigoku Mon:::Wataru 0.432534     2
     Diary of Nurse:::Haoru 0.482143     2
Kurutta Kajitsu:::Natsuhisa 0.505690     2
 Black River:::Hitokiri Joe 0.556202     2
    Duel in the Sun:::Jesse 0.576923     2
       Our Village:::Keming 0.605938     2
   Chikamatsu Story:::Ishun 0.691125     2
 The Man from Laramie:::Vic 0.788767     2
            Vikings:::Einar 1.229167     0
