In [1]:
import pandas as pd
import numpy as np

def sanatize(df):
    cols = df.columns.difference(['username'])
    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')


def user_intersection(df_1, df_2):
    user_col = 'username'
    common = set(df_1[user_col]) & set(df_2[user_col])

    a = (df_1[df_1[user_col].isin(common)]
         .sort_values(user_col)
         .drop_duplicates(subset=[user_col], keep="first")
         .reset_index(drop=True))
    b = (df_2[df_2[user_col].isin(common)]
         .sort_values(user_col)
         .drop_duplicates(subset=[user_col], keep="first")
         .reset_index(drop=True))

    a.insert(0, "user_id", range(len(a)))
    b.insert(0, "user_id", range(len(b)))

    assert len(a) == len(b), "still mismatched — check for NaNs or types"
    return a, b

def add_mean_std(df):
    values = np.arange(0.5, 5.5, 0.5)
    count_cols = [str(x) for x in values]
    vals = pd.Series(values, index=count_cols)
    n = df[count_cols].sum(axis=1)
    df['mean'] = (df[count_cols] @ vals) / n
    m2 = (df[count_cols] @ (vals**2)) / n
    var_pop = m2 - df['mean']**2
    df['std'] = np.sqrt(var_pop * n/(n-1))



def change_raw_rating_to_z_score(df):
    n_rows, n_cols = df.shape
    for r in range(n_rows):
        user = df.iat[r, 1]
        mean = user_rating_counts.at[user, 'mean']
        std = user_rating_counts.at[user, 'std']
        for c in range(2, n_cols):
            rating = df.iat[r, c]
            df.iat[r, c] = np.clip((rating - mean)/std, -4, 4)

award_reviews = pd.read_csv('user_award_reviews.csv')
non_award_reviews = pd.read_csv('user_non_award_reviews.csv')
user_rating_counts = pd.read_csv('user_rating_counts.csv')

award_reviews, non_award_reviews = user_intersection(award_reviews, non_award_reviews)

# non_award_reviews.drop(columns=['wicked-2024', 'emilia-perez', 'i-tonya', 'the-banshees-of-inisherin'], inplace=True)


sanatize(award_reviews)
sanatize(non_award_reviews)
sanatize(user_rating_counts)

add_mean_std(user_rating_counts)
user_rating_counts.set_index('username', inplace=True)

change_raw_rating_to_z_score(award_reviews)
change_raw_rating_to_z_score(non_award_reviews)

input_length = len(award_reviews)

user_to_id = dict(zip(award_reviews['username'], award_reviews['user_id']))
id_to_user = dict(zip(award_reviews['user_id'], award_reviews['username']))



In [2]:
y = {movie : 1 for movie in award_reviews.columns[2:]} | {movie : 0 for movie in non_award_reviews.columns[2:]}
training_yrs = (2015, 2021)
validation_yrs = (2022, 2023)
test_yr = (2024, 2025)

awards = pd.read_csv('awarded_movie_date.csv')
non_awards = pd.read_csv('non_awarded_movie_date.csv')
def movie_range(df, time_frame):
    start, end = time_frame
    df['date'] = pd.to_datetime(
        df['date'], format='%m/%d/%Y', errors='coerce'
    )
    mask = df['date'].dt.year.between(start, end, inclusive='both')
    return df.loc[mask, 'movie'].tolist()


train_movies = movie_range(awards, training_yrs) + movie_range(non_awards, training_yrs)
val_movies = movie_range(awards, validation_yrs) + movie_range(non_awards, validation_yrs)
test_movies = movie_range(awards, test_yr) + movie_range(non_awards, test_yr)


# movie_to_id = {m: i for i, m in enumerate(list(award_reviews.columns[2:]) + list(non_award_reviews.columns[2:]))}
# id_to_movie = {i: m for m, i in movie_to_id.items()}
def combine_reviews(award_reviews: pd.DataFrame, non_award_reviews: pd.DataFrame) -> pd.DataFrame:
    # set a stable MultiIndex on (user_id, username); drop the 2 meta columns from data
    aw = award_reviews.set_index(list(award_reviews.columns[:2]))
    naw = non_award_reviews.set_index(list(non_award_reviews.columns[:2]))

    # align to the union of users
    idx = aw.index.union(naw.index)
    aw, naw = aw.reindex(idx), naw.reindex(idx)

    # combined z-score matrix: rows = users, cols = all movies (winners + non-winners)
    return pd.concat([aw, naw], axis=1)

y = {movie : 1 for movie in award_reviews.columns[2:]} | {movie : 0 for movie in non_award_reviews.columns[2:]}
reviews_z = combine_reviews(award_reviews, non_award_reviews)   # cols = all movies


In [3]:
reviews_z

Unnamed: 0_level_0,Unnamed: 1_level_0,anora,conclave,the-brutalist,emilia-perez,nickel-boys,wicked-2024,maria-2024,oppenheimer-2023,poor-things-2023,the-holdovers,...,hell-or-high-water,nocturnal-animals,the-lobster,deadpool,the-jungle-book-2016,captain-fantastic,ant-man,carol-2015,steve-jobs,star-wars-the-force-awakens
user_id,username,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,14ltobias,,,,,,,,,,,...,,,,,,,,,,
1,1godfella,,1.225216,,,,,,0.670104,,1.225216,...,,,,,,,,,,
2,__matheus__,1.437797,0.645231,,-1.732464,,1.437797,-0.147334,,,,...,,,,,,,,,,
3,_isocertified,1.049219,1.049219,,,,1.526416,,1.049219,,1.526416,...,,,,,,,,,,
4,_lydiogames,-0.809676,,,,,-0.300111,,0.719020,,0.719020,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
638,zascmo,1.913259,-0.197550,,-1.252954,1.385557,0.330153,0.330153,1.913259,0.330153,1.385557,...,,,,,,,,,,
639,zoelovesfilm,-1.382719,0.266862,,,,1.366583,,-0.832859,0.266862,-0.282998,...,,,,,,,,,,
640,zoerosebryant,1.720830,0.788061,,,0.788061,1.254446,-0.611092,1.254446,1.720830,1.720830,...,,,,1.254446,0.788061,,0.321677,,,
641,zrshelton,1.377239,1.377239,,-1.024150,1.777470,0.576776,0.176544,,1.777470,0.576776,...,,,,,,,,,,


In [4]:


def build_bag_of_users(reviews_z: pd.DataFrame, movies, y: dict):
    #returns
    #X_x is array with each row a user, and column a z-score of movie in predetermined order
    #X_m is the mask for missing entries
    #y is the label for awarded or not

    missing = [m for m in movies if m not in reviews_z.columns]
    if missing:
        raise ValueError(f"Movies missing from reviews_z: {missing}")
    
    Z = reviews_z[movies].astype("float32").to_numpy(copy=True)     #(number of users, number of all movies)
    M = ~np.isnan(Z)
    X = np.where(M, Z, 0.0).astype("float32")
    return X.T.astype("float32"), M.T.astype("float32"), np.array([int(y[m]) for m in movies], dtype=np.int64), movies


def pack(split_movies):
    X_x, X_m, y_vec, movies = build_bag_of_users(reviews_z, split_movies, y)
    return {"X_x": X_x, "X_m": X_m, "y": y_vec, "movies": movies}


Dtr, Dva, Dte = pack(train_movies), pack(val_movies), pack(test_movies)



In [5]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.preprocessing import StandardScaler


# scale training data only to 0 mean variance 1 because mlp works better
def pack_X(Dtr, Dva, scale_x=True):
    Xtr_x, Xtr_m = Dtr["X_x"], Dtr["X_m"]
    Xva_x, Xva_m = Dva["X_x"], Dva["X_m"]
    if scale_x:
        sc = StandardScaler().fit(Xtr_x)          # fit on Train only
        Xtr_x, Xva_x = sc.transform(Xtr_x), sc.transform(Xva_x)
    Xtr = np.concatenate([Xtr_x, Xtr_m], axis=1).astype(np.float32)
    Xva = np.concatenate([Xva_x, Xva_m], axis=1).astype(np.float32)
    return Xtr, Xva

Xtr, Xva = pack_X(Dtr, Dva, scale_x=True)

Xte = np.concatenate([Dte["X_x"], Dte["X_m"]], axis=1).astype(np.float32)
ytr, yva, yte = Dtr["y"].astype(int), Dva["y"].astype(int), Dte["y"].astype(int)

#train
clf = MLPClassifier(
    hidden_layer_sizes=(128,), 
    random_state=0,
    max_iter=1000,
    learning_rate_init=1e-3,
    alpha=1e-4,
    early_stopping=False,
    shuffle=True,
    verbose=False,
    )
clf.fit(Xtr, ytr)

#validate
p_val = clf.predict_proba(Xva)[:, 1]
aupr_val = average_precision_score(yva, p_val)
auroc_val = roc_auc_score(yva, p_val) if len(np.unique(yva)) == 2 else float("nan")
print(f"[VAL] AUPR={aupr_val:.3f} | AUROC={auroc_val:.3f}")

#test
p_test = clf.predict_proba(Xte)[:, 1]
aupr_test = average_precision_score(yte, p_test)
auroc_test = roc_auc_score(yte, p_test) if len(np.unique(yte)) == 2 else float("nan")
print(f"[TEST] AUPR={aupr_test:.3f} | AUROC={auroc_test:.3f}")

[VAL] AUPR=0.727 | AUROC=0.626
[TEST] AUPR=0.871 | AUROC=0.727


In [6]:
import numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score, log_loss, brier_score_loss, confusion_matrix

# Threshold on Val by max-F1
prec, rec, thr = precision_recall_curve(yva, p_val)
f1 = 2*prec*rec/(prec+rec+1e-12)
best = int(np.nanargmax(f1))
thr_star = thr[best] if best < len(thr) else 0.5

# Apply to Test
yhat = (p_test >= 0.5).astype(int)
tn, fp, fn, tp = confusion_matrix(yte, yhat).ravel()
acc  = (tp + tn) / (tp + tn + fp + fn)
prec_t = tp / (tp + fp) if (tp+fp) else 0.0
rec_t  = tp / (tp + fn) if (tp+fn) else 0.0
f1_t   = 2*prec_t*rec_t/(prec_t+rec_t+1e-12) if (prec_t+rec_t) else 0.0

print(f"[TEST @thr={thr_star:.4f}] correct={tp+tn}/{tp+tn+fp+fn} | acc={acc:.3f} | P={prec_t:.3f} R={rec_t:.3f} F1={f1_t:.3f}")
print(f"Confusion: TP={tp} FP={fp} TN={tn} FN={fn}")

[TEST @thr=0.4520] correct=12/17 | acc=0.706 | P=0.688 R=1.000 F1=0.815
Confusion: TP=11 FP=5 TN=1 FN=0


In [7]:
# Build a dataframe of test predictions
df_test_preds = pd.DataFrame({
    "movie": Dte["movies"],              # list of test movie names aligned with rows
    "y_true": yte.astype(int),           # ground-truth label 0/1
    "p_hat": p_test.astype(float),       # predicted probability of winning
})

# Predicted class at the chosen threshold
df_test_preds["y_pred"] = (df_test_preds["p_hat"] >= thr_star).astype(int)

# Predicted class at the chosen threshold
df_test_preds["Result"] = np.where(
    df_test_preds["y_true"] == df_test_preds["y_pred"],
    "Correct",
    "Wrong",
)

# Sort for display
df_test_preds = df_test_preds.sort_values("p_hat", ascending=False).reset_index(drop=True)

print(df_test_preds)  # peek at top 10

                     movie  y_true     p_hat  y_pred   Result
0         oppenheimer-2023       1  0.999657       1  Correct
1            the-holdovers       1  0.999283       1  Correct
2                 conclave       1  0.999157       1  Correct
3              wicked-2024       1  0.996949       1  Correct
4         poor-things-2023       1  0.996491       1  Correct
5         american-fiction       1  0.995987       1  Correct
6            dune-part-two       0  0.995797       1    Wrong
7            the-substance       0  0.995001       1    Wrong
8                    anora       1  0.982810       1  Correct
9   furiosa-a-mad-max-saga       0  0.977468       1    Wrong
10             a-real-pain       0  0.864265       1    Wrong
11             nickel-boys       1  0.848371       1  Correct
12              maria-2024       1  0.788987       1  Correct
13            emilia-perez       1  0.721089       1  Correct
14           the-brutalist       1  0.621125       1  Correct
15      