In [None]:
import numpy as np
import pandas as pd


# Load the raw table and drop exact duplicate rows (original index labels are kept).
df_og = pd.read_csv('user_distribution_and_reviews.csv').drop_duplicates()
# Coerce every column except 'username' to numeric; any entry that cannot be
# parsed (the original note mentions late and missing reviews) becomes NaN.
cols = df_og.columns.difference(['username'])
df_og[cols] = df_og[cols].apply(pd.to_numeric, errors='coerce')

# df = the first 11 columns of df_og, expected to be [username, '0.5', ..., '5.0'].
# Take an explicit copy: df gets new columns and is re-sorted further down, and
# doing that on a bare .iloc slice of df_og triggers SettingWithCopyWarning and
# leaves it ambiguous whether df_og is affected.
df = df_og.iloc[:, :11].copy()

# Per-user mean and sample standard deviation, computed from the rating histogram.
values = np.arange(0.5, 5.5, 0.5)      # rating levels 0.5, 1.0, ..., 5.0
count_cols = list(map(str, values))    # histogram column labels '0.5', '1.0', ..., '5.0'
vals = pd.Series(values, index=count_cols)
counts = df[count_cols]
n = counts.sum(axis=1)                 # total of the histogram counts per user
df['mean'] = counts.dot(vals) / n
m2 = counts.dot(vals ** 2) / n         # second raw moment
var_pop = m2 - df['mean'] ** 2         # population variance
# n / (n - 1) rescales the population variance to the sample variance.
df['std'] = np.sqrt(var_pop * n / (n - 1))

# Harshness, discrete and continuous.
# Rank users from highest to lowest mean, cut the rank positions into 10
# equal-sized bins (0 = highest means, 9 = lowest), then restore index order.
ranked = df.sort_values('mean', ascending=False)
ranked['harshness_d'] = pd.qcut(range(len(ranked)), q=10, labels=False)
ranked['harshness_c'] = -ranked['mean']
df = ranked.sort_index()

# Add one z-score column per movie to df: (rating - user mean) / user std.
# NOTE(review): df uses columns 0-10 of df_og and movies start at column 12,
# so column 11 is used by neither -- confirm that is intentional.
movies = list(df_og.columns[12:])
ratings = df_og.loc[:, movies].apply(pd.to_numeric, errors='coerce')
centered = ratings.sub(df['mean'], axis='index')
Z = centered.div(df['std'], axis='index').add_prefix('z_')
df = pd.concat([df, Z], axis='columns')

# Lists of awarded movies by year: movie_lists[20] is the list of movies whose
# 'yy' is 20. movie_yr[movie] is the two-digit (int) year that movie was awarded.
df_movies = pd.read_csv('awarded_movie_date.csv')
# 'yy' = last two characters of first_award_date, stored as int.
df_movies['yy'] = df_movies['first_award_date'].astype(str).str.strip().str[-2:].astype(int)
# Compare against the int yy: 'yy' is an int column, so comparing it with
# str(yy) never matches and would leave every list empty.
movie_lists = {yy: df_movies.loc[df_movies['yy'] == yy, 'movie'].tolist() for yy in range(15, 26)}
movie_yr = dict(zip(df_movies['movie'], df_movies['yy']))



FileNotFoundError: [Errno 2] No such file or directory: 'user_distribution_and_reviews.csv'

In [None]:
# Build one long-format review table per award year (15..25).
# R[yy] gets one row per (user, awarded movie) pair with a non-missing rating,
# where yy = movie_yr[movie]. Each row carries the user's z-score for that
# movie and the user's discrete / continuous harshness.
# The original note describes these tables as the basis for generating
# synthetic non-award-winning movies, where a synthetic copy of movie m keeps
# the same number of reviews from the collected users.
award_movies = list(df_movies['movie'])

# One row per username (first occurrence, as the earlier `.iloc[0]` lookups
# used), indexed by username so each value is a direct `.at` lookup instead of
# a boolean scan over the whole frame for every single rating.
user_stats = df.drop_duplicates(subset='username').set_index('username')

R = {i: [] for i in range(15, 26)}
for user, row in df_og.set_index('username')[award_movies].iterrows():
    harshness_c = user_stats.at[user, 'harshness_c']
    harshness_d = user_stats.at[user, 'harshness_d']
    for movie, rating in row.items():
        if np.isnan(rating):
            continue
        z_score = user_stats.at[user, f'z_{movie}']
        # Value order must match `header` below; previously z_score and
        # harshness_c were appended in each other's position and ended up
        # stored under the wrong column names.
        R[movie_yr[movie]].append([user, movie, rating, harshness_c, harshness_d, z_score])

header = ['username', 'movie', 'rating', 'harshness_c', 'harshness_d', 'z_score']
R = {yy: pd.DataFrame(R[yy], columns=header) for yy in range(15, 26)}

In [None]:
# Number of rows (non-missing reviews) in the year-20 table.
R[20].shape[0]

1133

In [None]:
from itertools import product
_YEARS = range(15, 26)
_DECILES = range(10)
_CELL_COLS = ['username', 'movie', 'z_score', 'harshness_d', 'harshness_c']

# p[y]: share of year-y reviews in each harshness decile 0..9
# (deciles with no reviews are filled with 0.0).
p = {
    y: R[y]['harshness_d'].value_counts(normalize=True).reindex(_DECILES, fill_value=0.0)
    for y in _YEARS
}
# C[(y, d)]: independent copy of the year-y reviews whose reviewer is in decile d.
C = {
    (y, d): R[y].loc[R[y]['harshness_d'] == d, _CELL_COLS].copy()
    for y, d in product(_YEARS, _DECILES)
}
def excluded(C, y, d, m):
    """
    Return a copy of the (y, d) cell of C without the rows for movie m.

    C: dict mapping (year, decile) -> DataFrame with a 'movie' column.
    y, d: key of the cell to read.
    m: movie whose rows are left out.
    The frame stored in C is not modified.
    """
    cell = C[(y, d)]
    keep = cell['movie'] != m
    return cell[keep].copy()


# n[(y, m)]: number of rows for movie m in the year-y review table.
# NOTE(review): this rebinds `n`, which the first cell used for the per-user
# histogram totals -- re-running earlier cells after this one changes what `n` is.
_movie_counts = {y: R[y]['movie'].value_counts() for y in range(15, 26)}
n = {
    (y, m): cnt
    for y, per_movie in _movie_counts.items()
    for m, cnt in per_movie.items()
}


In [None]:
def largest_remainder_quota(p_y: pd.Series, n_target: int) -> pd.Series:
    """
    Apportion n_target items across deciles 0..9 with the largest-remainder method.

    p_y: Series of year-level decile proportions (sum approximately 1);
         it is reindexed to 0..9 and missing deciles count as 0.0.
    n_target: n_{y,m} for this shell (int).
    Returns a Series of int quotas per decile. Each decile first gets
    floor(p * n_target); the units still missing are then handed out one each
    to the deciles with the largest fractional remainders, ties going to the
    lower decile.
    """
    shares = p_y.reindex(range(10), fill_value=0.0)
    ideal = shares * n_target
    quota = np.floor(ideal).astype(int)

    shortfall = int(n_target - quota.sum())
    if shortfall > 0:
        fractional = (ideal - quota).to_numpy()
        # A stable sort on the negated remainders orders deciles by remainder
        # descending and keeps ascending decile order among equal remainders.
        ranked = np.argsort(-fractional, kind='stable')
        quota.iloc[ranked[:shortfall]] += 1
    return quota
# Example: per-decile quotas for the movie 'anora' in year 25 (displayed as the cell output).
largest_remainder_quota(p[25], n[(25, 'anora')])


harshness_d
0    38
1    35
2    39
3    39
4    41
5    42
6    37
7    41
8    33
9    39
Name: proportion, dtype: int64