In [7]:
import pandas as pd
import numpy as np
from crowdkit.aggregation import BradleyTerry
from scipy import stats

import random
import itertools
from tqdm import tqdm
import matplotlib.pyplot as plt


In [8]:
# IMDb rating for each movie in the comparison set, as entered by hand.
_imdb_by_title_unsorted = dict([
    ('Interstellar', 8.7),
    ('Cloud Atlas', 7.4),
    ('Rogue One: A Star Wars Story', 7.8),
    ('I Am Legend', 7.2),
    ('No Country for Old Men', 8.2),
    ('The Martian', 8.0),
    ('Shutter Island', 8.2),
    ('The Truman Show', 8.2),
    ('The Curious Case of Benjamin Button', 7.8),
    ('The Pursuit of Happyness', 8.0),
])

# Re-key in alphabetical title order so that later cells, which sort their
# own results by title, line up with this dict position by position.
movies_imdb = {
    title: _imdb_by_title_unsorted[title]
    for title in sorted(_imdb_by_title_unsorted)
}
movies_imdb


{'Cloud Atlas': 7.4,
 'I Am Legend': 7.2,
 'Interstellar': 8.7,
 'No Country for Old Men': 8.2,
 'Rogue One: A Star Wars Story': 7.8,
 'Shutter Island': 8.2,
 'The Curious Case of Benjamin Button': 7.8,
 'The Martian': 8.0,
 'The Pursuit of Happyness': 8.0,
 'The Truman Show': 8.2}

In [9]:
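### Preliminary ranking: draw PAIRS_NUM random movie pairs to be labelled by hand.
# NOTE(review): `movies` below is not defined anywhere in the visible notebook;
# presumably `movies_imdb` was meant. Confirm before re-enabling this cell.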

# PAIRS_NUM = 40

# movie_pairs = list(itertools.combinations(movies,2))
# movie_pairs = random.sample(movie_pairs, k=PAIRS_NUM)


# df = pd.DataFrame(movie_pairs)
# df['My choice'] = [0] * len(df)

# df


In [10]:
# for i, row in df.iterrows():
#     while True:
#         my_choice = int(input(f"(0): {row[0]:40} \t or \t (1): {row[1]}\n"))
#         if my_choice in [0,1]: break
        
#     df.loc[i, 'My choice'] = my_choice

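# NOTE(review): the frame saved here has columns 0, 1 and 'My choice', while the
# my_rankings.csv that is loaded afterwards has 'left', 'right' and 'label'.
# The conversion between the two layouts is not shown in this notebook.
# df.to_csv('my_rankings.csv')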
# df


In [11]:
# Load the hand-labelled pairwise comparisons. Each row holds the two movies
# that were compared (`left`, `right`) and `label`, which repeats the title of
# whichever of the two was chosen.
df = pd.read_csv('my_rankings.csv', index_col=0)

# Fail fast, with a readable message, if the file does not have the pairwise
# layout the Bradley-Terry fit is given further down.
_missing_columns = {'left', 'right', 'label'} - set(df.columns)
assert not _missing_columns, (
    f"my_rankings.csv is missing required columns: {sorted(_missing_columns)}"
)

df

Unnamed: 0,left,right,label
0,No Country for Old Men,The Pursuit of Happyness,The Pursuit of Happyness
1,I Am Legend,Shutter Island,Shutter Island
2,I Am Legend,The Martian,I Am Legend
3,Cloud Atlas,The Pursuit of Happyness,Cloud Atlas
4,Cloud Atlas,I Am Legend,Cloud Atlas
5,Interstellar,Rogue One: A Star Wars Story,Interstellar
6,The Martian,Shutter Island,Shutter Island
7,Shutter Island,The Curious Case of Benjamin Button,The Curious Case of Benjamin Button
8,Rogue One: A Star Wars Story,The Pursuit of Happyness,Rogue One: A Star Wars Story
9,Interstellar,Cloud Atlas,Interstellar


In [12]:
# Fit a Bradley-Terry model on all pairwise comparisons; the result is
# converted to a plain dict keyed by movie title.
bt_scores = BradleyTerry(n_iter=100).fit_predict(df)
bt_score_by_title = dict(bt_scores)

# Alphabetical title order, the same ordering used for `movies_imdb`.
agg_bt = {title: bt_score_by_title[title] for title in sorted(bt_score_by_title)}

agg_bt


{'Cloud Atlas': 0.21294335437769302,
 'I Am Legend': 0.07125987497629856,
 'Interstellar': 0.2659010917711591,
 'No Country for Old Men': 0.026905942993819393,
 'Rogue One: A Star Wars Story': 0.1114410266845456,
 'Shutter Island': 0.10098147201177565,
 'The Curious Case of Benjamin Button': 0.13475588345636055,
 'The Martian': 0.033752746239306144,
 'The Pursuit of Happyness': 0.026905942993819393,
 'The Truman Show': 0.015152664495222573}

In [28]:
# IMDb ratings as a vector, in the (alphabetical) key order of `movies_imdb`.
imdb_ratings = np.array([movies_imdb[title] for title in movies_imdb])
imdb_ratings


array([7.4, 7.2, 8.7, 8.2, 7.8, 8.2, 7.8, 8. , 8. , 8.2])

In [29]:
# Bradley-Terry scores as a vector aligned with `imdb_ratings`.
# Scores are looked up by title in `movies_imdb` order rather than taken
# positionally from `agg_bt`, so a movie missing from the fit raises a KeyError
# here instead of silently pairing scores with the wrong IMDb ratings.
agg_bt_ratings = np.array([agg_bt[title] for title in movies_imdb])
agg_bt_ratings


array([0.21294335, 0.07125987, 0.26590109, 0.02690594, 0.11144103,
       0.10098147, 0.13475588, 0.03375275, 0.02690594, 0.01515266])

In [30]:
# Spearman rank correlation between the IMDb ratings and the Bradley-Terry
# scores; both vectors are expected to follow the same title-sorted movie order.
stats.spearmanr(a=imdb_ratings, b=agg_bt_ratings)

SpearmanrResult(correlation=-0.18887712050782354, pvalue=0.6012580954612852)

## Bootstrapping

In [61]:
import warnings
warnings.filterwarnings("ignore")


In [81]:
# Bootstrap the Bradley-Terry scores: refit the model on resampled sets of
# comparisons to see how much each movie's score varies.
BOOTSTRAP_ROUNDS = 1000
BOOTSTRAP_SEED = 0  # fixed so that Restart & Run All draws the same resamples
# Only resamples that mention every movie are kept, so the number of fits is not
# known up front. Cap it so unsuitable input fails loudly instead of looping forever.
MAX_BOOTSTRAP_ATTEMPTS = 100 * BOOTSTRAP_ROUNDS

bootstrap_rng = np.random.RandomState(BOOTSTRAP_SEED)
bootstrap_rows = []
bootstrap_attempts = 0

while len(bootstrap_rows) < BOOTSTRAP_ROUNDS:
    if bootstrap_attempts >= MAX_BOOTSTRAP_ATTEMPTS:
        raise RuntimeError(
            f"Only {len(bootstrap_rows)} of {BOOTSTRAP_ROUNDS} bootstrap rounds "
            f"covered every movie after {bootstrap_attempts} resamples"
        )
    bootstrap_attempts += 1

    # Resample the comparisons with replacement, keeping the original sample size.
    df_sample = df.sample(len(df), replace=True, random_state=bootstrap_rng)

    # Use a separate name: assigning to `agg_bt` here would overwrite the
    # full-data fit that earlier cells read.
    sample_scores = dict(BradleyTerry(n_iter=100).fit_predict(df_sample))

    # Keep the round only if the fit has a score for every movie (a resample can
    # leave some movies out entirely). Scores are stored in `movies_imdb` order.
    if sample_scores.keys() == movies_imdb.keys():
        bootstrap_rows.append([sample_scores[title] for title in movies_imdb])

# One row per movie (in `movies_imdb` order), one column per bootstrap round.
bs_res = pd.DataFrame(bootstrap_rows).T


In [82]:
# Bootstrap scores: each accepted round was appended as one list of per-movie
# scores and the frame was then transposed, so rows are movies (in title-sorted
# order) and columns are bootstrap rounds.
bs_res

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.219276,0.247112,0.246506,0.147099,0.198483,0.233975,0.196349,0.207908,0.198093,0.216576,...,0.141931,0.26223,0.212096,0.193242,0.201387,0.21608,0.230403,0.21775,0.196947,0.233205
1,0.106653,0.132369,0.12378,0.031414,0.120478,0.070952,0.107341,0.081187,0.041578,0.115603,...,0.088655,0.069216,0.049287,0.097105,0.04744,0.068055,0.040193,0.061005,0.084679,0.031391
2,0.226305,0.225224,0.238322,0.327967,0.219652,0.286723,0.242139,0.233428,0.234507,0.232642,...,0.229136,0.26025,0.28146,0.224043,0.262895,0.257694,0.280641,0.269811,0.280492,0.243395
3,0.02074,0.056359,0.055812,0.031915,0.016823,0.012421,0.02851,0.031765,0.060502,0.01205,...,0.009906,0.063407,0.0,0.096698,0.030115,0.022485,0.035723,0.059113,0.02706,0.060279
4,0.191601,0.090483,0.119506,0.02512,0.034577,0.153817,0.097882,0.109234,0.047723,0.077278,...,0.239614,0.10109,0.104738,0.069839,0.23902,0.142713,0.183701,0.131559,0.136912,0.108362
5,0.113454,0.037392,0.026488,0.127575,0.151453,0.079182,0.078002,0.069565,0.233174,0.089808,...,0.072514,0.081661,0.05545,0.120055,0.092552,0.086231,0.100534,0.077672,0.07799,0.126337
6,0.075837,0.125234,0.088063,0.174697,0.166996,0.127159,0.220228,0.128877,0.137016,0.132695,...,0.141135,0.058029,0.201395,0.128336,0.126591,0.131392,0.063425,0.170743,0.147956,0.109185
7,0.0,0.045767,0.012782,0.025395,0.051171,0.0,0.011198,0.066609,0.047406,0.095287,...,0.067171,0.034533,0.087521,0.052088,0.0,0.038012,0.049959,0.0,0.030271,0.026679
8,0.046132,0.014136,0.066151,0.085339,0.0,0.035772,0.0,0.071428,0.0,0.02806,...,0.009937,0.049503,0.008054,0.00855,0.0,0.037338,0.01542,0.0,0.017694,0.026682
9,0.0,0.025923,0.02259,0.023478,0.040366,0.0,0.018352,0.0,0.0,0.0,...,0.0,0.020081,0.0,0.010045,0.0,0.0,0.0,0.012346,0.0,0.034486


In [98]:
# 95% bootstrap percentile interval of the Bradley-Terry score for each movie.
# `bs_res` has one row per movie (in `movies_imdb` order) and one column per
# bootstrap round, so the percentiles are taken along axis 1.
# Computing both bounds in one vectorised call and building the frame from the
# finished arrays avoids filling integer-initialised columns with floats one
# cell at a time, which newer pandas versions flag as an incompatible-dtype
# assignment.
lower_bounds, upper_bounds = np.percentile(bs_res.to_numpy(), [2.5, 97.5], axis=1)

bs_df = pd.DataFrame({
    'Movie': list(movies_imdb),
    '2.5% percentile': lower_bounds,
    '97.5% percentile': upper_bounds,
})

bs_df

Unnamed: 0,Movie,2.5% percentile,97.5% percentile
0,Cloud Atlas,0.125285,0.272812
1,I Am Legend,0.0,0.162755
2,Interstellar,0.208704,0.311784
3,No Country for Old Men,0.0,0.082473
4,Rogue One: A Star Wars Story,0.024839,0.229733
5,Shutter Island,0.032461,0.192477
6,The Curious Case of Benjamin Button,0.049622,0.224603
7,The Martian,0.0,0.124269
8,The Pursuit of Happyness,0.0,0.080112
9,The Truman Show,0.0,0.064776
