# **Mount Google Drive**


In [14]:
# Mount Google Drive at /content/drive; the CSV files read further down are
# addressed by paths under this mount point. Requires the google.colab module.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Setup: Imports**


In [15]:
from sklearn.isotonic import isotonic_regression
import numpy as np
import pandas as pd
import matplotlib
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import statistics
import scipy.stats as stats
import seaborn as sns


# **Load CSV file into a pandas DataFrame**

In [16]:
# Load the raw review table from Drive.
df_review = pd.read_csv('/content/drive/MyDrive/Research/ICML_2023_Raw/Final_Datafile/review_daily_anon_2023_0422.csv')

# 'rating' and 'confidence' are stored as text. Keep only the leading run of
# digits of each value and convert it to float; a value that does not start
# with a digit becomes NaN.
for score_column in ('rating', 'confidence'):
    leading_digits = df_review[score_column].str.extract(r'^(\d+)', expand=False)
    df_review[score_column] = leading_digits.astype(float)



# Load the proxy-score table and keep the first row for every
# (submission_idx, author_idx) pair.
# NOTE(review): de-duplication keys on the *_idx columns while every later
# lookup uses submission_id / author_id -- confirm both pairs identify the
# same rows.
df = pd.read_csv(r'/content/drive/MyDrive/Research/ICML_2023_Result/proxy_score.csv')
df = df.drop_duplicates(['submission_idx', 'author_idx'])

# Sanity check: print every submission that has at most one row in df_review
# (nothing is printed when all submissions have at least 2 reviews).
# Row counts per submission are computed once up front instead of re-filtering
# the whole review table for each submission. A plain dict is used for the
# lookup so that ids missing from df_review simply count as 0.
review_counts = df_review['submission_id'].value_counts().to_dict()
for submission in df['submission_id'].unique():
    if review_counts.get(submission, 0) <= 1:
        print(submission)


# Distinct submission and author ids, in order of first appearance in df.
final_submission_list = df['submission_id'].unique()
authors = df['author_id'].unique()

# Load the aggregated author responses and keep only the rows in which a
# "most unexpected scores" submission is recorded.
df_expect = pd.read_csv(
    r'/content/drive/MyDrive/Research/ICML_2023_Data/Final_Datafile/aggregated_results_anonymized.csv'
).dropna(subset=['submission_id_with_most_unexpected_scores'])


# all authors providing unexpected submissions:
# the members of `authors` that also occur in df_expect, in the order of `authors`.
reported_author_ids = df_expect['author_id'].unique()
author_unexpect = [author for author in authors if author in reported_author_ids]



# **Record Residual, Confidence and Variance**

In [17]:
# Organize all the submissions by {author: [(submission, rank, score), ...]}.
# `rank` and `score` (column 'rating_0422_mean') are taken from the first row
# of df that matches the (submission, author) pair.
author_submission_rank_old = {}
for author in authors:
    author_submission_rank_old[author] = []
    # Narrow df to this author once, rather than re-scanning the whole frame
    # twice for every submission.
    author_rows = df[df['author_id'] == author]
    for submission in author_rows['submission_id'].tolist():
        pair_rows = author_rows[author_rows['submission_id'] == submission]
        rank = pair_rows['rank'].tolist()[0]
        ratings = pair_rows['rating_0422_mean'].tolist()[0]
        author_submission_rank_old[author].append((submission, rank, ratings))



# Sort submissions by rank; in case of ties, sort by score.
def sort_submissions(author_submission_rank_old):
    """Order every author's entries by ascending rank, ties by descending score.

    Parameters
    ----------
    author_submission_rank_old : dict
        Maps an author to a list of entries whose index 1 is the rank and
        index 2 is the score.

    Returns
    -------
    dict
        The same dict object that was passed in; each list is sorted in place.
    """
    def rank_then_higher_score(entry):
        # Negating the score makes the larger score come first among equal ranks.
        return (entry[1], -entry[2])

    for entries in author_submission_rank_old.values():
        entries.sort(key=rank_then_higher_score)
    return author_submission_rank_old
author_submission_rank_old = sort_submissions(author_submission_rank_old)



# Compute isotonic scores for each author.
# For every author, the scores are taken in the sorted order above and fitted
# with a non-increasing isotonic regression whose values are bounded to
# [0.0, 10.0]. The result is stored as {(author, submission): fitted score}.
author_submission_new = {}
for author, ranked_entries in author_submission_rank_old.items():
    observed_scores = np.array([entry[2] for entry in ranked_entries])
    fitted_scores = isotonic_regression(
        observed_scores,
        sample_weight=None,
        y_min=0.0,
        y_max=10.0,
        increasing=False,
    )

    for entry, fitted in zip(ranked_entries, fitted_scores):
        author_submission_new[(author, entry[0])] = fitted



# record average score
# Flatten {author: [(submission, rank, score), ...]} into
# {(author, submission): score}.
author_submission_old = {
    (author, entry[0]): entry[2]
    for author, entries in author_submission_rank_old.items()
    for entry in entries
}



# record the residual
# For each (author, submission) pair of the authors in author_unexpect, store
# the absolute difference between the recorded score (author_submission_old)
# and the isotonic score (author_submission_new).
residual = {}
for author in author_unexpect:
    for submission in df.loc[df['author_id'] == author, 'submission_id'].tolist():
        pair = (author, submission)
        residual[pair] = abs(author_submission_old[pair] - author_submission_new[pair])



# record confidence
# For each (author, submission) pair of the authors in author_unexpect, store
# the mean of the 'confidence' values of all df_review rows for that submission.
confidence = {}
for author in author_unexpect:
    own_submissions = df.loc[df['author_id'] == author, 'submission_id'].tolist()
    for submission in own_submissions:
        reviewer_confidences = df_review.loc[
            df_review['submission_id'] == submission, 'confidence'
        ].tolist()
        # np.mean propagates NaN: one NaN confidence makes the mean NaN.
        confidence[(author, submission)] = np.mean(reviewer_confidences)



# record variance
# For each (author, submission) pair of the authors in author_unexpect, store
# np.var (default ddof=0, i.e. population variance) of the 'rating' values of
# all df_review rows for that submission.
variance = {}
for author in author_unexpect:
    authored = df['author_id'] == author
    for submission in df.loc[authored, 'submission_id'].tolist():
        reviewed = df_review['submission_id'] == submission
        review_ratings = df_review.loc[reviewed, 'rating'].tolist()
        # np.var propagates NaN: one NaN rating makes the variance NaN.
        variance[(author, submission)] = np.var(review_ratings)

# **Count prediction accuracy**

In [18]:
# Count, over the authors in author_unexpect whose reported "most unexpected"
# submission is one of their own submissions in df, how often that submission
# also has
#   - the largest residual,
#   - the largest rating variance,
#   - the smallest mean confidence
# among the author's submissions. Comparisons are non-strict, so ties count
# as a hit; any comparison involving NaN is False and therefore a miss.
residual_count = variance_count = confidence_count = total_authors = 0
for author in author_unexpect:
    submission_list = df[df['author_id'] == author]['submission_id'].tolist()
    # First reported value for this author in df_expect.
    unexpect_submission = df_expect[df_expect['author_id'] == author]['submission_id_with_most_unexpected_scores'].tolist()[0]
    if unexpect_submission not in submission_list:
        continue

    total_authors += 1
    flagged = (author, unexpect_submission)
    own_pairs = [(author, submission) for submission in submission_list]

    if all(residual[flagged] >= residual[pair] for pair in own_pairs):
        residual_count += 1
    if all(variance[flagged] >= variance[pair] for pair in own_pairs):
        variance_count += 1
    if all(confidence[flagged] <= confidence[pair] for pair in own_pairs):
        confidence_count += 1


print("# authors with unexpected submissions:", total_authors)
print("# authors with unexpected submissions that have the largest mean residual:", residual_count)
print("# authors with unexpected submissions that have the largest variance:", variance_count)
print("# authors with unexpected submissions that have the least confidence:", confidence_count)

# authors with unexpected submissions: 322
# authors with unexpected submissions that have the largest mean residual: 254
# authors with unexpected submissions that have the largest variance: 162
# authors with unexpected submissions that have the least confidence: 136
