# Codeforces blog post

** TO DO **
* Improve the problem-data function so it doesn't make 600+ API calls on every run; it should only update when needed
* Figure out where to store the problem data function

Figures list
* **DONE** Trend in problem ELOs over time
* **DONE** Distribution of all problem ELOs in D1 and D2
* **DONE** Correlation between problem ELO and scores
* Correlation between D1 and D2 scores for duplicated problems
* Hardest and easiest problem writers


In [601]:
%matplotlib inline
%load_ext rpy2.ipython

import requests
import pandas as pd
import numpy as np
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt

from api_functions import getProblemDataFromContest
from api_functions import getContestList

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [48]:
# Grab a list of all problem data from currently rated contests
# ONLY RUN THIS WHEN IT NEEDS TO BE UPDATED
# getProblemDataFromContest is called once per distinct contest ID, so a full
# refresh issues one call per contest.
# NOTE(review): df_problems is read here but is only built further down in the
# notebook (from problem_ratings.csv / problem_data.csv) -- on a fresh kernel
# this cell cannot run first; confirm where the contest IDs should come from.
dflist = []
for contestID in set(df_problems.contestID):
    contestProblems = getProblemDataFromContest(contestID)
    contestProblems = contestProblems.rename(index=str, columns={'index': 'problemID'})
    # Name the axis explicitly: the positional form drop(label, 1) was
    # deprecated and later removed from pandas.
    contestProblems = contestProblems.drop('contestId', axis=1)
    dflist.append(contestProblems)


# One combined table for all contests, cached on disk without the row index.
problemData = pd.concat(dflist)
problemData.to_csv('problem_data.csv', encoding='utf-8', index=False)

In [560]:
# Load the cached tables. pd.read_csv replaces DataFrame.from_csv, which was
# deprecated and then removed from pandas; with index_col=None both read the
# file with a default integer index.
df_problem_ratings = pd.read_csv('problem_ratings.csv')
df_problem_info = pd.read_csv('problem_data.csv')

# No `on=` is given, so the merge joins on the columns the two tables share.
df_problems = pd.merge(df_problem_ratings, df_problem_info)
df_rhist = pd.read_csv('rating_histories.csv')
# Use the same 'contestID' spelling as the problem tables.
df_rhist = df_rhist.rename(columns={'contestId': 'contestID'})

In [747]:
# playing around with problem tags
# Builds df_tags: one row per (contest, problem, tag), merged with the problem
# ratings. problem_data.csv is parsed by hand here rather than with a CSV
# parser, so the column positions below depend on the exact file layout.
import re  # `import re` instead of `from re import compile`, which shadowed the builtin compile()

dict_tag = []

# Spans that may themselves contain commas -- parenthesised text, [...] lists
# and "quoted" fields -- are removed before the naive split on ','.
# (The original applied the parenthesis pattern twice; the second pass could
# never match anything new, so it is dropped.)
paren_span = re.compile(r'\(.+?\)')
bracket_span = re.compile(r'\[.+?\]')
quoted_span = re.compile(r'\".+?\"')

with open('problem_data.csv') as f:
    lines = f.readlines()

headers = lines[0].strip().split(',')
# Not used for indexing below; kept so a file without a 'tags' column fails
# here with a ValueError instead of producing garbage rows.
tag_idx = headers.index('tags')

for line in lines[1:]:
    stripped = line.strip()
    if not stripped:
        # Blank line (e.g. trailing newline at end of file): nothing to parse.
        continue

    sline = stripped
    for pattern in (paren_span, bracket_span, quoted_span):
        sline = pattern.sub('', sline)
    sline = sline.split(',')

    # NOTE(review): positions 0 / 2 / 5 are assumed to be the contestID,
    # division and problemID columns of problem_data.csv -- confirm against
    # the header row.
    contestID = sline[0]
    division = sline[2]
    problemID = sline[5]

    # The tag list is taken from the untouched line: it is the first [...]
    # that follows a comma, optionally wrapped in double quotes.
    if ',"[' in stripped:
        tags = stripped.split(',"[')[1]
    elif ',[' in stripped:
        tags = stripped.split(',[')[1]
    else:
        # No tag list on this line. Previously `tags` kept the value from the
        # preceding line (or was undefined on the first one); skip instead.
        continue
    tags = tags.split(']')[0]
    tags = tags.split(', ')

    for tag in tags:
        dict_tag.append(
            {
                'contestID': int(contestID),
                'problemID': problemID,
                'division': int(division),
                'tag': tag
            }
        )

df_tags = pd.DataFrame.from_dict(dict_tag)
# Attach ratings; the join uses whatever columns the two tables share.
df_tags = pd.merge(df_tags, df_problem_ratings)

# Some basic plots using df_rhist and df_problems 

In [870]:
%%R -i df_problems -i df_rhist -i df_tags
# [1] "contestID"        "problemID"        "problemRating"    "contestName"     
# [5] "division"         "name"             "points"           "startTimeSeconds"
# [9] "tags"             "type"   
library(ggplot2)
library(plotly)

df <- df_problems

# Division codes present in the data: 1, 2 and 12 (12 is what the later steps
# treat as the combined Div. 1 + Div. 2 contests).
df$division <- factor(df$division, levels = c(1, 2, 12))
# Bucket the assigned points into 500-wide intervals from 0 to 4000.
df$bin <- cut(df$points, seq(0, 4000, by = 500))
df$type <- 'other'
df$adjusted_elo <- df$problemRating

# annotate max and min problem ELOs
# Walk the row indices of each contest and update df directly, rather than
# mutating it from inside a tapply() callback through `<<-`.
for (idx in split(seq_len(nrow(df)), df$contestID)) {
    contest_ratings <- df$problemRating[idx]

    df$type[idx[contest_ratings == max(contest_ratings)]] <- 'hardest problem in contest'

    # Assigned second on purpose: when a problem is both the max and the min
    # (single-problem contest, or all ratings tied) it ends up as "easiest".
    # The label spelling must stay in sync with the keys of color_scale.
    df$type[idx[contest_ratings == min(contest_ratings)]] <- 'easiet problem in contest'

    # Rating relative to the easiest problem of the same contest.
    df$adjusted_elo[idx] <- df$adjusted_elo[idx] - min(df$adjusted_elo[idx])
}

# create dict of contest to division
# One row per contest, addressable by the contest ID as a character row name.
dict_contestID_division <- unique(df[, c('contestID', 'division')])
rownames(dict_contestID_division) <- as.character(dict_contestID_division$contestID)

# average user rating per contest
# (despite the name, the statistic is the median of newRating)
averageRating <- tapply(seq_len(nrow(df_rhist)), df_rhist$contestID, function(idx) {
    contest_id <- df_rhist[idx[1], 'contestID']

    data.frame(contestID = contest_id,
               averageRating = median(df_rhist[idx, 'newRating']),
               # NA when the contest has no entry in the problem table
               division = dict_contestID_division[as.character(contest_id), 'division'])
})
df_averageRating <- do.call(rbind, averageRating)

# filter out combined contests
# Contests whose division is unknown (NA) are dropped as well: comparing NA
# with 12 gives NA, and indexing rows with NA would insert all-NA rows.
keep_contest <- !is.na(df_averageRating$division) & df_averageRating$division != 12
df_averageRating <- df_averageRating[keep_contest, ]

#df_rhist$type <- 'other'
## annotate max and min user ratings per contest
#tapply(1:nrow(df_rhist), df_rhist$contestId, function(idx){
#    ismax <- df_rhist[idx, 'newRating'] == max(df_rhist[idx, 'newRating'])
#    df_rhist$type[idx[ismax]] <<- 'max'
#    
#    ismin <- df_rhist[idx, 'newRating'] == min(df_rhist[idx, 'newRating'])
#    df_rhist$type[idx[ismin]] <<- 'min'
#    
#    df_rhist$adjusted_elo[idx] <<- df_rhist$adjusted_elo[idx] - min(df_rhist$adjusted_elo[idx])
#})

# First character of the problem ID.
df$problemID_simple <- substr(df$problemID, 1, 1)

# Minimal theme: no grid, no panel/strip backgrounds, no facet strip text,
# legend underneath the plot.
blank_theme <- theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        strip.background = element_blank(),
        strip.text.x = element_blank(),
        legend.position = 'bottom')

# The former `df$divsion <- factor(df$division)` line is gone: it only created
# a misspelled column that nothing read, and df$division is already a factor.

# Problems with a known point value, with human-readable division labels.
df2 <- df[!is.na(df$points), ]
# Explicit code -> label lookup. The earlier chained gsub() calls reached the
# same labels only because each substitution also rewrote the previous output.
division_labels <- c('1' = 'Div. 1',
                     '2' = 'Div. 2',
                     '12' = 'Div. 1 + Div. 2 Combined')
df2$division <- factor(unname(division_labels[as.character(df2$division)]),
                       levels = c('Div. 2', 'Div. 1', 'Div. 1 + Div. 2 Combined'))
# Opacity of the rating-tier background bands. Defined here so this cell does
# not rely on the tag-plot cell having been run first.
bgalpha <- .2

# Build the nine translucent rating-tier rectangles used behind the plots.
#   rating_axis: 'y' when problem rating is on the y axis (bands are horizontal
#                stripes), 'x' when it is on the x axis (vertical stripes).
#   alpha:       fill opacity of every band.
# Returns a list of annotate("rect", ...) layers that can be added with `+`.
rating_band_layers <- function(rating_axis = c('y', 'x'), alpha = bgalpha) {
    rating_axis <- match.arg(rating_axis)
    band_lower <- c(1200, 1400, 1600, 1900, 2200, 2300, 2400, 2600, 2900)
    band_upper <- c(1399, 1599, 1899, 2199, 2299, 2399, 2599, 2899, Inf)
    band_fill <- c('green', '#30DBCA', '#3094DB', '#B930DB', '#FFEA4D',
                   '#FFBF00', '#FF7E61', '#FF4117', '#CC0000')

    lapply(seq_along(band_fill), function(i) {
        if (rating_axis == 'y') {
            annotate("rect", ymin = band_lower[i], ymax = band_upper[i],
                     xmin = -Inf, xmax = Inf,
                     color = NA, fill = band_fill[i], alpha = alpha)
        } else {
            annotate("rect", xmin = band_lower[i], xmax = band_upper[i],
                     ymin = -Inf, ymax = Inf,
                     color = NA, fill = band_fill[i], alpha = alpha)
        }
    })
}

# Problem rating distribution per points bucket, one panel per division.
fig_points_vs_rating <- ggplot(df2) +
    #geom_boxplot(aes(x=bin, y=problemRating), outlier.shape = NA) +
    # NOTE(review): this first violin layer is drawn again above the bands; it
    # presumably exists so the discrete x scale is set up before the rects
    # with infinite limits are added -- confirm before removing.
    geom_violin(aes(x = bin, y = problemRating)) +
    rating_band_layers('y') +
    geom_violin(aes(x = bin, y = problemRating)) +
    geom_jitter(aes(x = bin, y = problemRating), width = .6, size = .5, alpha = .1, color = 'blue') +
    facet_wrap(~division) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
    labs(x = "Points Assigned to Problem", y = "Problem Rating")

# Density of problem ratings per division (100-wide bins) over the same bands.
c <- ggplot(df)
fig_points_histogram <- c +
    rating_band_layers('x') +
    geom_freqpoly(aes(x = problemRating, ..density.., linetype = division, group = division, color = division), binwidth = 100) +
    scale_color_manual(values = c('black', 'black', 'gray')) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

#fig_index_vs_rating <- 
#test <- c + 
#    geom_line(aes(x=problemID_simple, y=adjusted_elo, group=contestID), alpha=.2) + 
#	theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1))
#ggplotly()

# Point colours, keyed by the labels stored in df$type.
color_scale <- setNames(
    c('red', 'blue', 'gray'),
    c('hardest problem in contest', 'easiet problem in contest', 'other')
)

# Keep single-division contests only, then discard problems rated exactly 5000.
df$division <- as.character(df$division)
is_single_division <- df$division != '12'
df <- df[is_single_division, ]
is_below_5000 <- df$problemRating != 5000
df <- df[is_below_5000, ]

# Problem ratings by contest ID (one panel per division), with the per-contest
# median user rating from df_averageRating drawn on top as a black line.
fig_contestID_v_rating <- ggplot(df) +
    geom_point(
        aes(x = contestID, y = problemRating, color = type),
        alpha = .2, size = 2
    ) +
    geom_line(
        data = df_averageRating,
        aes(x = contestID, y = averageRating, group = division, linetype = division),
        color = 'black', size = 1
    ) +
    scale_color_manual(values = color_scale) +
    facet_wrap(~division, drop = TRUE, ncol = 1) +
    scale_alpha(range = c(0, 1)) +
    theme(legend.position = 'bottom')

# Only the points-vs-rating figure is currently written to disk.
#pdf('fig_contestID_v_rating.pdf', width=8, height=12); print(fig_contestID_v_rating); dev.off()
pdf('fig_points_vs_rating.pdf', width = 10, height = 5)
print(fig_points_vs_rating)
dev.off()
#pdf('fig_points_histogram.pdf', width=10, height=5); print(fig_points_histogram); dev.off()



quartz_off_screen 
                3 


In [807]:
%%R -i df_tags

# Median problem rating per tag, one single-row data frame per tag.
rows_by_tag <- split(seq_len(nrow(df_tags)), df_tags$tag, drop = TRUE)
sorting <- lapply(rows_by_tag, function(rows) {
    data.frame(tag = df_tags[rows, 'tag'][1],
               rating = median(df_tags[rows, 'problemRating']))
})

# Stack the rows and order the tags from lowest to highest median rating.
sorting <- do.call(rbind, sorting)
sorting <- sorting[order(sorting$rating), ]

# Factor levels follow that order, so the x axis is sorted by median rating.
df_tags$tag <- factor(df_tags$tag, levels = sorting$tag)
#print(sorting[order(sorting),])

# Opacity of the rating-tier background bands.
bgalpha <- .2

# Rating-tier bands as horizontal stripes: lower bound, upper bound, fill.
tier_lower <- c(1200, 1400, 1600, 1900, 2200, 2300, 2400, 2600, 2900)
tier_upper <- c(1399, 1599, 1899, 2199, 2299, 2399, 2599, 2899, Inf)
tier_fill <- c('green', '#30DBCA', '#3094DB', '#B930DB', '#FFEA4D',
               '#FFBF00', '#FF7E61', '#FF4117', '#CC0000')
tier_bands <- unname(Map(function(lower, upper, fill) {
    annotate("rect", ymin = lower, ymax = upper, xmin = -Inf, xmax = Inf,
             color = NA, fill = fill, alpha = bgalpha)
}, tier_lower, tier_upper, tier_fill))

pdf('fig_tags.pdf', width = 15, height = 5)
# Layer order: violin, bands, violin again (so it sits above the bands),
# then the jittered individual problems.
c <- ggplot() +
    geom_violin(data = df_tags, aes(x = tag, y = problemRating), alpha = 1) +
    tier_bands +
    geom_violin(data = df_tags, aes(x = tag, y = problemRating), alpha = 1) +
    geom_jitter(data = df_tags, aes(x = tag, y = problemRating),
                width = .6, size = .5, alpha = .2, color = 'blue') +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
print(c)
dev.off()

quartz_off_screen 
                3 


# Duplicated Problems

In [391]:
# read in probable duplicate questions
# Each data row of problem_duplicates.csv is split on ','; fields 0/1 and 2/3
# are used below as (contestID, problemID) of the two candidate problems.
with open('problem_duplicates.csv') as f:
    candidate_rows = [raw.strip().split(',') for raw in f.readlines()[1:]]

# Keep pairs that come from two different contests.
# NOTE(review): the second condition compares field 2 with field 3 (a contest
# ID against a problem ID); it looks like a different pair of fields may have
# been intended -- confirm.
duplicates = [row for row in candidate_rows
              if row[0] != row[2] and row[2] != row[3]]


def _first(frame, column):
    """Value of `column` in the first row of `frame`."""
    return frame[column].iloc[0]


df_dup = []

cnt = 0
for dup in duplicates:
    d1 = df_problems[(df_problems.contestID == int(dup[0])) & (df_problems.problemID == dup[1])]
    d2 = df_problems[(df_problems.contestID == int(dup[2])) & (df_problems.problemID == dup[3])]

    # Both problems must be present in the rated problem table.
    if len(d1) == 0 or len(d2) == 0:
        continue

    # If the first problem is from division 2, swap so it becomes the second.
    if _first(d1, 'division') == 2:
        d1, d2 = d2, d1

    # Only contests whose IDs differ by at most one are kept.
    if abs(_first(d1, 'contestID') - _first(d2, 'contestID')) > 1:
        continue

    # Skip pairs where the first problem's ID sorts after the second's.
    if _first(d1, 'problemID') > _first(d2, 'problemID'):
        continue

    df_dup.append({
        'd1_contestID': _first(d1, 'contestID'),
        'd2_contestID': _first(d2, 'contestID'),
        'd1_problemID': _first(d1, 'problemID'),
        'd2_problemID': _first(d2, 'problemID'),
        'd1_elo': _first(d1, 'problemRating'),
        'd2_elo': _first(d2, 'problemRating'),
    })

#    df_dup.append(p1[['problemRating', 'division']])
#    df_dup.append(p2[['problemRating', 'division']])

    # Stop after 100 accepted pairs.
    cnt += 1
    if cnt == 100:
        break

df_dup = pd.DataFrame.from_dict(df_dup)

As expected, there is some ELO inflation for problems that appeared in both D1 and D2.

In [821]:
%%R -i df_dup
df <- df_dup

# Lower bounds of the rating tiers; everything below 1200 forms a tier of its
# own (added as -Inf in the cut() call below).
ratings <- c(1200, 1400, 1600, 1900, 2200, 2300, 2400, 2600, 2900, Inf)
# One colour per tier, lowest tier first (gray = below 1200).
colorscales <- c('gray', 'green', '#30DBCA', '#3094DB', '#B930DB',
                 '#FFEA4D', '#FFBF00', '#FF7E61', '#FF4117', '#CC0000')

# Bin the Div. 1 rating into tiers. right = FALSE gives [1200, 1400), ... so
# the edges line up with the 1200-1399, 1400-1599, ... bands of the other
# figures.
df$color <- cut(df$d1_elo, c(-Inf, ratings), right = FALSE)
# Name the colours by tier level so the manual scale matches by name.
# Previously the names were taken from df$color before that column existed,
# which stripped them; the 10 colours were then paired positionally with only
# 9 bins, shifting every tier's colour by one.
names(colorscales) <- levels(df$color)

# Div. 1 vs Div. 2 rating of the same problem, with the y = x reference line.
fig_d1_v_d2 <- ggplot(df, aes(x = d1_elo, y = d2_elo)) +
    geom_point(alpha = .5, aes(color = color), size = 3) +
    geom_abline(intercept = 0, slope = 1, size = .2, color = 'black') +
    scale_color_manual(values = colorscales) +
    # 'none' (lower case) is the documented value for hiding the legend
    theme(legend.position = 'none') +
    labs(x = 'Div. 1 Problem ELO Score', y = 'Div. 2 Problem ELO Score')
#ggplotly()
pdf('fig_d1_v_d2.pdf', width = 5, height = 5)
print(fig_d1_v_d2)
dev.off()

quartz_off_screen 
                3 


In [479]:
%%R -i df_problems

# Mean problem rating per contest, as a numeric vector named by contest ID.
# Grouping uses df_problems$contestID: the previous code grouped by the
# contestID column of the unrelated global `df`, and indexed with [0], which
# selects nothing in 1-based R (so only the mean was ever returned).
df_writers <- tapply(df_problems$problemRating, df_problems$contestID, mean)

# Highest average rating first; show both ends of the ranking.
df_writers <- sort(df_writers, decreasing = TRUE)
print(head(df_writers))
print(tail(df_writers))

     232      223      457      571      671      434 
3339.200 3125.400 3122.833 2978.400 2938.400 2906.200 
      9      16     412      34       4     413 
1450.60 1421.00 1405.20 1379.80 1373.25 1327.40 


In [480]:
# Look up the metadata of a single contest through the contest.standings
# endpoint (official participants only).
base = 'http://codeforces.com/api/contest.standings?contestId='
suffix = '&showUnofficial=false'

# NOTE(review): `id` shadows the builtin of the same name; the name is kept
# in case other cells read it.
id = '413'
url  = base + id + suffix

# A timeout keeps the cell from hanging indefinitely if the server does not
# answer, and raise_for_status() surfaces HTTP errors directly instead of a
# confusing failure while decoding or indexing the body.
response = requests.get(url, timeout=30)
response.raise_for_status()
r = response.json()['result']
r['contest']

{u'durationSeconds': 7200,
 u'frozen': False,
 u'id': 413,
 u'name': u'Coder-Strike 2014 - Round 2',
 u'phase': u'FINISHED',
 u'relativeTimeSeconds': 71594658,
 u'startTimeSeconds': 1397977200,
 u'type': u'CF'}