# Scripts for coursework
## Step 1. Preparing data

In [2]:
import xml.etree.ElementTree as Xet
from bs4 import BeautifulSoup as bs
import pandas as pd
import xmltodict, csv

### Reading input & extracting problems

In [4]:
# Reading input file.
# The context manager guarantees the handle is closed even if read() raises.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used to decode the file -- confirm it matches the XML's actual encoding.
with open('external-43799.xml', 'r') as fd:
    xml_file = fd.read()

# NOTE(review): 'lxml' selects BeautifulSoup's HTML parser rather than an XML
# parser; the tag and attribute lookups in this notebook use lower-case names,
# so keep them in sync if the parser is ever changed.
soup = bs(xml_file, 'lxml')

# Preparing header for df_problems
cols = ["problem_title", "problem_name"]

# One record per <problem> element, holding its "title" and "longname" attributes.
rows = [
    {"problem_title": problem_tag["title"],
     "problem_name": problem_tag["longname"]}
    for problem_tag in soup.find_all("problem")
]

# Problems are indexed by their long name; the title stays a regular column.
df_problems = pd.DataFrame(rows, columns=cols).set_index('problem_name')

df_problems

Unnamed: 0_level_0,problem_title
problem_name,Unnamed: 1_level_1
Дележ яблок-1,A
Последняя цифра,B
100A summer,C
100 раз подряд в квадрате,D
Сумма цифр трехзначного числа,E
Пингвины,F*
Шашки,G*
Исполнитель раздвоитель,H*
Ханойские башни,I**
Ремонт в Ханое,J**


### Extracting users

In [5]:
# Preparing header for df_users
cols = ["user_id", "login", "user_name"]

# One record per <user> element. The records are built directly from the tag
# attributes so that no notebook-level variable named `id` is bound, which
# would shadow the builtin id() for every later cell.
rows = [
    {"user_id": user_tag["id"],
     "login": user_tag["loginname"],
     "user_name": user_tag["displayedname"]}
    for user_tag in soup.find_all("user")
]

# Users are indexed by their id; login and displayed name stay regular columns.
df_users = pd.DataFrame(rows, columns=cols).set_index('user_id')

df_users

Unnamed: 0_level_0,login,user_name
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
104786219,samonenko,Илья Самоненко
104786220,hse-cs-fd-2022-1,hse-cs-fd-2022-1
104786221,hse-cs-fd-2022-2,hse-cs-fd-2022-2
104786222,hse-cs-fd-2022-3,hse-cs-fd-2022-3
104786223,hse-cs-fd-2022-4,hse-cs-fd-2022-4
...,...,...
104786315,hse-cs-fd-2022-96,hse-cs-fd-2022-96
104786316,hse-cs-fd-2022-97,hse-cs-fd-2022-97
104786317,hse-cs-fd-2022-98,hse-cs-fd-2022-98
104786318,hse-cs-fd-2022-99,hse-cs-fd-2022-99


### Extracting submits (without tests) and their per-test results

In [9]:
# Preparing header for submits df
cols = ["contestTime", "absoluteTime", "lastUpdateTime", "id", "problem", "user", "lang", "maxTime", "maxMemory", "verdict"]
rows = []

# Preparing header for submit's tests df_tests
test_cols = ["submit_id", "number", "verdict", "time", "memory"]
test_rows = []

# Parsing input for submits (without tests)
for tag in soup.findAll("submit"):
    contestTime = tag["contesttime"]
    absoluteTime = tag["absolutetime"]
    lastUpdateTime = tag["lastupdatetime"]
    id = tag["id"]
    problem = tag["problemtitle"]
    user = tag["userid"]
    lang = tag["languageid"]
    time = tag["maxtimeusedmillis"]
    memory = tag["maxmemoryusedbytes"]
    verdict = tag["verdict"]
    
    for test in tag.findAll("test"):
        number = test["number"]
        t_verdict = test["verdict"]
        t_time = test["timeusedmillis"]
        t_memory = test["memoryusedbytes"]

        test_rows.append({"submit_id": id,
                          "number": number,
                          "verdict": t_verdict,
                          "time": t_time,
                          "memory": t_memory})
    
    rows.append({"contestTime": contestTime,
                 "absoluteTime": absoluteTime,
                 "lastUpdateTime": lastUpdateTime,
                 "id": id,
                 "problem": problem,
                 "user": user,
                 "lang": lang,
                 "maxTime": time,
                 "maxMemory": memory,
                 "verdict": verdict})
    
df = pd.DataFrame(rows, columns = cols)


df_tests = pd.DataFrame(test_rows, columns=test_cols)
df_tests = df_tests.set_index('submit_id')

df

Unnamed: 0,contestTime,absoluteTime,lastUpdateTime,id,problem,user,lang,maxTime,maxMemory,verdict
0,27347772,1670474147000,1670474150000,78966875,A,104786269,gcc7_3,0,0,CE
1,27357271,1670474157000,1670474167000,78966877,A,104786269,python3_7_3,253,4448256,OK
2,27415883,1670474215000,1670474232000,78966881,B,104786269,python3_7_3,70,4448256,OK
3,27482637,1670474282000,1670474285000,78966890,C,104786269,python3_7_3,47,4083712,OK
4,27571229,1670474371000,1670474378000,78966899,D,104786269,python3_7_3,48,4444160,OK
...,...,...,...,...,...,...,...,...,...,...
609,137125694,1670583925000,1670583928000,79063954,G*,104786246,python3_7_3,46,3928064,WA
610,137289823,1670584089000,1670584104000,79064135,G*,104786246,python3_7_3,79,4448256,OK
611,144321452,1670591121000,1670591132000,79073077,I**,104786301,python3_7_3,167,4444160,OK
612,146125154,1670592925000,1670592929000,79075929,J**,104786301,python3_7_3,51,4202496,WA


### Creating modified dataframes 
Different interpretations of verdicts:
- `df`: the original dataframe, with all verdicts kept as reported
- `df_hard`: `OK` stays `OK`, every other verdict becomes `FAIL` (a partial solution counts as a failure)
- `df_soft`: `RE`, `CE` and `PE` become `FAIL`, every other verdict becomes `OK` (a partial solution counts as a success)

In [20]:
# Work on an independent copy: a plain `df_hard = df` would only alias `df`,
# so the relabelling below would overwrite the original verdicts as well.
df_hard = df.copy()

# "Hard" interpretation: every verdict other than 'OK' is collapsed to 'FAIL'.
# A single .loc assignment is used instead of chained indexing so the write
# is guaranteed to land in df_hard itself.
df_hard.loc[df_hard.verdict != 'OK', 'verdict'] = 'FAIL'
df_hard

Unnamed: 0,contestTime,absoluteTime,lastUpdateTime,id,problem,user,lang,maxTime,maxMemory,verdict
0,27347772,1670474147000,1670474150000,78966875,A,104786269,gcc7_3,0,0,FAIL
1,27357271,1670474157000,1670474167000,78966877,A,104786269,python3_7_3,253,4448256,OK
2,27415883,1670474215000,1670474232000,78966881,B,104786269,python3_7_3,70,4448256,OK
3,27482637,1670474282000,1670474285000,78966890,C,104786269,python3_7_3,47,4083712,OK
4,27571229,1670474371000,1670474378000,78966899,D,104786269,python3_7_3,48,4444160,OK
...,...,...,...,...,...,...,...,...,...,...
609,137125694,1670583925000,1670583928000,79063954,G*,104786246,python3_7_3,46,3928064,FAIL
610,137289823,1670584089000,1670584104000,79064135,G*,104786246,python3_7_3,79,4448256,OK
611,144321452,1670591121000,1670591132000,79073077,I**,104786301,python3_7_3,167,4444160,OK
612,146125154,1670592925000,1670592929000,79075929,J**,104786301,python3_7_3,51,4202496,FAIL


## Step 2. Building process models