In [1]:
import requests
import json
import time
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def rumble(line, cell=None):
    """Run a JSONiq query on the RumbleDB server and show the result.

    Usable as a line magic (``%rumble <query>``) or as a cell magic
    (``%%rumble`` followed by the query). In cell mode the cell body is the
    query and ``line`` is ignored.

    The query is POSTed to the module-level ``server`` URL. The elapsed time
    of the round trip is always printed. The decoded JSON reply is then
    handled as follows:

    * if it has a ``'warning'`` key, the warning is printed;
    * if it has a ``'values'`` key, every value is printed as one JSON
      document per line and the magic returns ``None``;
    * otherwise the ``'error-message'`` entry is returned when present,
      else the whole decoded reply is returned.

    Args:
        line: Text following the magic name on the same line.
        cell: Cell body when invoked as ``%%rumble``; ``None`` for ``%rumble``.

    Returns:
        ``None`` when values were printed, otherwise the error message or the
        full decoded reply.
    """
    query = line if cell is None else cell

    # Encode explicitly: when a str is handed to the HTTP stack, the byte
    # encoding of the body is whatever that stack defaults to, which is not
    # guaranteed to be UTF-8 and can fail on non-ASCII query text.
    payload = query.encode("utf-8")

    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    response = requests.post(server, data=payload).json()
    end = time.perf_counter()
    print("Took: %s s" % (end - start))

    if 'warning' in response:
        print(json.dumps(response['warning']))

    if 'values' in response:
        for value in response['values']:
            print(json.dumps(value))
        return None
    if 'error-message' in response:
        return response['error-message']
    return response

# URL of the RumbleDB HTTP endpoint that the `rumble` magic POSTs queries to.
server = 'http://localhost:8001/jsoniq'

```
spark-submit --driver-memory 10G rumbledb-1.16.2-for-spark-3.1.jar --server yes --port 8001
```

```
docker run -p 8001:8001 --rm -it --mount type=bind,source=D:\Projects\bigdata-exercises\exercise11,target=/home  rumbledb/rumble --server yes --port 8001 --host 0.0.0.0 
```

## Assignment

confusion-100000.json  
confusion-2014-03-02/confusion-2014-03-02.json



1. Find the number of games where the guessed language is correct (meaning equal to the target one) and that language is Russian.

In [21]:
%%rumble
(: Q1 - number of games whose guess equals the target, where that language is Russian. :)
count(
    for $game in json-file("confusion-2014-03-02/confusion-2014-03-02.json")
    where $game.target eq $game.guess
      and $game.target eq "Russian"
    return $game
)

Took: 13.592159986495972 s
290818


2. Return the number of distinct "target" languages.

In [23]:
%%rumble
(: Q2 - number of distinct target languages across all games. :)
count(
    distinct-values(
        json-file("confusion-2014-03-02/confusion-2014-03-02.json").target
    )
)


Took: 14.838841676712036 s
78


3. Return the sample IDs (i.e., the sample field) of the top two (2) games where the guessed language is correct (equal to the target one) ordered by language (ascending), then by country (ascending), then by date (ascending).

In [3]:
%%rumble
(: Q3 - sample IDs of the first two correctly guessed games, :)
(: sorted by target language, then country, then date, all ascending. :)
for $game in json-file("confusion-2014-03-02/confusion-2014-03-02.json")
where $game.guess eq $game.target
order by
    $game.target ascending,
    $game.country ascending,
    $game.date ascending
count $rank
where $rank le 2
return $game.sample

Took: 62.572317600250244 s
"00b85faa8b878a14f8781be334deb137"
"efcd813daec1c836d9f030b30caa07ce"


4. Aggregate all games by country and target language, counting the number of guesses for each group and return the frequencies of the three most frequent country/language combinations.

In order to help with the grading please add them in the following format: count1,count2,count3 (meaning separated with commas and without any spaces between them)

In [20]:
%%rumble
(: Q4 - the three most frequent country/target-language combinations with their game counts. :)
for $game in json-file("confusion-2014-03-02/confusion-2014-03-02.json")
group by $country := $game.country, $target := $game.target
let $cnt := count($game)
order by $cnt descending
count $rank
where $rank le 3
return {"country": $country, "target": $target, "cnt": $cnt}

Took: 43.2846405506134 s
{"country": "US", "target": "French", "cnt": 112934}
{"country": "US", "target": "German", "cnt": 112007}
{"country": "US", "target": "Spanish", "cnt": 110919}


5. Find the percentage of games where the answer was correct and the correct guess was the first choice in the array of possible answers.

Please write the fraction rounded to 4 decimal places (e.g. 0.3323).

In [9]:
%%rumble
(: Q5 - fraction of all games that were guessed correctly and whose guess :)
(: was also the first entry of the choices array. :)
count(
    for $game in json-file("confusion-2014-03-02/confusion-2014-03-02.json")
    where $game.target eq $game.guess
      and $game.guess eq $game.choices[[1]]
    return $game
)
div
count(json-file("confusion-2014-03-02/confusion-2014-03-02.json"))

Took: 27.67630171775818 s
0.2560398308


6. Sort the languages by decreasing overall percentage of correct guesses and return the first three languages.

In order to help with the grading please add them in the following format: language1,language2,language3 (meaning separated with commas and without any spaces between them)

In [2]:
%%rumble
(: Q6 - the three target languages with the highest share of correct guesses. :)

(: Number of correct guesses per target language. :)
let $correct :=
    for $hit in json-file("confusion-2014-03-02/confusion-2014-03-02.json")
    where $hit.target eq $hit.guess
    group by $language := $hit.target
    return {"language" : $language, "cnt1" : count($hit)}

(: Number of games per target language. :)
let $total :=
    for $game in json-file("confusion-2014-03-02/confusion-2014-03-02.json")
    group by $language := $game.target
    return {"language" : $language, "cnt2" : count($game)}

(: Join both aggregates on the language and compute the success ratio. :)
let $ratios :=
    for $c in $correct, $t in $total
    where $c.language eq $t.language
    return {"language": $c.language, "p": $c.cnt1 div $t.cnt2}

for $ratio in $ratios
order by $ratio.p descending
count $rank
where $rank le 3
return $ratio

Took: 54.376150608062744 s
{"language": "French", "p": 0.9382414927}
{"language": "German", "p": 0.9197634593}
{"language": "Spanish", "p": 0.8956432116}


Trying to use `let` to emulate an `AS` alias.  
It seems that the result of the join must first be bound to a variable (`$z`) and then iterated over.

In [3]:
%%rumble
(: Q6 variant - the ratio is bound with let inside the join, and sorting and :)
(: the top-three cut-off happen there too; the outer loop only emits the result. :)

(: Number of correct guesses per target language. :)
let $correct :=
    for $hit in json-file("confusion-2014-03-02/confusion-2014-03-02.json")
    where $hit.target eq $hit.guess
    group by $language := $hit.target
    return {"language" : $language, "cnt1" : count($hit)}

(: Number of games per target language. :)
let $total :=
    for $game in json-file("confusion-2014-03-02/confusion-2014-03-02.json")
    group by $language := $game.target
    return {"language" : $language, "cnt2" : count($game)}

(: Join on the language, rank by success ratio, keep the best three. :)
let $top :=
    for $c in $correct, $t in $total
    where $c.language eq $t.language
    let $p := $c.cnt1 div $t.cnt2
    order by $p descending
    count $rank
    where $rank le 3
    return {"language": $c.language, "p": $p}

for $entry in $top
return $entry

Took: 51.431007623672485 s
{"language": "French", "p": 0.9382414927}
{"language": "German", "p": 0.9197634593}
{"language": "Spanish", "p": 0.8956432116}


7. Return the number of games played on the latest day.

In [18]:
%%rumble
(: Q7 - number of games played on the most recent date in the dataset. :)
let $latest := max(json-file("confusion-2014-03-02/confusion-2014-03-02.json").date)
return count(
    for $game in json-file("confusion-2014-03-02/confusion-2014-03-02.json")
    where $game.date eq $latest
    return $game
)

Took: 26.778305053710938 s
65653
