In [1]:
import numpy as np
import spacy
from DataReader.XMLReader import get_essays
from matplotlib import pyplot as plt
from collections import Counter
import operator

## 1. Load dataset

Several levels can be combined by listing them in `levels`; the essays and scores of all listed levels are concatenated.

In [3]:
# Levels to analyse; each level is read from its own XML file and the
# resulting essays/scores are accumulated into two flat lists.
levels = [4,5,6]
essays, scores = [], []
for level in levels:
    print(f"loading level {level}...")
    dataset = f"EF-dataset/Ef{level}.xml"
    essays_level, scores_level = get_essays(dataset)
    essays.extend(essays_level)
    scores.extend(scores_level)
print("done")
print(f"Number of essays: {len(essays)}")

loading level 4...
loading level 5...
loading level 6...
done
Number of essays: 69355


## 2. Calculate score distribution
First we'll count the occurrences of each score.

In [None]:
# 2.1 count & sort results
# Tally how often each score (cast to int) occurs, then list the
# (score, count) pairs in ascending score order.
c = Counter(int(score) for score in scores)
counted_scores = sorted(c.items(), key=operator.itemgetter(0))

In [None]:
# 2.2 create a figure
# Split the sorted (score, count) pairs into one tuple of scores and one of counts.
x_values, y_values = zip(*counted_scores)

# Express each count as a percentage of all essays instead of an absolute value.
one_percent_of_essays = len(essays) / 100
y_values_dist = np.divide(y_values, one_percent_of_essays)

# Disabled: larger bar chart of the percentage distribution with a rotated
# percentage label drawn above every bar.
#plt.figure(figsize=(25, 20))
#plt.bar(x_values,y_values_dist, color="blue",align='center', width=0.3)
#plt.title("Score distribution for levels " + ', '.join(map(str,levels)))
#plt.xlabel("")
#plt.ylabel("%")
#plt.xticks(x_values)

#for i, v in enumerate(y_values_dist):
#    plt.text(float(x_values[i])-0.5, v+0.2, str(round(v,2))+ "%",rotation=90,size=10)
#plt.show()

In [None]:
# Bar chart of the absolute number of essays per score.
# Both axes are labelled so the figure can be read without the surrounding cells.
fig, ax = plt.subplots(figsize=(5, 5))
ax.bar(x_values, y_values, width=1)
ax.set_title("Score distribution for levels " + ', '.join(map(str,levels)))
ax.set_xlabel("Score")
ax.set_ylabel("Number of essays")
plt.show()

In [None]:
# Print a tab-separated table: score, number of essays, share of all essays in percent.
print("score\tnumber\tpercentage")
for score, number_of_essays in counted_scores:
    share = round((number_of_essays / len(essays)) * 100, 2)
    print(f"{score}\t{number_of_essays}\t{share}%")

## 3. Calculate the token-count distribution

In [None]:
# 3.1. tokenize texts & count tokens
token_count_list = []
def token_counter(doc):
    token_count_list.append(len(doc))
    return doc

nlp = spacy.load('en_core_web_sm')
# we only need the tokenizer
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')
nlp.remove_pipe('ner')

nlp.add_pipe(token_counter, name="token counter", last=True)

docs = list(nlp.pipe(essays, batch_size=500))

In [None]:
# 3.2 count token_num occurrences
# Tally how many essays share each token count, ordered by ascending token count.
c = Counter(token_count_list)
counted_token = sorted(c.items(), key=operator.itemgetter(0))

# filter outliers: keep only the entries with fewer than 500 tokens
counted_token_filtered = list(filter(lambda pair: pair[0] < 500, counted_token))

In [None]:
# Bar chart of how many essays have a given token count
# (x = token count, y = number of essays with that count).
# Both axes are labelled so the figure can be read without the surrounding cells.
x_values, y_values = zip(*counted_token_filtered)
fig, ax = plt.subplots(figsize=(5, 5))
ax.bar(x_values, y_values, width=1)
ax.set_title("Token distribution for levels " + ', '.join(map(str,levels)))
ax.set_xlabel("Number of tokens per essay")
ax.set_ylabel("Number of essays")
plt.show()

In [None]:
# Average number of tokens per score.
#
# Step 1: collect the token counts of all essays that received a given score.
# Scores 1..100 are pre-seeded so that scores nobody received still appear
# in the result (with an average of 0).
token_counts_by_score = {score: [] for score in range(1, 101)}
for doc, raw_score in zip(docs, scores):
    # Cast to int, as the score-distribution count does; a raw score that is
    # not an int key (e.g. a string) would otherwise raise a KeyError.
    # setdefault also tolerates scores outside the pre-seeded 1..100 range.
    token_counts_by_score.setdefault(int(raw_score), []).append(len(doc))

# Step 2: reduce every list to its mean. The result is built as a new dict
# instead of overwriting values while iterating over the same dict.
score_token_ratio = {
    score: (np.average(token_counts) if token_counts else 0)
    for score, token_counts in token_counts_by_score.items()
}
score_token_ratio

In [None]:
# Bar chart of the average token count per score
# (x = score, y = average number of tokens of the essays with that score).
# Both axes are labelled so the figure can be read without the surrounding cells.
x_values, y_values = zip(*score_token_ratio.items())
fig, ax = plt.subplots(figsize=(5, 5))
ax.bar(x_values, y_values, width=1)
ax.set_title("Average token number for each grade for levels " + ', '.join(map(str,levels)))
ax.set_xlabel("Score")
ax.set_ylabel("Average number of tokens")
plt.show()