In [61]:

from collections import Counter
import spacy
# Shared spaCy pipeline used by every analysis cell below.
# NOTE(review): "en_core_web_sm" is an English model name — confirm it is
# intended for every corpus that gets passed through `nlp`.
nlp = spacy.load("en_core_web_sm")
# Input corpora, resolved relative to the notebook's working directory.
# Presumably the English and Icelandic PUD test texts — verify against the data source.
path_en = "en_pud-ud-test.txt"
path_is = "is_pud-ud-test.txt"

In [62]:

# Read each corpus fully into memory. The context managers close the file
# handles as soon as the text is read, and the explicit encoding keeps
# non-ASCII (e.g. Icelandic) characters from being decoded with a
# platform-dependent default.
# NOTE(review): assumes both files are UTF-8 encoded — confirm against the data.
with open(path_en, encoding="utf-8") as fp_en:
    data_en = fp_en.read()

with open(path_is, encoding="utf-8") as fp_is:
    data_is = fp_is.read()


In [70]:
def top_20_pos_dep_extractor(data, top_n=20, pipeline=None):
    """Return the most frequent POS tags and dependency labels found in a text.

    Parameters
    ----------
    data : str
        Raw text to analyse.
    top_n : int, optional
        Number of most frequent entries to return per category (default 20,
        matching the function name).
    pipeline : callable, optional
        Callable that turns ``data`` into an iterable of tokens exposing
        ``pos_`` and ``dep_`` attributes. When omitted, the notebook-level
        ``nlp`` pipeline is used, so existing calls behave as before. Passing
        a pipeline explicitly allows a language-appropriate model to be used.

    Returns
    -------
    tuple
        ``(top_pos, top_deps)``: two lists of ``(label, count)`` pairs sorted
        by descending count.
    """
    if pipeline is None:
        # Fall back to the pipeline loaded at the top of the notebook.
        pipeline = nlp
    doc = pipeline(data)

    # Tally both annotations in a single pass over the tokens.
    pos_counts = Counter()
    dep_counts = Counter()
    for token in doc:
        pos_counts[token.pos_] += 1
        dep_counts[token.dep_] += 1

    # most_common() sorts by descending frequency and truncates to top_n.
    return pos_counts.most_common(top_n), dep_counts.most_common(top_n)

def _print_relative_change(counts_lang1, counts_lang2, lang_tag2):
    """Print, for each label in counts_lang1, its percentage change relative to counts_lang2."""
    for tag, count_lang1 in counts_lang1.items():
        if tag not in counts_lang2:
            print(f'{tag} is not present in {lang_tag2}')
        elif counts_lang2[tag] == 0:
            # A zero reference count would make the ratio undefined.
            print(f'{tag} has a zero count in {lang_tag2}')
        else:
            print(tag, (count_lang1/counts_lang2[tag])*100 - 100)


def compare_features(top_pos_lang1, top_deps_lang1, lang_tag1, top_pos_lang2, top_deps_lang2, lang_tag2):
    """Print how the label counts of language 1 differ from those of language 2.

    For every label in language 1's lists, prints the percentage change of its
    raw count relative to language 2's count, ``(count1 / count2) * 100 - 100``,
    or a message when the label is missing from language 2's list.

    Parameters
    ----------
    top_pos_lang1, top_deps_lang1 : iterable of (label, count) pairs
        POS-tag and dependency-label counts for language 1.
    lang_tag1 : str
        Display name of language 1 (used in the headings).
    top_pos_lang2, top_deps_lang2 : iterable of (label, count) pairs
        POS-tag and dependency-label counts for language 2 (the reference).
    lang_tag2 : str
        Display name of language 2.

    Returns
    -------
    None
        Results are written to standard output.
    """
    print(f'\nComparing the top 20 POS tags - change {lang_tag1}/{lang_tag2}%:')
    _print_relative_change(dict(top_pos_lang1), dict(top_pos_lang2), lang_tag2)

    print(f'\nComparing the top 20 dependency labels - change {lang_tag1}/{lang_tag2}%:')
    _print_relative_change(dict(top_deps_lang1), dict(top_deps_lang2), lang_tag2)
        

def display_features(top_pos,top_deps,lang_tag):
    """Print the POS-tag table and then the dependency-label table for one language.

    Each table is a heading line followed by one tab-separated
    ``label<TAB>count`` line per entry of the corresponding list.
    """
    sections = (
        (f'\nTop 20 POS tags - {lang_tag}:', top_pos),
        (f'\nTop 20 dependency labels - {lang_tag}:', top_deps),
    )
    for heading, rows in sections:
        print(heading)
        for name, freq in rows:
            print(f'{name}\t{freq}')

    

In [64]:
# Extract the label counts for both corpora first, then print them.
# NOTE(review): both texts go through the same `nlp` pipeline, which is loaded
# from an English model — the Icelandic annotations should be read with that in mind.
features_en = top_20_pos_dep_extractor(data_en)
features_is = top_20_pos_dep_extractor(data_is)

# Keep the individual names available for later cells.
top_pos_en, top_deps_en = features_en
top_pos_is, top_deps_is = features_is

for lang_label, (lang_pos, lang_deps) in (("English", features_en), ('Icelandic', features_is)):
    display_features(lang_pos, lang_deps, lang_label)




Top 20 POS tags - English:
NOUN	3840
ADP	2559
PUNCT	2449
VERB	2146
DET	2059
PROPN	2045
SPACE	1614
ADJ	1477
PRON	1076
AUX	1029
ADV	736
CCONJ	573
PART	459
NUM	440
SCONJ	292
SYM	24
X	3
INTJ	1

Top 20 dependency labels - English:
punct	2461
prep	2405
pobj	2338
det	2047
dep	1627
nsubj	1374
amod	1327
compound	1036
ROOT	991
dobj	871
advmod	812
aux	673
conj	613
cc	575
poss	354
auxpass	267
ccomp	264
nummod	258
advcl	256
nsubjpass	222

Top 20 POS tags - Icelandic:
PROPN	10349
NOUN	3060
PUNCT	2118
SPACE	1594
ADP	1018
VERB	696
ADJ	613
INTJ	529
NUM	347
X	157
ADV	75
PRON	55
SCONJ	49
AUX	43
DET	40
SYM	9
CCONJ	3
PART	3

Top 20 dependency labels - Icelandic:
compound	6464
punct	2232
dep	1700
nmod	1558
appos	1357
prep	1244
pobj	1098
ROOT	1026
dobj	892
nsubj	813
amod	553
intj	444
nummod	298
conj	243
npadvmod	224
advmod	125
ccomp	106
det	85
advcl	58
relcl	50


In [71]:
# Report Icelandic counts relative to English ones (English is the reference).
compare_features(
    top_pos_lang1=top_pos_is,
    top_deps_lang1=top_deps_is,
    lang_tag1='Icelandic',
    top_pos_lang2=top_pos_en,
    top_deps_lang2=top_deps_en,
    lang_tag2="English",
)


Comparing the top 20 POS tags - change Icelandic/English%:
PROPN 406.0635696821516
NOUN -20.3125
PUNCT -13.515720702327485
SPACE -1.2391573729863694
ADP -60.218835482610395
VERB -67.56756756756756
ADJ -58.49695328368314
INTJ 52800.0
NUM -21.136363636363626
X 5133.333333333334
ADV -89.80978260869566
PRON -94.88847583643123
SCONJ -83.21917808219177
AUX -95.82118561710398
DET -98.05730937348227
SYM -62.5
CCONJ -99.47643979057591
PART -99.34640522875817

Comparing the top 20 dependency labels - change Icelandic/English%:
compound 523.9382239382239
punct -9.305160503860222
dep 4.48678549477566
nmod is not present in English
appos is not present in English
prep -48.274428274428274
pobj -53.03678357570573
ROOT 3.5317860746720413
dobj 2.411021814006901
nsubj -40.8296943231441
amod -58.327053504144686
intj is not present in English
nummod 15.503875968992247
conj -60.35889070146819
npadvmod is not present in English
advmod -84.60591133004925
ccomp -59.84848484848485
det -95.84758182706399
advcl

For the top 20 POS tags:

    The most frequent POS tag in Icelandic is PROPN, while in English it is NOUN.
    The second most frequent tag is NOUN in Icelandic (PUNCT is third), but ADP in English.
    The count of INTJ is about 52800% higher in Icelandic than in English (529 vs. 1), and the count of X is also much higher in Icelandic, by about 5133% (157 vs. 3). Note that these are changes in raw counts, not in proportions of all tokens.
    The least frequent tags in Icelandic are CCONJ and PART, while in English those are X and INTJ.
    The largest relative decreases are for CCONJ and PART, which are each about 99% less frequent in Icelandic than in English.

For the top 20 dependency labels:

    The most frequent dependency label in  Icelandic is compound, while in English it's punct.
    The second most frequent label in Icelandic is punct, but in English it is prep.
    The count of compound is much higher in Icelandic than in English, by about 524%.
    The labels nmod, appos, intj, npadvmod, and relcl are in the Icelandic top 20 but are not present in the top 20 labels of English.
    The label "det" is about 96% less frequent in Icelandic than in English.
    The least frequent labels in Icelandic are det, advcl and relcl, while in English, they are nummod, advcl and nsubjpass.