#Setup

##Bibliotheken laden und importieren

In [1]:
%%capture
!pip install language-tool-python
!pip install xlsxwriter
!pip install -U spacy
!pip install textacy
!python -m spacy download de_core_news_lg
!pip install lexical-diversity
!pip install compound-split

In [2]:
import pandas as pd
import numpy as np
import xlsxwriter
import math
import spacy
import textacy
import lexical_diversity
import language_tool_python
from tqdm import tqdm
from spacy.tokens import DocBin
from textacy import text_stats
from lexical_diversity import lex_div as ld
from compound_split import char_split

In [3]:
!pip install Stanza
import stanza
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)
import os
os.environ["CORENLP_HOME"] = corenlp_dir
stanza.download_corenlp_models(model='german', version='4.1.0', dir=corenlp_dir)
from stanza.server import CoreNLPClient

Collecting Stanza
  Downloading stanza-1.2.3-py3-none-any.whl (342 kB)
[?25l[K     |█                               | 10 kB 22.8 MB/s eta 0:00:01[K     |██                              | 20 kB 28.9 MB/s eta 0:00:01[K     |██▉                             | 30 kB 33.0 MB/s eta 0:00:01[K     |███▉                            | 40 kB 34.6 MB/s eta 0:00:01[K     |████▉                           | 51 kB 31.4 MB/s eta 0:00:01[K     |█████▊                          | 61 kB 29.9 MB/s eta 0:00:01[K     |██████▊                         | 71 kB 21.0 MB/s eta 0:00:01[K     |███████▋                        | 81 kB 22.4 MB/s eta 0:00:01[K     |████████▋                       | 92 kB 23.9 MB/s eta 0:00:01[K     |█████████▋                      | 102 kB 25.1 MB/s eta 0:00:01[K     |██████████▌                     | 112 kB 25.1 MB/s eta 0:00:01[K     |███████████▌                    | 122 kB 25.1 MB/s eta 0:00:01[K     |████████████▌                   | 133 kB 25.1 MB/s eta 0:0

Installing CoreNLP package into ./corenlp...


Downloading http://nlp.stanford.edu/software/stanford-corenlp-latest.zip:   0%|          | 0.00/504M [00:00<?,…

For customized installation location, please set the `CORENLP_HOME` environment variable to the location of the installation. In Unix, this is done with `export CORENLP_HOME=./corenlp`.
Downloading german models (version 4.1.0) into directory ./corenlp...


Downloading http://nlp.stanford.edu/software/stanford-corenlp-4.1.0-models-german.jar:   0%|          | 0.00/2…

In [4]:
tool = language_tool_python.LanguageTool('de-DE') 
nlp = spacy.load("de_core_news_lg")
de = textacy.load_spacy_lang("de_core_news_lg")
nlp = spacy.load("de_core_news_lg")

Downloading LanguageTool: 100%|██████████| 203M/203M [00:04<00:00, 44.0MB/s]
Unzipping /tmp/tmpfeso5uqa.zip to /root/.cache/language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-5.4.zip to /root/.cache/language_tool_python.


##Funktionen definieren

In [18]:
def count_errors(_text):
  errors = 0
  matches = tool.check(_text)
  for match in matches:
    if (match.ruleIssueType == 'uncategorized' or match.ruleIssueType == 'misspelling') and (match.matchedText not in '-month-' or match.matchedText not in '-number-'):
        errors += 1
  return errors

def correct_errors(_text):
  if match.matchedText != '-month-' or match.matchedText != '-number-':
    correct_text = tool.correct(_text)
  return correct_text

def load_docs_from_disc(_path):
  from spacy.tokens import DocBin
  _nlp = spacy.load("de_core_news_lg")
  _doc_bin = DocBin().from_disk(_path)
  _docs = list(_doc_bin.get_docs(_nlp.vocab))
  return _docs

def get_diversity_features(_doc): #braucht spacy doc als Input
  _lemma = []
  for _token in _doc:
      _lemma.append(_token.lemma_)
  _freatures = {
  'lex_ttr': ld.ttr(_lemma),
  'lex_root_ttr': ld.root_ttr(_lemma),
  'lex_hdd': ld.hdd(_lemma),
  'lex_mtld': ld.mtld(_lemma)
  }
  return _freatures

def get_num_adverb_types(_doc):
  _adverbs = []
  for _token in _doc:
    if 'ADV' in _token.tag_:
      _adverbs.append(_token.lemma_)
  if len(_adverbs) == 0:
    _num_adverbs = 0 
    _num_adverb_types = 0
  else:
    _adverbs = np.char.lower(np.array(_adverbs))
    _num_adverbs = len(_adverbs)
    _num_adverb_types = len(np.unique(_adverbs))
  return _num_adverb_types, _num_adverbs 

def get_num_adjective_types(_doc):
  _adjectives = []
  for _token in _doc:
    if 'ADJ' in _token.tag_:     
      _adjectives.append(_token.lemma_)
  if len(_adjectives) == 0:
    _num_adjectives = 0 
    _num_adjective_types = 0
  else:
    _adjectives = np.char.lower(np.array(_adjectives))
    _num_adjectives = len(_adjectives)
    _num_adjective_types = len(np.unique(_adjectives))
  return _num_adjective_types, _num_adjectives

def get_num_verb_types(_doc):
  _verbs = []
  for _token in _doc:
    if 'VV' in _token.tag_ or 'VA' in _token.tag_ or 'VM' in _token.tag_:
      _verbs.append(_token.lemma_)
  if len(_verbs) == 0:
    _num_verbs = 0 
    _num_verb_types = 0
  else:
    _verbs = np.char.lower(np.array(_verbs))
    _num_verbs = len(_verbs) 
    _num_verb_types = len(np.unique(_verbs))
  return _num_verb_types, _num_verbs

def get_num_lexical_words(_doc):
  _lex_words = []
  for _token in _doc:
    if 'VV' in _token.tag_ or 'VA' in _token.tag_ or 'VM' in _token.tag_ or 'ADJ' in _token.tag_ or 'NN' in _token.tag_:
      _lex_words.append(_token.lemma_)
  if len(_lex_words) == 0:
    _num_lex_words = 0 
  else:
    _lex_words = np.char.lower(np.array(_lex_words))
    _num_lex_words = len(_lex_words) 
  return _num_lex_words

def save_division(n, d):
    return n / d if d else 0

def get_num_articles(_doc):
  _articles = []
  for _token in _doc:
    if 'ART' in _token.tag_:
      _articles.append(_token.text) 
  return len(_articles)

def get_num_nouns(_doc):
  _nouns = []
  for _token in _doc:
    if _token.pos_ == 'NOUN':
      _nouns.append(_token.text)
  return len(_nouns)

def get_num_noun_types(_doc):
  _nouns = []
  for _token in _doc:
    if _token.pos_ == 'NOUN':
      _nouns.append(_token.lemma_)
  if len(_nouns) == 0:
    _num_nouns = 0 
    _num_nouns_types = 0
  else:
    _nouns = np.char.lower(np.array(_nouns))
    _num_nouns = len(_nouns) 
    _num_nouns_types = len(np.unique(_nouns))
  return _num_nouns_types, _num_nouns

def get_num_verb_person(_doc, _person):
  _fist_person_words = []
  for _token in _doc:
    if _person in _token.morph.get('Person'):
      _fist_person_words.append(_token.text)
  return len(_fist_person_words)

def get_num_past_tense_verbs(_doc):
  _past_tense_verbs = []
  for _token in _doc:
    if 'VV' in _token.tag_ or 'VA' in _token.tag_ or 'VM' in _token.tag_ and 'Past' in _token.morph.get('Tense'):
      _past_tense_verbs.append(_token.text)
  return len(_past_tense_verbs)

def get_num_finverbs(_doc):
  _verbs = []
  for _token in _doc:
    if 'VVFIN' in _token.tag_ or 'VAFIN' in _token.tag_ or 'VMFIN' in _token.tag_:
      _verbs.append(_token.text)
  return len(_verbs)

def get_num_past_participle_verbs(_doc):
  _verbs = []
  for _token in _doc:
    if 'VVPP' in _token.tag_ or 'VAPP' in _token.tag_ or 'VMPP' in _token.tag_:
      _verbs.append(_token.text)
  return len(_verbs)

def get_num_case_nouns(_doc, _case):
  _case_nouns = []
  for _token in _doc:
    if 'NN' in _token.tag_ and _case in _token.morph.get('Case'):
      _case_nouns.append(_token.text)
    if len(_case_nouns) == 0:
      return 0
    else:
      return len(_case_nouns)

def get_num_compound_nouns(_doc):
  _compounds = []
  for _token in _doc:
    _split = char_split.split_compound(_token.text)
    if _token.pos_ == 'NOUN' and _split[0][0] > 0.5:
      _compound = []
      _compound.append(_token.text)
      _compound.append(_split[0])
      _compounds.append(_compound)
  return len(_compounds)

def get_num_numerus(_doc, _num):
  _numerus = []
  for _token in _doc:
    if _num in _token.morph.get(_num):
      _numerus.append(_token.text)
  return len(_numerus)

def get_num_suffix(_doc, _suffix):
  _suffix = []
  for _token in _doc:
    if 'NN' in _token.tag_ and  _token.text.endswith(str(_suffix)):
      _suffix.append(_token.text)
  return len(_suffix)

  def match_pattern(_pattern, _text):
    matches = client.tregex(_text,_pattern)
    print(matches)
    l = matches['sentences']
    for sentence in l:
      for match_id in sentence:
        print(sentence[match_id]['match'])

def count_matches(_pattern, _text):
  matches = client.tregex(_text,_pattern)
  list = matches['sentences']
  count_units = 0
  for sentence in l:
    for match_id in sentence:
      count_units = count_units + 1
  return count_unit, l

def avg_length_matches(_pattern, _text):
  matches = client.tregex(_text,_pattern)
  l = matches['sentences']
  length_sum = 0
  for sentence in l:
    for match_id in sentence:
      start = sentence[match_id]['characterOffsetBegin']
      end = sentence[match_id]['characterOffsetEnd']
      length_sum = length_sum + (end - start)
  avg_length = length_sum/len(l[0])
  return avg_length, l

  def save_division(n, d):
    return n / d if d else 0
    
def get_dep_per_noun(_doc):
  _num_nouns = 0
  _num_dependents_noun = 0
  for _token in _doc:
    if 'NOUN' in _token.pos_:
      _num_nouns += 1
      _num_dependents_noun += len([child for child in _token.children])
  return save_division(_num_dependents_noun,_num_nouns)

def get_dep_per_verb(_doc):
  _num_verbs = 0
  _num_dependents_verb = 0
  for _token in _doc:
    if 'VERB' in _token.pos_:
      _num_verbs += 1
      _num_dependents_verb += len([child for child in _token.children])
  return save_division(_num_dependents_verb,_num_verbs)

def get_avg_frequency(_doc, _frequency_table, _column_of_words, _solumn_of_values):
  _tokens = 0
  _sum_frequency = 0
  for _token in _doc:
    _tokens += 1
    _match = _frequency_table[_frequency_table[_column_of_words]==_token.text]
    if math.isnan(_match[_solumn_of_values].max()) == False:
      _sum_frequency += _match[_solumn_of_values].max()
  return _sum_frequency / _tokens

def get_textacy_features(_doc): #braucht Spacy Doc als Inputformat
  _ts = text_stats.TextStats(_doc)
  _features = {
  'shal_text_length': _ts.n_words,
  'shal_chars_per_word' : _ts.n_chars/_ts.n_words,
  'shal_polysyl_to_word':  _ts.n_polysyllable_words/_ts.n_words,
  'shal_sents_length': _ts.n_words/_ts.n_sents,
  'shal_syl_per_word': sum(_ts.n_syllables_per_word)/_ts.n_words,
  'lex_fog': _ts.gunning_fog_index,
  'lex_smog': _ts.smog_index,
  'lex_wiener_sachtextformel': _ts.wiener_sachtextformel,
  }
  return _features

#Daten laden

In [6]:
data = pd.read_excel('/content/drive/MyDrive/Masterarbeit/Data/texts.xlsx', index_col=0)  

#Spacy Dokumente erstellen und speichern

In [None]:
texts = data['text'].tolist()
docs = []
for doc in tqdm(nlp.pipe(texts)):
    docs.append(doc)

1033it [00:16, 61.87it/s] 


##Spacy Dokumente speichern

In [None]:
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
for doc in tqdm(docs):
  doc_bin.add(doc)

100%|██████████| 1033/1033 [00:01<00:00, 803.34it/s]


In [None]:
doc_bin = DocBin(docs=docs)

In [None]:
doc_bin.to_disk("/content/drive/MyDrive/Masterarbeit/Data/spacy_docs.spacy")

#Spacy Dokumente laden

In [7]:
def load_docs_from_disc(_path):
  from spacy.tokens import DocBin
  _nlp = spacy.load("de_core_news_lg")
  _doc_bin = DocBin().from_disk(_path)
  _docs = list(_doc_bin.get_docs(_nlp.vocab))
  return _docs

In [8]:
docs = load_docs_from_disc('/content/drive/MyDrive/Masterarbeit/Data/spacy_docs.spacy')

#Rechtschreibfehler auslesen
https://pypi.org/project/language-tool-python/

In [None]:
num_errors = []
for text in tqdm(data['text']):
  num_errors.append(count_errors(text))
num_sents = []
for doc in tqdm(docs):
  ts = text_stats.TextStats(doc)
  num_sents.append(ts.n_sents)


100%|██████████| 1033/1033 [13:11<00:00,  1.31it/s]
100%|██████████| 1033/1033 [00:00<00:00, 13804.48it/s]


In [None]:
avg_num_errors = [i / j for i, j in zip(num_errors, num_sents)]

In [None]:
print(num_errors[0:3])
print(num_sents[0:3])
print(avg_num_errors[0:3])

[15, 25, 32]
[22, 15, 24]
[0.6818181818181818, 1.6666666666666667, 1.3333333333333333]


In [None]:
feature_errors = data.drop(['text'], axis=1)
feature_errors['er_avg_num_errors'] = avg_num_errors


In [None]:
feature_errors.to_excel('/content/drive/MyDrive/Masterarbeit/Data/features_errors.xlsx', engine='xlsxwriter', index=False)  

#Features mit textacy 
(Shallow Features, Features Lesbarkeitsforschung)
https://textacy.readthedocs.io/en/latest/index.html


In [19]:
text_length = []
chars_per_word = []
polysyl_to_word = []
sents_length = []
syl_per_word = []
fog = []
smog = []
wiener_sachtextformel = []

for doc in tqdm(docs):
  feat = get_textacy_features(doc)
  text_length.append(feat['shal_text_length'])
  chars_per_word.append(feat['shal_chars_per_word'])
  polysyl_to_word.append(feat['shal_polysyl_to_word'])
  sents_length.append(feat['shal_sents_length'])
  syl_per_word.append(feat['shal_syl_per_word'])
  fog.append(feat['lex_fog'])
  smog.append(feat['lex_smog'])
  wiener_sachtextformel.append(feat['lex_wiener_sachtextformel'])

textacy_features = data.drop(columns='text')
textacy_features['shal_text_length'] = text_length
textacy_features['shal_chars_per_word'] = chars_per_word
textacy_features['shal_polysyl_to_word'] = polysyl_to_word
textacy_features['shal_sents_length'] = sents_length
textacy_features['shal_syl_per_word'] = syl_per_word
textacy_features['lex_fog'] = fog
textacy_features['lex_smog'] = smog
textacy_features['les_wiener_sachtextformel'] = wiener_sachtextformel

  0%|          | 0/1033 [00:00<?, ?it/s]SMOG index may be unreliable for n_sents < 30
doc lang = '%s', but wiener sachtextformel is meant for use on German-language texts, only
  0%|          | 0/1033 [00:01<?, ?it/s]


KeyError: ignored

In [None]:
textacy_features

Unnamed: 0,id,shal_text_length,shal_chars_per_word,shal_polysyl_to_word,shal_sents_length,shal_syl_per_word,les_fog,les_smog,les_wiener_sachtextformel
0,1023_0101853,166,5.897590,0.204819,7.545455,1.987952,11.210953,10.230975,7.209793
1,1023_0101856,129,5.759690,0.224806,8.600000,1.868217,12.432248,11.072351,7.514238
2,1023_0101855,186,5.166667,0.145161,7.750000,1.752688,8.906452,9.188382,4.994510
3,1023_0101893,171,5.929825,0.251462,9.000000,1.976608,13.658480,11.723239,8.783075
4,1023_0101895,169,6.118343,0.266272,10.562500,2.029586,14.875888,12.709667,9.080222
...,...,...,...,...,...,...,...,...,...
1028,1023_0101849,149,5.583893,0.214765,7.450000,1.879195,11.570604,10.355216,6.886009
1029,1023_0101848,110,5.890909,0.236364,7.857143,2.018182,12.597403,10.914261,8.507896
1030,1023_0101852,150,5.700000,0.226667,8.333333,1.913333,12.400000,10.980519,7.767067
1031,1023_0101851,162,6.148148,0.253086,7.714286,2.024691,13.209171,11.111385,8.683285


In [None]:
textacy_features.to_excel('/content/drive/MyDrive/Masterarbeit/Data/features_textacy.xlsx', engine='xlsxwriter', index=False) 

#Lexical Features

##Lexical Diversity Features
https://pypi.org/project/lexical-diversity/

In [None]:
hdd = []
mtld =[]
root_ttr =[]
ttr =[]
for doc in tqdm(docs):
  div = get_diversity_features(doc)
  hdd.append(div['lex_hdd'])
  mtld.append(div['lex_mtld'])
  root_ttr.append(div['lex_root_ttr'])
  ttr.append(div['lex_ttr'])
diversity_features = data.drop(columns='text')
diversity_features['lex_hdd'] = hdd
diversity_features['lex_mtld'] = mtld
diversity_features['lex_root_ttr'] = root_ttr
diversity_features['lex_ttr'] = ttr

100%|██████████| 1033/1033 [00:02<00:00, 439.98it/s]


In [None]:
diversity_features

Unnamed: 0,id,lex_hdd,lex_mtld,lex_root_ttr,lex_ttr
0,1023_0101853,0.809061,60.495676,8.190251,0.598930
1,1023_0101856,0.870824,87.934713,8.168874,0.687943
2,1023_0101855,0.775203,52.001214,7.645529,0.531401
3,1023_0101893,0.804940,62.044725,8.092098,0.579487
4,1023_0101895,0.799652,53.467725,7.740703,0.570652
...,...,...,...,...,...
1028,1023_0101849,0.796394,51.563177,7.542472,0.592593
1029,1023_0101848,0.781611,52.903756,6.914804,0.620968
1030,1023_0101852,0.808166,58.681837,7.699607,0.604938
1031,1023_0101851,0.798132,52.983324,7.866796,0.578378


In [None]:
diversity_features.to_excel('/content/drive/MyDrive/Masterarbeit/Data/features_lex_diversity.xlsx', engine='xlsxwriter', index=False) 

##Lexical Density & Variation Features

In [12]:
num_adverb_types = []
num_adjective_types = []
num_noun_types = []
num_nouns = []
num_verb_types = []
num_verbs = []
num_lexical_words = []
for doc in tqdm(docs):
  num_adverb_types.append(get_num_adverb_types(doc)[0])
  num_adjective_types.append(get_num_adjective_types(doc)[0])
  num_verb_types.append(get_num_verb_types(doc)[0])
  num_verbs.append(get_num_verb_types(doc)[1])
  num_noun_types.append(get_num_noun_types(doc)[0])
  num_nouns.append(get_num_noun_types(doc)[1])
  num_lexical_words.append(get_num_lexical_words(doc))

100%|██████████| 1033/1033 [00:00<00:00, 1098.41it/s]


In [13]:
dens_var_features = data.drop(columns=['text'])

In [15]:
dens_var_features['lex_adverb_var'] = [save_division(i, j) for i, j in zip(num_adverb_types, num_lexical_words)]
dens_var_features['lex_modifier_var'] = [save_division(i, j) for i, j in zip([h + k for h, k in zip(num_adjective_types, num_adverb_types)], num_lexical_words)]
dens_var_features['lex_noun_var'] = [save_division(i, j) for i, j in zip(num_noun_types, num_lexical_words)]
dens_var_features['lex_verb_var_1'] = [save_division(i, j) for i, j in zip(num_verb_types, num_verbs)]
dens_var_features['lex_verb_var_2'] = [save_division(i, j) for i, j in zip(num_verb_types, num_lexical_words)]
dens_var_features['lex_sqrt_verb_var_1'] = [save_division(i**2, j) for i, j in zip(num_verb_types, num_verbs)]
dens_var_features['lex_cor_verb_var_1'] = [save_division(i, math.sqrt(2*j)) for i, j in zip(num_verb_types, num_verbs)]


In [16]:
dens_var_features

Unnamed: 0,id,lex_adverb_var,lex_modifier_var,lex_noun_var,lex_verb_var_1,lex_verb_var_2,lex_sqrt_verb_var_1,lex_cor_verb_var_1
0,1023_0101853,0.045977,0.218391,0.402299,0.677419,0.241379,14.225806,2.667003
1,1023_0101856,0.100000,0.250000,0.566667,0.666667,0.133333,5.333333,1.632993
2,1023_0101855,0.096386,0.168675,0.397590,0.600000,0.216867,10.800000,2.323790
3,1023_0101893,0.071429,0.226190,0.488095,0.818182,0.214286,14.727273,2.713602
4,1023_0101895,0.038961,0.181818,0.493506,0.842105,0.207792,13.473684,2.595543
...,...,...,...,...,...,...,...,...
1028,1023_0101849,0.069444,0.222222,0.444444,0.666667,0.194444,9.333333,2.160247
1029,1023_0101848,0.056604,0.169811,0.452830,0.714286,0.188679,7.142857,1.889822
1030,1023_0101852,0.041096,0.205479,0.452055,0.666667,0.191781,9.333333,2.160247
1031,1023_0101851,0.075000,0.175000,0.462500,0.800000,0.250000,16.000000,2.828427


In [17]:
dens_var_features.to_excel('/content/drive/MyDrive/Masterarbeit/Data/features_lex_dens_var.xlsx', engine='xlsxwriter', index=False)

#Morphological Features
(Compound Nouns mit https://pypi.org/project/compound-split/)





In [None]:
num_sents =[]
num_articles = []
num_nouns = []
num_verbs = []
num_finverbs = []
num_past_participle_verbs = []
num_first_person = []
num_second_person = []
num_third_person = []
num_past_tense_verbs = []
num_nominative_nouns = []
num_genitiv_nouns = []
num_dativ_nouns = []
num_akkusativ_nouns = []
num_compound_nouns = []
num_singular = []
num_plural = []
num_keit_suffix = []
num_ung_suffix = []
num_werk_suffix = []

for doc in tqdm(docs):
  num_articles.append(get_num_articles(doc))
  num_nouns.append(get_num_nouns(doc))
  num_verbs.append(get_num_verb_types(doc)[1])
  num_finverbs.append(get_num_finverbs(doc))
  num_past_participle_verbs.append(get_num_past_participle_verbs(doc))
  num_first_person.append(get_num_verb_person(doc,'1'))
  num_second_person.append(get_num_verb_person(doc, '2'))
  num_third_person.append(get_num_verb_person(doc, '3'))
  num_past_tense_verbs.append(get_num_past_tense_verbs(doc))
  num_nominative_nouns.append(get_num_case_nouns (doc, 'Nom'))
  num_genitiv_nouns.append(get_num_case_nouns (doc, 'Gen'))
  num_dativ_nouns.append(get_num_case_nouns (doc, 'Dat'))
  num_akkusativ_nouns.append(get_num_case_nouns (doc, 'Acc'))
  num_compound_nouns.append(get_num_compound_nouns(doc))
  num_singular.append(get_num_numerus(doc,'Sing'))
  num_plural.append(get_num_numerus(doc,'Plur'))
  num_keit_suffix.append(get_num_suffix(doc,'keit'))
  num_ung_suffix.append(get_num_suffix(doc,'ung'))
  num_werk_suffix.append(get_num_suffix(doc,'werk'))
  ts = text_stats.TextStats(doc)
  num_sents.append(ts.n_sents)

morphological_features = data.drop(columns=['text'])

morphological_features['morph_article_ratio'] = [save_division(i, j) for i, j in zip(num_articles, num_sents)]
morphological_features['morph_comp_noun_ratio'] = [save_division(i, j) for i, j in zip(num_compound_nouns, num_nouns)]
morphological_features['morph_first_person_verb_ratio'] = [save_division(i, j) for i, j in zip(num_first_person, num_verbs)]
morphological_features['morph_second_person_verb_ratio'] = [save_division(i, j) for i, j in zip(num_second_person, num_verbs)]
morphological_features['morph_third_person_verb_ratio'] = [save_division(i, j) for i, j in zip(num_third_person, num_verbs)]
morphological_features['morph_first_person_fin_verb_ratio'] = [save_division(i, j) for i, j in zip(num_first_person, num_finverbs)]
morphological_features['morph_past_tense_verb_ratio'] = [save_division(i, j) for i, j in zip(num_past_tense_verbs, num_verbs)]
morphological_features['morph_past_participle_verb_ratio'] = [save_division(i, j) for i, j in zip(num_past_participle_verbs, num_verbs)]
morphological_features['morph_nominative_nouns_ratio'] = [save_division(i, j) for i, j in zip(num_nominative_nouns, num_nouns)]
morphological_features['morph_genitiv_nouns_ratio'] = [save_division(i, j) for i, j in zip(num_genitiv_nouns, num_nouns)]
morphological_features['morph_dativ_nouns_ratio'] = [save_division(i, j) for i, j in zip(num_dativ_nouns, num_nouns)]
morphological_features['morph_akkusativ_nouns_ratio'] = [save_division(i, j) for i, j in zip(num_akkusativ_nouns, num_nouns)]
morphological_features['morph_singular_nouns_ratio'] = [save_division(i, j) for i, j in zip(num_singular, num_nouns)]
morphological_features['morph_plural_nouns_ratio'] = [save_division(i, j) for i, j in zip(num_plural, num_nouns)]
morphological_features['morph_keit_suffix_ratio'] = [save_division(i, j) for i, j in zip(num_keit_suffix, num_nouns)]
morphological_features['morph_ung_suffix_ratio'] = [save_division(i, j) for i, j in zip(num_ung_suffix, num_nouns)]
morphological_features['morph_werk_suffix_ratio'] = [save_division(i, j) for i, j in zip(num_werk_suffix, num_nouns)]

100%|██████████| 1033/1033 [00:05<00:00, 180.76it/s]


In [None]:
morphological_features.to_excel('/content/drive/MyDrive/Masterarbeit/Data/features_morphological.xlsx', engine='xlsxwriter', index=False)

In [None]:
def get_num_case_nouns(_doc, _case):
  _case_nouns = []
  print(_case)
  print(type(_case))
  for _token in _doc:
    print(token.morph.get('Case'))
    if 'NN' in _token.tag_ and 'Nom' in _token.morph.get('Case'):
      print('True')
      _case_nouns.append(_token.text)
  if len(_case_nouns) == 0:
    return 0
  else:
    return len(_case_nouns)

In [None]:
get_num_case_nouns(doc,'Nom')

Nom
<class 'str'>
['Nom']


0

In [None]:
for token in doc:
  if 'NN' in token.tag_ and 'Nom' in token.morph.get('Case'):
    print(token.morph.get('Case'))


['Nom']
['Nom']
['Nom']
['Nom']
['Nom']
['Nom']
['Nom']
['Nom']
['Nom']
['Nom']


#Syntactic Features

##Parsetree (with corenlp)

In [None]:
texts=data['text'].tolist()
with CoreNLPClient(
    properties='german',
    annotators=['tokenize','ssplit', 'pos', 'mwt', 'parse'], 
    memory='8G', 
    endpoint='http://localhost:9001',
    be_quiet=True) as client:
  avg_clause_length =[]
  avg_tunit_length =[]
  avg_vp_length =[]
  num_dep_clause = []
  num_dep_clause_w_conjunc = []
  num_non_terminal_symb =[]
  num_vz = []


  for text in tqdm(texts):
    print(text)
    ann = client.annotate(text)
    avg_clause_length.append(avg_length_matches('(S > NUR) | (S >> CS >> NUR)',text)[0])
    avg_tunit_length.append(avg_length_matches('S',text)[0])
    avg_vp_length.append(avg_length_matches('VP',text)[0])
    num_dep_clause.append(count_matches('S >> (S > (CS > NUR > ROOT) | > (NUR > ROOT))',text)[0])
    num_dep_clause_w_conjunc.append(count_matches('S < SCONJ',text)[0])
    num_non_terminal_symb.append(count_matches('AA | AP | AVP | CAC | CAP | CAVP | CCP | CH | CNP | CO | CPP | CS | CVP | CVZ | DL | ISU | MPN | MTA | NM | NP | PP | QL | S | VP | VZ',text)[0])
    num_vz.append(count_matches('VZ',text)[0])   
 

##Dependency (with spacy)

In [None]:
dep_per_noun=[]
dep_per_verb=[]
for doc in tqdm(docs):
  dep_per_noun.append(get_dep_per_noun(doc))
  dep_per_verb.append(get_dep_per_verb(doc))

syn_dependency_features = data
syn_dependency_features['dep_per_noun'] = dep_per_noun
syn_dependency_features['dep_per_verb'] = dep_per_verb
syn_dependency_features = data.drop(columns=['text'])



100%|██████████| 1033/1033 [00:00<00:00, 7028.71it/s]


In [None]:
syn_dependency_features.to_excel('/content/drive/MyDrive/Masterarbeit/Data/features_syn_dependency.xlsx', engine='xlsxwriter', index=False)

#Frequency Features

##Frequency Tabelle aus DlexDB export erstellen

In [None]:
import pandas as p
import numpy as np

In [None]:
word_list = []
for doc in tqdm(docs):
  for token in doc:
    if 'NN' in token.tag_:
      word_list.append(token.lemma_)
    else:
      word_list.append(token.lemma_.lower())

word_list = pd.Series(word_list).unique()
np.savetxt('/content/drive/MyDrive/Masterarbeit/Data/unique_words.txt', word_list, fmt='%s')

100%|██████████| 1033/1033 [00:00<00:00, 6876.10it/s]


In [None]:
len(word_list)

9517

In [None]:
frequency_table = pd.read_csv('/content/drive/MyDrive/Masterarbeit/Data/dlexdb_frequency.csv',sep='	')
frequency_table = frequency_table.drop(columns=['Zeile'])
frequency_table = frequency_table.drop_duplicates()
frequency_table['Typefrequenz_normalisiert'] = frequency_table['Typefrequenz_normalisiert'].str.replace('None', '0')
frequency_table['Typefrequenz_normalisiert'] = frequency_table['Typefrequenz_normalisiert'].apply(pd.to_numeric)
frequency_table['Typefrequenz_absolut'] = frequency_table['Typefrequenz_absolut'].str.replace('None', '0')
frequency_table['Typefrequenz_absolut'] = frequency_table['Typefrequenz_absolut'].apply(pd.to_numeric)

In [None]:
frequency_table.to_excel('/content/drive/MyDrive/Masterarbeit/Data/dlexdb_frequency_clean.xlsx', engine='xlsxwriter', index=False)

##Mean Frequency

In [None]:
frequency_table = pd.read_excel('/content/drive/MyDrive/Masterarbeit/Data/dlexdb_frequency_clean.xlsx',)  

In [None]:
frequency_table['logTypefrequenz_normalisiert'] = np.log(frequency_table['Typefrequenz_normalisiert'])
frequency_table['logTypefrequenz_normalisiert'] = frequency_table['logTypefrequenz_normalisiert'].replace([np.inf, -np.inf], '0').apply(pd.to_numeric)
frequency_table['logTypefrequenz_absolut'] = np.log(frequency_table['Typefrequenz_absolut'])
frequency_table['logTypefrequenz_absolut'] = frequency_table['logTypefrequenz_absolut'].replace([np.inf, -np.inf], '0').apply(pd.to_numeric)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
avg_log_frequency = []
for doc in tqdm(docs):
  avg_log_frequency.append(get_avg_frequency(doc, frequency_table,'Wort','logTypefrequenz_normalisiert'))

100%|██████████| 1033/1033 [03:28<00:00,  4.96it/s]


In [None]:
frequency_features = data.drop(columns=['text'])
frequency_features['freq_avg_log_word_frequency'] = avg_log_frequency
frequency_features.to_excel('/content/drive/MyDrive/Masterarbeit/Data/feature_freq_meanfrequency.xlsx', engine='xlsxwriter', index=False) 

In [None]:
frequency_features

Unnamed: 0,id,avg_log_word_frequency
0,1023_0101853,4.127020
1,1023_0101856,3.873948
2,1023_0101855,4.370954
3,1023_0101893,4.535678
4,1023_0101895,3.974254
...,...,...
1028,1023_0101849,4.405623
1029,1023_0101848,4.117336
1030,1023_0101852,4.303197
1031,1023_0101851,4.242734
