## Prepare data from experiements in Python to be used in Java Implementation of the DSGVO Handler

In [1]:
from GDPRDataset import GDPRDataset
import json
from pathlib import Path
import random
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load Config

In [2]:
# Load config for dev dataset creation process
config_file = open(f"{str(Path.cwd())}/dev_dataset_for_jvm_impl_config.json")
config = json.load(config_file)
config_file.close()

### Export list of documents to fit TF-IDF Vectorizer on
Only use websites from training set to fit the vectorizer


In [6]:
fit_tfidf_documents_df = GDPRDataset(dataset_dir=config['train_dataset_path']).dataframe

In [7]:
fit_tfidf_documents = list(fit_tfidf_documents_df['content'].values)

In [10]:
with open(config['jvm_fit_tfidf_documents_path'], 'w+') as f:
    json.dump({
        'documents':fit_tfidf_documents
    },f)

### Export list of SVM training samples
Samples are pairs String to Boolean. Only use websites from training set for svm training.
Just use the json for svm training as created in svm_training_set_creation

In [13]:
with open(config['svm_training_set_v1_path'], 'r') as f_in:
    svm_training_data = json.load(f_in)

    with open(config['jvm_fit_svm_training_data_path'], 'w+') as f_out:
        json.dump(svm_training_data,f_out)

I need to check if the java svm is not working or if the problem is the TfIdf vectors. So I export the vecotirzed SVM training set

In [29]:
svm_training_data = None
with open(config['svm_training_set_v1_path'], 'r') as f_in:
    svm_training_data = json.load(f_in)

# vectorize
german_stop_words = stopwords.words('german')
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=german_stop_words)
vocab = tfidf_vectorizer.vocabulary

training_texts = [svm_training_data['text'][i] for i in svm_training_data['text'].keys()]
vecs = tfidf_vectorizer.fit_transform(training_texts)

AttributeError: 'TfidfVectorizer' object has no attribute 'vocabulary_'

In [25]:
vectors = []
for text in training_texts:
    vectors.append(tfidf_vectorizer.transform([text])[0].toarray()[0].tolist())

In [26]:

with open(config['jvm_fit_svm_training_data_vec_path'], 'w+') as f_out:
    json.dump({"tfidf_vectors": vectors},f_out)

In [27]:
for vec in vectors:
    print(np.sum(vec))

2.71961105959526
8.419630912559642
4.525580350695929
9.322955143655584
13.760945660923344
8.54515536130653
7.958315926483226
5.983214201308089
10.994325475188576
7.5825783196729635
3.015993890422198
2.8965385613439616
5.249610213069992
5.96578337239758
3.10604821816086
8.312495181902062
3.600015280667246
5.249610213069992
5.593533685538785
3.2194873204141135
8.603377631878399
5.13952597990277
10.344539847591946
9.35358103521697
3.576876721650729
4.547532200398576
4.357657376695433
9.588722979765631
10.207894543368475
3.0487233320389895
4.544373379662849
8.571245216758099
5.757702885813106
8.675251567603887
6.167777304436728
2.0294933983000853
8.519916417647014
4.5527084270353
8.688927229686197
5.217672262097688
8.73510919012003
11.46513224003783
8.908837916510944
4.8725167963933735
5.4563222141185035
7.382154117214036
8.643723188535045
4.721838052283233
7.933465640953882
8.665523634658795
3.132088980958481
9.976547099440236
8.609549119848133
7.10901914752775
8.818908089233886
8.2574473

In [30]:
german_stop_words = stopwords.words('german')
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=german_stop_words)
tfidf_vectorizer.fit([
    "Klettern ist die Bewegung in der Vertikalen mithilfe der Beine, Arme und Hände entlang von Kletterrouten",
    "Klettern ist eine vom Menschen seit jeher angewandte Fortbewegungsart in der vertikalen Ebene.",
    "Im Mittelalter bekamen Felsen eine zunehmende strategische Bedeutung."
])

print(tfidf_vectorizer.transform([
    "Klettern Menschen Beine",
    "Menschen Beine Klettern Menschen"
]))

  (0, 14)	0.6227660078332259
  (0, 12)	0.4736296010332684
  (0, 3)	0.6227660078332259
  (1, 14)	0.8467889668239188
  (1, 12)	0.3220024178194947
  (1, 3)	0.4233944834119594


## Export test dataset as json file

In [3]:
test_dataset_df = GDPRDataset(config["test_dataset_path"]).dataframe

In [9]:
test_dataset_df.to_json(
    path_or_buf=config["jvm_test_dataset_path"],
    orient="records")

In [10]:
jvm_test_dataset_json = json.load(open(config["jvm_test_dataset_path"]))

In [11]:
print(jvm_test_dataset_json[3]["content"])

  Bitte aktivieren Sie Javascript zur vollständigen Anzeige aller Funktionen dieser Seite.            Software-Entwicklung    Web-Entwicklung    App-Entwicklung      Unser Angebot     Renovierung Ihrer Java / Java EE-Anwendung     Sie stellen die Fragen - wir geben die Antworten     App-Check      Karriere    Softwareentwickler/-in Java    Softwareentwickler/-in Vue.js      Softceed    Referenzen    Über uns                       Wir kreieren & realisieren digitale Produkte  Software-Entwicklung | Unternehmenskritische Anwendungen | Mobile Anwendungen       Experten im Bereich Java  Java | Java EE | Cloud  Mehr erfahren       Praxiserfahren & kompetent  Referenzen  Mehr erfahren       Previous     Next      Softceed GmbH  Wir kreieren und realisieren digitale Produkte: mobile Anwendungen & Webanwendungen  Softceed entwickelt seit über 15 Jahren hochwertige Individualsoftware in den Bereichen Mobile und Web.      Unsere Leistungen  Alles aus einer Hand: Konzeption, Design, Entwicklung, 