# SVM Training set
To train an SVM classifier that labels text as either GDPR content or non-GDPR content, we need a training set of GDPR texts and non-GDPR texts. In this notebook, such a training set is constructed from the dev_dataset (the GDPRDataset).

In [8]:
import json
from pathlib import Path
import GDPRDataset
import pandas as pd

ImportError: attempted relative import with no known parent package

# Load Config

In [2]:
# Read the notebook's JSON configuration from the current working directory.
# The context manager guarantees the file handle is closed even if parsing fails.
with open(f"{str(Path.cwd())}/create_svm_training_set_from_dev_dataset_config.json") as config_file:
    config = json.load(config_file)

# Load Train Dataset

In [7]:
# Build the dataset wrapper from the configured path, then keep its DataFrame view.
train_dataset = GDPRDataset.GDPRDataset(config['train_dataset_path'])
train_dataset_df = train_dataset.dataframe

NameError: name 'GDPRDataset' is not defined

# Get GDPR texts

In [4]:
def _count_space_separated_words(text):
    """Return the number of pieces obtained by splitting str(text) on single spaces."""
    return len(str(text).split(" "))


# Keep only the 'content_removed' text of rows flagged as containing GDPR content.
is_gdpr_row = train_dataset_df['contains_GDPR'] == True
GDPR_texts_df = train_dataset_df.loc[is_gdpr_row, ['content_removed']].copy()

# The word count is used later to trim the negative samples to a similar length.
GDPR_texts_df['text_word_count'] = GDPR_texts_df['content_removed'].apply(_count_space_separated_words)
GDPR_texts_df = GDPR_texts_df.reset_index()

## Get texts for negative samples
For each GDPR text, get a text of similar length from non-GDPR websites.

In [5]:
# Collect the raw 'content' of every row that is not flagged as GDPR content.
is_not_gdpr_row = train_dataset_df['contains_GDPR'] == False
no_GDPR_texts_df = train_dataset_df.loc[is_not_gdpr_row, ['content']].reset_index()

In [6]:
# Trim each non-GDPR text to the word count of the GDPR text at the same position.
# zip stops at the shorter of the two frames, so surplus rows on either side are ignored.
no_GDPR_texts = [
    " ".join(no_GDPR_text.split(" ")[:text_len])
    for text_len, no_GDPR_text in zip(
        GDPR_texts_df['text_word_count'], no_GDPR_texts_df['content']
    )
]

# NOTE: this rebinds no_GDPR_texts_df to a frame that only has 'stripped_text',
# so the previous cell must be re-run before this cell can be executed again.
no_GDPR_texts_df = pd.DataFrame(data={'stripped_text': no_GDPR_texts})

#### Store as first version training dataset
The negative texts are sampled quite randomly. Choosing harder negative samples might improve the SVM's performance. For now, store these plain texts as my first SVM training set.

In [7]:
# Positive (GDPR) samples come first, negative samples second; labels follow the same order.
positive_texts = GDPR_texts_df['content_removed'].tolist()
negative_texts = no_GDPR_texts_df['stripped_text'].tolist()
tfifd_svm_dataset = positive_texts + negative_texts

labels = [1] * len(positive_texts) + [0] * len(negative_texts)

tfifd_svm_dataset_df = pd.DataFrame(data={'text': tfifd_svm_dataset, 'is_GDPR': labels})

# Persist the first version of the training set in the configured output directory.
svm_training_set_path = f"{config['svm_training_sets_path']}/svm_training_set_v1.json"
tfifd_svm_dataset_df.to_json(svm_training_set_path)

# Test SVM

In [5]:
import nltk

# Fetch the NLTK stop-word corpus; the German stop-word list is read from it below.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/erik/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
import pandas as pd
from POC.my_utils.text_preprocessing import clean_german_text, clean_german_texts
import sklearn.svm as svm
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Load the stored SVM training set (a JSON object with 'text' and 'is_GDPR' entries).
# NOTE(review): absolute, machine-specific path — consider deriving it from the config instead.
# The context manager closes the file even if JSON parsing raises.
with open("/home/erik/Desktop/WiSe2022_23/IR-Project/dsgvo_handler/POC/data/svm_training_sets/svm_training_set_v1.json", 'r') as train_set_file:
    train_set = json.load(train_set_file)

In [15]:
# Fit the TF-IDF vocabulary on the cleaned training texts, ignoring German stop words.
documents = [*train_set['text'].values()]
print(len(documents))

german_stop_words = stopwords.words('german')
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=german_stop_words)

fit_tfidf_documents = clean_german_texts(documents)
# Last expression: displays the fitted document-term matrix as the cell output.
tfidf_vectorizer.fit_transform(fit_tfidf_documents)

250


<250x8307 sparse matrix of type '<class 'numpy.float64'>'
	with 27637 stored elements in Compressed Sparse Row format>

In [14]:
# Train the SVM on TF-IDF vectors of the cleaned training texts.
train_texts = list(train_set['text'].values())
train_texts = clean_german_texts(train_texts)
train_text_vectors = tfidf_vectorizer.transform(train_texts)
train_labels = list(train_set['is_GDPR'].values())

# probability=True makes SVC calibrate probabilities with an internal, randomly shuffled
# cross-validation; a fixed seed keeps repeated runs of this cell reproducible.
SVM_RANDOM_STATE = 42

# `degree` is not passed: it only applies to the 'poly' kernel and is ignored by 'rbf'.
SVM = svm.SVC(C=1.0, kernel='rbf', gamma='auto', probability=True, random_state=SVM_RANDOM_STATE)
SVM.fit(train_text_vectors, train_labels)

In [16]:
# Predict labels for the same vectors the SVM was trained on.
# NOTE(review): the original note "train acc" suggests these predictions feed a
# training-accuracy computation — confirm against the rest of the cell.
Y = SVM.predict(train_text_vectors)