In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
import numpy
import pandas
print(numpy.__version__)
print(pandas.__version__)

2.0.2
2.2.2


In [3]:
data = pd.read_csv('gdpr_violations.csv')
additional_data=pd.read_csv('/content/gdpr_violations_additional.csv')
data=data[['article_violated','summary']]
data=pd.concat([data,additional_data])

In [5]:
data.head()

Unnamed: 0,article_violated,summary
0,Art. 28 GDPR,No data processing agreement has been conclude...
1,Art. 12 GDPR|Art. 13 GDPR|Art. 5 (1) c) GDPR|A...,A controller was sanctioned because he had unl...
2,Art. 5 GDPR|Art. 6 GDPR,The company had unlawfully processed the perso...
3,Art. 31 GDPR,Iberdrola Clientes violated Article 13 of the ...
4,Art. 32 GDPR,Raiffeisen Bank Romania did not observe the ne...


In [6]:
data['article_violated'] = data['article_violated'].str.replace(r'\(1\)', '', regex=True)
data['article_violated'] = data['article_violated'].str.replace(r'\(2\)', '', regex=True)
data['article_violated'] = data['article_violated'].str.replace(r'\(3\)', '', regex=True)
data['article_violated'] = data['article_violated'].str.replace(r'\(4\)', '', regex=True)
data['article_violated'] = data['article_violated'].str.replace(r'\(5\)', '', regex=True)

In [7]:
data['article_violated'].unique()

array(['Art. 28 GDPR',
       'Art. 12 GDPR|Art. 13 GDPR|Art. 5  c) GDPR|Art. 6 GDPR',
       'Art. 5 GDPR|Art. 6 GDPR', 'Art. 31 GDPR', 'Art. 32 GDPR',
       'Art. 32 GDPR|Art. 33 GDPR', 'Art. 5  c) GDPR|Art. 25 GDPR',
       'Art. 21  GDPR|Art. 25 GDPR', 'Art. 5  a) GDPR|Art. 6  a) GDPR',
       'Art. 15 GDPR|Art. 17 GDPR|Art. 21 GDPR', 'Art. 5  c) GDPR',
       'Art. 13 GDPR|Art. 37 GDPR', 'Art. 17 GDPR',
       'Art. 5  c) GDPR|Art. 9 GDPR|Art. 35 GDPR|Art. 36 GDPR',
       'Art. 6 GDPR',
       'Art. 5  GDPR|Art. 5  GDPR|Art. 6  GDPR|Art. 13  c) GDPR|Art. 14  c) GDPR',
       'Art. 25  GDPR|Art. 5  c) GDPR', 'Art. 33 GDPR',
       'Art. 5  c) GDPR|Art. 12 GDPR|Art. 13 GDPR|Art. 32 GDPR',
       'Art. 5  a)|Art. 7  GDPR', 'Art. 5  e) GDPR|Art. 5  GDPR',
       'Art. 5  b) GDPR|Art. 6 GDPR',
       'Art. 6 GDPR|Art. 5  b) GDPR|Art. 13 GDPR',
       'Art. 5 GDPR|Art. 32 GDPR|Art. 33 GDPR',
       'Art. 5  a) GDPR|Art. 5  b) GDPR|Art. 32  GDPR', 'Art. 15 GDPR',
       'Art. 5  a) GDP

In [8]:
article_encoder = LabelEncoder()
data["article_violated_encoded"] = article_encoder.fit_transform(data["article_violated"])

# Risk Assessment Function
def risk_assessment(article):
    high_risk = {
        "Art. 5 GDPR", "Art. 6 GDPR", "Art. 7 GDPR", "Art. 9 GDPR", "Art. 10 GDPR",
        "Art. 17 GDPR", "Art. 22 GDPR", "Art. 33 GDPR", "Art. 44 GDPR", "Art. 45 GDPR",
        "Art. 46 GDPR", "Art. 83 GDPR"
    }

    medium_risk = {
        "Art. 8 GDPR", "Art. 12 GDPR", "Art. 13 GDPR", "Art. 14 GDPR", "Art. 15 GDPR",
        "Art. 18 GDPR", "Art. 19 GDPR", "Art. 20 GDPR", "Art. 23 GDPR", "Art. 35 GDPR"
    }

    low_risk = {
        "Art. 11 GDPR", "Art. 16 GDPR", "Art. 21 GDPR", "Art. 37 GDPR", "Art. 43 GDPR"
    }

    if article in high_risk:
        return "High"
    elif article in medium_risk:
        return "Medium"
    elif article in low_risk:
        return "Low"
    else:
        return "High"  # If the article is not in the predefined list, because it means there are 2 articles involved

# Example usage:
print(risk_assessment("Art. 5 GDPR"))   # High
print(risk_assessment("Art. 12 GDPR"))  # Medium
print(risk_assessment("Art. 16 GDPR"))  # Low
print(risk_assessment("Art. 99 GDPR"))  # Unknown (not categorized)


High
Medium
Low
High


In [9]:
# Apply Risk Assessment
data["risk_level"] = data["article_violated"].apply(risk_assessment)

# Encode `risk_level`
risk_encoder = LabelEncoder()
data["risk_level_encoded"] = risk_encoder.fit_transform(data["risk_level"])

In [10]:
data[data['risk_level']=='Medium'][10:20]

Unnamed: 0,article_violated,summary,article_violated_encoded,risk_level,risk_level_encoded
105,Art. 15 GDPR,The data controller could not provide access t...,15,Medium,2
111,Art. 13 GDPR,The company collected personal data without ac...,10,Medium,2
112,Art. 13 GDPR,The company TODOTECNICOS24H collected personal...,10,Medium,2
125,Art. 13 GDPR,No further information is available.,10,Medium,2
152,Art. 15 GDPR,The hospital unlawfully charged a copying fee ...,15,Medium,2
155,Art. 13 GDPR,The company was fined because it collected per...,10,Medium,2
216,Art. 13 GDPR,The Spanish Data Protection Authority determin...,10,Medium,2
225,Art. 13 GDPR,The website of the company did not contain a p...,10,Medium,2
287,Art. 15 GDPR,Not available.,15,Medium,2
295,Art. 14 GDPR,The private company was fined for having breac...,13,Medium,2


In [11]:
data['risk_level'][429]

'High'

In [12]:
# Feature Engineering
vectorizer = TfidfVectorizer()
X_text = vectorizer.fit_transform(data['summary'])


In [41]:
# Train-Test Split
X_train, X_test, y_train_articles, y_test_articles, y_train_risk, y_test_risk = train_test_split(
    X_text, data["article_violated_encoded"], data["risk_level_encoded"], test_size=0.2, random_state=42
)

# Train Model for Predicting `article_violated`
article_model = RandomForestClassifier(n_estimators=200, random_state=42)
article_model.fit(X_train, y_train_articles)


In [42]:
# Predictions for `article_violated`
y_pred_articles = article_model.predict(X_test)

# Convert predicted articles back to original labels
y_pred_articles_real = article_encoder.inverse_transform(y_pred_articles)
print("Predicted Violations:", y_pred_articles_real)


Predicted Violations: ['Art. 6 GDPR' 'Art. 32 GDPR' 'Art. 6 GDPR' 'Art. 13 GDPR'
 'Art. 5  c) GDPR' 'Art. 5  e) GDPR|Art. 5  GDPR' 'Art. 5  a),  GDPR'
 'Art. 6 GDPR' 'Art. 15 GDPR' 'Art. 5  c) GDPR'
 'Art. 5  a) GDPR|Art. 6  a) GDPR' 'Art. 6 GDPR' 'Art. 6 GDPR|Art. 5 GDPR'
 'Art.14 GDPR' 'Art. 58 GDPR' 'Art. 6 GDPR|Art. 9 GDPR' 'Art. 11 GDPR'
 'Art. 5  a)|Art. 7  GDPR' 'Art. 5 GDPR|Art. 25 GDPR' 'Art. 5  c) GDPR'
 'Art. 46 GDPR' 'Art. 44 GDPR' 'Art. 13 GDPR' 'Art. 5 GDPR'
 'Art. 5  f) GDPR|Art. 32 GDPR' 'Art. 46 GDPR' 'Art. 8 GDPR'
 'Art. 5 GDPR|Art. 6 GDPR|Art. 21 GDPR' 'Art. 32 GDPR' 'Art. 13 GDPR'
 'Art. 5  a) GDPR|Art. 6 GDPR' 'Art. 5 GDPR|Art. 6 GDPR' 'Art. 32 GDPR'
 'Art. 6  GDPR|Art. 25  GDPR' 'Art. 5 GDPR|Art. 6 GDPR' 'Art. 6 GDPR'
 'Art. 32 GDPR' 'Art. 15 GDPR' 'Art. 5  c) GDPR' 'Art. 6 GDPR'
 'Art. 6 GDPR' 'Art. 5  a) GDPR|Art. 6 GDPR' 'Art. 22 GDPR'
 'Art. 6 GDPR|Art. 9 GDPR' 'Art. 32 GDPR' 'Art. 6 GDPR'
 'Art. 5 GDPR|Art. 6 GDPR' 'Art. 5  c) GDPR' 'Art. 15 GDPR' 'Art. 32 GD

In [49]:
from sklearn.metrics import classification_report, accuracy_score,f1_score
accuracy_score(y_test_articles,y_pred_articles)
accuracy_score(y_train_articles,article_model.predict(X_train))

0.9925187032418953

In [50]:
# Train Model for Predicting `risk_level`
risk_model = RandomForestClassifier(n_estimators=100, random_state=42)
risk_model.fit(X_train, y_train_risk)  # Fix: Use X_train instead of labels

# Predict `risk_level`
y_pred_risk = risk_model.predict(X_test)

# Convert predicted risk levels back to original labels
y_pred_risk_real = risk_encoder.inverse_transform(y_pred_risk)
print("Predicted Risk Levels:", y_pred_risk_real)


Predicted Risk Levels: ['High' 'High' 'High' 'Medium' 'High' 'High' 'High' 'High' 'Medium' 'High'
 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High'
 'High' 'High' 'Medium' 'High' 'High' 'High' 'High' 'High' 'High' 'Medium'
 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High'
 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'Medium' 'High'
 'Medium' 'High' 'High' 'Medium' 'High' 'High' 'High' 'High' 'High' 'High'
 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High'
 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'Low' 'High'
 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High' 'High'
 'High' 'High' 'High' 'High' 'High' 'Medium' 'High' 'High' 'High' 'High'
 'High']


In [51]:
accuracy_score(y_test_risk,y_pred_risk)

0.900990099009901