<a href="https://colab.research.google.com/github/EmreYY20/ToS-Simplification/blob/main/ToS_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Terms of Service Simplification**

# 1. Data preparation

In the following, the Terms of Service of different services are collected from https://tosdr.org/.

Then the data will be processed and relevant data will be extracted.

## 1.1. Scraping the data

In [1]:
import os
import json
import requests
import concurrent.futures

In [2]:
# HTTP headers attached to the requests made by the download helpers.
# The User-Agent value mimics a desktop browser; the literal is split into
# adjacent string pieces only to keep the lines short.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299'
    ),
}

In [3]:
# Define a function to retrieve service IDs
def get_ids():
    """Return the IDs of all listed services whose ToS file is not on disk yet.

    Requests the "all services" listing from the ToSDR API and keeps every
    service ID for which ``data/raw_data/tos_<id>.json`` does not exist.

    Returns:
        list: IDs that still have to be downloaded, in the order the API
        lists them.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        KeyError: if the response JSON lacks the expected
            ``parameters`` / ``services`` / ``id`` keys.
    """
    id_url = "https://api.tosdr.org/all-services/v1/"

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(id_url, headers=headers, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    services = response.json()['parameters']['services']

    # Keep only the services that have no JSON file in the raw data folder yet
    return [
        service['id']
        for service in services
        if not os.path.isfile(
            os.path.join("data", "raw_data", f"tos_{service['id']}.json")
        )
    ]

In [4]:
# Define a function to download ToS data by ID
def download_tos(tos_id):
    """Download the ToS record of one service and store it as a JSON file.

    The response body is parsed as JSON and written unchanged to
    ``data/raw_data/tos_<tos_id>.json``. The function is called from several
    worker threads at once, so directory creation must tolerate the folder
    being created by another thread in the meantime.

    Args:
        tos_id: Service ID as returned by the service listing.

    Returns:
        The given ``tos_id`` when the file was written, otherwise ``None``
        (the API reported error code 193, or any exception occurred; the
        exception is printed, not raised).
    """
    url = f'https://api.tosdr.org/rest-service/v2/{tos_id}.json'

    # Create a session for making HTTP requests
    with requests.Session() as session:
        try:
            # The timeout keeps a stalled connection from blocking a worker
            # thread forever.
            r = session.get(url, headers=headers, timeout=30)
            j = r.json()

            # Skip responses carrying API error code 193
            # NOTE(review): the meaning of code 193 is not visible here — confirm
            if j.get('error') == 193:
                return None

            directory = 'data/raw_data'
            # exist_ok=True avoids the check-then-create race between threads:
            # previously the losing thread hit FileExistsError and its
            # download was silently dropped by the except clause below.
            os.makedirs(directory, exist_ok=True)

            # Save the ToS data in a JSON file
            with open(f"{directory}/tos_{tos_id}.json", 'w') as outfile:
                json.dump(j, outfile)
            return tos_id
        except Exception as e:
            # Best effort: report the failure and let the other downloads go on
            print(f"Error downloading ToS {tos_id}: {e}")
            return None

In [None]:
# Entry point: download every ToS record that is not on disk yet
if __name__ == "__main__":
    ids = get_ids()

    # Run the downloads on a pool of ten worker threads; map() yields the
    # outcomes in the same order as the input IDs
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results = [outcome for outcome in executor.map(download_tos, ids)]

    # A None outcome marks a skipped or failed download; count the rest
    downloaded_tos = list(filter(lambda outcome: outcome is not None, results))
    print(f"Downloaded {len(downloaded_tos)} ToS.")

## 1.2. Processing the data

In [None]:
!pip install spacy_language_detection

In [8]:
import re
import pandas as pd
import spacy
from spacy.language import Language
from spacy_language_detection import LanguageDetector

In [10]:
# Directory that holds the raw per-service JSON files
path_to_json = 'data/raw_data/'

# Collect the names of all JSON files in that directory.
# os.listdir() returns names in arbitrary order, so they are sorted to make
# the order of the loaded records (and of the exported dataset rows) the same
# on every run.
json_files = sorted(
    file_name
    for file_name in os.listdir(path_to_json)
    if file_name.endswith('.json')
)

In [11]:
# Factory callback that builds the language detector pipeline component
def get_lang_detector(nlp, name):
    """Create a ``LanguageDetector`` with a fixed seed.

    Args:
        nlp: Accepted for the factory signature; not used here.
        name: Accepted for the factory signature; not used here.

    Returns:
        A new ``LanguageDetector`` instance created with ``seed=42``.
    """
    detector = LanguageDetector(seed=42)  # same seed on every call, for consistency
    return detector

In [12]:
# Parsed content of every downloaded terms-of-service file, one entry per file
data = []

# Read the files in the order given by 'json_files' and keep the parsed JSON
for json_file in json_files:
    file_path = path_to_json + json_file
    with open(file_path, 'r') as raw_file:
        record = json.load(raw_file)
    data.append(record)

In [13]:
def _group_quotes_by_document(points):
    """Group the usable points of one service by their 'document_id'.

    A point is usable when both its 'quoteStart' and its 'quoteText' are not
    None. Returns a dict mapping each document ID to the list of its points,
    in the order they appear in ``points``.
    """
    grouped = {}
    for point in points:
        if point['quoteStart'] is not None and point['quoteText'] is not None:
            grouped.setdefault(point['document_id'], []).append(point)
    return grouped


def _build_text_and_summary(quotes):
    """Turn the points of one document into a (plain_text, summary) pair.

    The points are ordered by 'quoteStart'. The plain text is every
    'quoteText' followed by a space; the summary is every 'title' followed by
    ". ". HTML-like tags and newline characters are then removed from the
    plain text only.
    """
    ordered = sorted(quotes, key=lambda quote: quote['quoteStart'])

    plain_text = "".join(quote['quoteText'] + " " for quote in ordered)
    summary = "".join(quote['title'] + ". " for quote in ordered)

    # Strip anything that looks like an HTML tag, then flatten newlines
    plain_text = re.sub(r"<[^>]*>", '', plain_text)
    plain_text = re.sub(r"\n", ' ', plain_text)
    return plain_text, summary


# Keep only the terms of services that are flagged as comprehensively reviewed
reviewed_terms = [
    service
    for service in data
    if service['parameters']['is_comprehensively_reviewed'] is True
]

# Rows of the final dataset: [plain_text, summary]
final_data = []

# Load the spaCy model and add the custom language detector as last pipeline step
nlp_model = spacy.load('en_core_web_sm')
Language.factory("language_detector", func=get_lang_detector)
nlp_model.add_pipe('language_detector', last=True)

# Build one (plain_text, summary) pair per document of every reviewed service.
# The spaCy result gets its own name ('parsed') so that it never overwrites
# the service currently being iterated over.
for service in reviewed_terms:
    quotes_by_document = _group_quotes_by_document(service['parameters']['points'])

    for quotes in quotes_by_document.values():
        plain_text, summary = _build_text_and_summary(quotes)

        # Language check using the custom language detector
        parsed = nlp_model(plain_text)

        # Keep the pair only if the detected language is English ('en')
        if parsed._.language['language'] == 'en':
            final_data.append([plain_text, summary])

# Create a Pandas DataFrame from the processed data
df = pd.DataFrame(final_data, columns=['plain_text', 'summary'])

# Export the DataFrame to a JSON file in 'records' format with lines
df.to_json('data/dataset.json', orient='records', lines=True)

# 2. Understanding the data

# 3. Fine-Tuning

# 4. Evaluation