<a href="https://colab.research.google.com/github/Ethan4thewin/NLP-policy/blob/main/demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.35.0-py3-none-any.whl (7.9 MB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Using cached huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Using cached tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
Collecting safetensors>=0.3.1 (from transformers)
  Using cached safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Using cached huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
Installing collected packages: safetensors, huggingface-hub, tokenizers, transformers
Successfully installed huggingface-hub-0.17.3 safetensors-0.4.0 tokenizers-0.14.1 transformers-4.35.0


In [2]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import KeyedVectors
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import joblib
from transformers import pipeline
import requests

In [3]:
# URL of the Google News Word2Vec model
word2vec_url = 'https://huggingface.co/NathaNn1111/word2vec-google-news-negative-300-bin/resolve/main/GoogleNews-vectors-negative300.bin'
# URL of the svm_model.pkl file
svm_model_url = 'https://huggingface.co/NathaNn1111/word2vec-google-news-negative-300-bin/resolve/main/svm_model.pkl'

# Destination paths for the downloaded files
word2vec_local_path = 'GoogleNews-vectors-negative300.bin'
svm_model_local_path = 'svm_model.pkl'


def download_file(url, local_path, chunk_size=1 << 20, timeout=60):
    """Stream ``url`` to ``local_path`` and return the destination path.

    Args:
        url: HTTP(S) address of the file to fetch.
        local_path: Where the file is stored on disk.
        chunk_size: Number of bytes written per iteration.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``pathlib.Path`` pointing at ``local_path``.

    Raises:
        requests.HTTPError: If the server answers with an error status.

    An existing ``local_path`` is reused, so re-running the cell does not
    download the file again; delete the file to force a fresh download.
    """
    from pathlib import Path  # function-scope import: pathlib is not imported by the notebook

    destination = Path(local_path)
    if destination.exists():
        return destination

    # Write to a temporary name first so an interrupted download is never
    # mistaken for a complete file on the next run.
    partial_path = destination.with_name(destination.name + '.part')
    # stream=True avoids holding the whole response body in memory at once.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTTP error page as a model file.
        response.raise_for_status()
        with open(partial_path, 'wb') as partial_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                partial_file.write(chunk)
    partial_path.replace(destination)
    return destination


# Download the Google News Word2Vec model
download_file(word2vec_url, word2vec_local_path)

# Download the svm_model.pkl file
download_file(svm_model_url, svm_model_local_path)

print('Files downloaded successfully.')

Files downloaded successfully.


In [4]:
# Load SVM model and Word2Vec model from the paths written by the download cell.
# SECURITY NOTE: joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load svm_model.pkl from a source you trust.
model = joblib.load(svm_model_local_path)
embedding_model = KeyedVectors.load_word2vec_format(word2vec_local_path, binary=True)

# Download necessary NLTK datasets for pre-processing.
# Recent NLTK releases look up 'punkt_tab' for word_tokenize while older ones
# use 'punkt', so both are requested.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# Loading necessary tools for pre-processing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Pre-processing
def preprocess_policy(policy):
    """Turn a raw policy text into a list of lemmatised tokens.

    Steps, in order: lower-case the text, delete every character in
    ``string.punctuation``, delete any word that contains a digit (such as
    ``123`` or ``a1b``), tokenise with ``word_tokenize``, drop tokens found in
    ``stop_words`` and lemmatise the rest with ``lemmatizer``.

    Args:
        policy: The policy text as a single string.

    Returns:
        list: The remaining tokens after lemmatisation.

    NOTE(review): presumably these steps must stay in sync with the
    pre-processing used when the SVM was trained -- confirm before changing.
    """
    policy = policy.lower()  # Lower case the datapoint
    policy = re.sub('[%s]' % re.escape(string.punctuation), '', policy)  # Remove special characters
    # Raw string: in a normal literal '\w' and '\d' are invalid escape
    # sequences, which newer Python versions warn about. The pattern is unchanged.
    policy = re.sub(r'\w*\d\w*', '', policy)  # Remove meaningless words such as 123, a1b
    tokens = word_tokenize(policy)
    # No stemming on purpose: given the nature of policy text it can over-stem.
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

# Build one vector per policy by averaging the vectors of its tokens
def get_average_word2vec(preprocessed_datapoint, w2v_model, generate_missing=False, k=300):
    """Average the word vectors of a tokenised datapoint.

    Args:
        preprocessed_datapoint: Sequence of tokens.
        w2v_model: Mapping-like object supporting ``token in w2v_model`` and
            ``w2v_model[token]``.
        generate_missing: When true, a token missing from ``w2v_model`` gets a
            ``np.random.rand(k)`` vector; otherwise it gets ``np.zeros(k)``.
        k: Length of the vectors created for missing tokens and for an empty
            datapoint.

    Returns:
        ``np.zeros(k)`` for an empty datapoint, otherwise the element-wise sum
        of the token vectors divided by the number of tokens.
    """
    if len(preprocessed_datapoint) == 0:
        return np.zeros(k)

    # Factory for the stand-in vector of a token the model does not know
    fallback_vector = np.random.rand if generate_missing else np.zeros

    token_vectors = []
    for token in preprocessed_datapoint:
        if token in w2v_model:
            token_vectors.append(w2v_model[token])
        else:
            token_vectors.append(fallback_vector(k))

    # Sum along the token axis, then divide by the token count
    return np.divide(np.sum(token_vectors, axis=0), len(token_vectors))

def get_word2vec_embeddings(model, data, generate_missing=False):
    """Return one averaged Word2Vec vector per entry of ``data['tokens']``.

    Args:
        model: Word-vector model handed to ``get_average_word2vec``.
        data: Object whose ``'tokens'`` entry supports ``.apply``.
        generate_missing: Forwarded to ``get_average_word2vec``.

    Returns:
        list: The results of ``get_average_word2vec``, in the order of
        ``data['tokens']``.
    """
    def embed_tokens(tokens):
        return get_average_word2vec(tokens, model, generate_missing=generate_missing)

    token_column = data['tokens']
    return list(token_column.apply(embed_tokens))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Sample policy text used to demonstrate the pipeline; its paragraphs are
# separated by blank lines.
document_content = """We collect and use your personal information for as long as reasonably required in order to provide the a Services, which may include:
- developing and testing new or updated products and features without your permission;
- for internal record keeping and for marketing.

When you upload Posted Content, you automatically grant Vervoe an exclusive, royalty-free, perpetual, irrevocable, worldwide licence to use, reproduce, modify, adapt and publish the content in that Posted Content, including a right to sub-licence as necessary for Vervoe to provide and maintain the Technology. You waive any moral rights you may have in the Posted Content.

You agree that your Posted Content is not rude, offensive, racist, or inappropriate, and does not contain material that is contrary to any law applicable to you.

We are committed to ensuring that the information you provide is secure, and as such we use commercially reasonable endeavors to keep personal information collected through the Site secure. Such endeavours include requesting your username and password to verify your identity before you a grand access to your account."""

# Paragraphs the classifier labels 0 are appended here by the loop at the end of this cell.
problematic = []

def classify_policy(policy_text):
    """Return the label the SVM predicts for one policy text.

    The text is tokenised with ``preprocess_policy``, reduced to a single
    vector by ``get_average_word2vec`` using the notebook-level
    ``embedding_model``, and classified by the notebook-level ``model``.
    """
    policy_vector = get_average_word2vec(preprocess_policy(policy_text), embedding_model)
    # predict receives a one-element batch; its first result is the label
    predicted_labels = model.predict([policy_vector])
    return predicted_labels[0]

def split_into_paragraphs(document_content):
    """Split a document into paragraphs, keeping list items with their lead-in.

    The text is split on double line breaks (after ``\\r\\n`` is normalised to
    ``\\n``). A chunk that starts with a list marker ('•', '+' or '-') is
    joined to the preceding paragraph with a single newline instead of
    becoming a paragraph of its own.

    Args:
        document_content: The full document as one string.

    Returns:
        list: Non-empty paragraph strings with surrounding whitespace stripped.
    """
    # Normalize the line breaks
    normalized_content = document_content.replace('\r\n', '\n')
    # Split the document by double line breaks, dropping blank chunks
    stripped_chunks = (chunk.strip() for chunk in normalized_content.split('\n\n'))
    chunks = [chunk for chunk in stripped_chunks if chunk]

    paragraphs = []
    for chunk in chunks:
        # A list item continues the previous paragraph. When there is no
        # previous paragraph (the document opens with a list item) it starts
        # one, so the result never begins with a stray newline.
        if paragraphs and chunk.startswith(('•', '+', '-')):
            paragraphs[-1] += '\n' + chunk
        else:
            paragraphs.append(chunk)

    return paragraphs

# Label value that marks a paragraph as problematic in this notebook.
PROBLEMATIC_LABEL = 0

paragraphs = split_into_paragraphs(document_content)

# Show how the document was segmented.
for paragraph in paragraphs:
    print(paragraph)
    print("-----------")

# Classify each paragraph, print its label and collect the flagged ones.
for paragraph in paragraphs:
    prediction = classify_policy(paragraph)
    print(prediction)
    if prediction == PROBLEMATIC_LABEL:
        problematic.append(paragraph)

We collect and use your personal information for as long as reasonably required in order to provide the a Services, which may include:
- developing and testing new or updated products and features without your permission;
- for internal record keeping and for marketing.
-----------
When you upload Posted Content, you automatically grant Vervoe an exclusive, royalty-free, perpetual, irrevocable, worldwide licence to use, reproduce, modify, adapt and publish the content in that Posted Content, including a right to sub-licence as necessary for Vervoe to provide and maintain the Technology. You waive any moral rights you may have in the Posted Content.
-----------
You agree that your Posted Content is not rude, offensive, racist, or inappropriate, and does not contain material that is contrary to any law applicable to you.
-----------
We are committed to ensuring that the information you provide is secure, and as such we use commercially reasonable endeavors to keep personal information co

In [6]:
# Show the raw policy string; as a bare expression it is displayed in repr form.
document_content

'We collect and use your personal information for as long as reasonably required in order to provide the a Services, which may include:\n- developing and testing new or updated products and features without your permission;\n- for internal record keeping and for marketing.\n\nWhen you upload Posted Content, you automatically grant Vervoe an exclusive, royalty-free, perpetual, irrevocable, worldwide licence to use, reproduce, modify, adapt and publish the content in that Posted Content, including a right to sub-licence as necessary for Vervoe to provide and maintain the Technology. You waive any moral rights you may have in the Posted Content.\n\nYou agree that your Posted Content is not rude, offensive, racist, or inappropriate, and does not contain material that is contrary to any law applicable to you.\n\nWe are committed to ensuring that the information you provide is secure, and as such we use commercially reasonable endeavors to keep personal information collected through the Site

In [7]:
# `display` and `HTML` are public in IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import HTML, display

def highlight_problematic_sentences(term, problematic_phrases):
    """Display ``term`` as HTML with every problematic phrase highlighted.

    Args:
        term: Plain-text document to render.
        problematic_phrases: Sized collection of strings to highlight wherever
            they occur in ``term``.

    Returns:
        None. Nothing is displayed when ``problematic_phrases`` is empty.
    """
    from html import escape  # function-scope import: `html` is not imported elsewhere in the notebook

    if len(problematic_phrases) < 1:
        return

    # Escape before adding markup so that '<', '>' and '&' in the text are
    # shown literally instead of being interpreted as HTML.
    markup = escape(term)
    # Empty phrases are ignored and duplicates collapsed. Longest phrases come
    # first so a phrase contained in another one cannot pre-empt it.
    escaped_phrases = sorted(
        {escape(phrase) for phrase in problematic_phrases if phrase},
        key=len,
        reverse=True,
    )
    if escaped_phrases:
        # One regex pass over the text: the inserted <span> markup is never
        # rescanned, so a phrase cannot match inside an earlier highlight.
        phrase_pattern = re.compile('|'.join(re.escape(phrase) for phrase in escaped_phrases))
        markup = phrase_pattern.sub(
            lambda match: f'<span style="background-color: #a83232">{match.group(0)}</span>',
            markup,
        )
    # Replace newline characters with <br> tags for line breaks
    markup = markup.replace('\n', '<br>')
    display(HTML(markup))

# Render the policy with every paragraph collected in `problematic` highlighted.
highlight_problematic_sentences(document_content, problematic)

In [8]:
# Summarise every paragraph that was collected in `problematic`.
bart = pipeline("summarization", model="facebook/bart-large-cnn", min_length=17, max_length=42)
articles = problematic

# truncation=True asks the tokenizer to clip inputs that exceed the model's
# maximum input length, so an unusually long paragraph can still be summarised.
# Each entry of `summaries` is the raw pipeline result for one article.
summaries = [bart(article, truncation=True) for article in articles]

# Print each summary separately
for article_number, summary in enumerate(summaries, start=1):
    print(f"Summary for Article {article_number}:")
    print(summary[0]['summary_text'])
    print()

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Summary for Article 1:
We collect and use your personal information for as long as reasonably required in order to provide the a Services. Services may include developing and testing new or updated products and features without your permission.

Summary for Article 2:
When you upload Posted Content, you automatically grant Vervoe an exclusive, royalty-free, perpetual, irrevocable, worldwide licence. You waive any moral rights you may have in the Posted

