## Getting Privacy Text

In [1]:
import string
import nltk
import re
import spacy
nlp = spacy.load("en_core_web_sm")

### Web scraping

In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
# Landing page of the website to scrape. The links on this page are searched
# for policy/legal pages, and links starting with "/" are prefixed with this URL.
web_url = "https://www.stackoverflow.com/" 

In [4]:
# Download the landing page. A timeout keeps the cell from hanging forever on
# an unresponsive host, and raise_for_status() stops the notebook early rather
# than silently parsing an HTTP error page.
response = requests.get(web_url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [5]:
a_tag = soup.find_all("a")  # every <a> (anchor) element found in the parsed landing page

In [6]:
from urllib.parse import urljoin

# A link is treated as a policy/legal page when its visible text contains
# any of these terms (case-insensitive).
POLICY_TERMS = ("privacy", "terms", "conditions", "policy", "legal")

urls = []
for anchor in a_tag:
    # .get() avoids a KeyError on anchors that carry no href attribute.
    href = anchor.get("href")
    if not href or href.startswith("#"):
        # Nothing to follow: missing/empty href or a same-page fragment.
        continue
    link_text = anchor.text.lower()
    if not any(term in link_text for term in POLICY_TERMS):
        continue
    # urljoin resolves relative links against the landing page without
    # producing a double slash, and leaves absolute links untouched.
    absolute_url = urljoin(web_url, href)
    if not absolute_url.startswith(("http://", "https://")):
        # Skip mailto:, javascript: and other non-web schemes.
        continue
    # A link such as "Privacy Policy" matches several terms; keep it once so
    # the same page is not downloaded (and its text duplicated) repeatedly.
    if absolute_url not in urls:
        urls.append(absolute_url)

raw_policies = []

for url in urls:
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: an unreachable or failing page is skipped, but
        # programming errors are no longer swallowed by a bare except.
        continue

    # Separate names keep the landing-page `soup`/`source` intact for re-runs.
    policy_soup = BeautifulSoup(response.text, 'lxml')
    policies = policy_soup.find('html')
    if policies is None:
        # No <html> element was found, so there is no text to extract.
        continue

    raw_policies.append(policies.text)

### Text Preprocessing

In [7]:
def remove_punctuation(text):
    """
    Delete every character listed in ``string.punctuation`` from ``text``
    and return the remainder in lower case.

    ``text`` is converted with ``str()`` first, so non-string input is accepted.
    """
    deletion_table = str.maketrans("", "", string.punctuation)
    stripped = str(text).translate(deletion_table)
    return stripped.lower()

In [8]:
def remove_nonwords(str_):
    """
    Replace non-word runs in ``str_`` with a single space.

    A run starts at a character that is neither an ASCII letter nor a space,
    must be followed by at least one word character (``\\w``), and extends over
    any directly following non-letter characters (spaces included). A lone
    digit or symbol with no word character after it is therefore left alone.

    Returns the resulting string; consecutive spaces are not collapsed.
    """
    # Raw string: "\w" in a normal literal is an invalid escape sequence that
    # newer Python versions warn about. The compiled pattern is unchanged.
    return re.sub(r"[^A-Za-z ]\w+[^A-Za-z]*", ' ', str_)

In [9]:
def text_preprocessing(text):
    """
    Normalise ``text`` for the similarity comparison.

    Steps, in order:
      1. ``remove_punctuation`` - strips punctuation and lower-cases.
      2. ``remove_nonwords``    - replaces non-word runs with a space.
      3. Lemmatise with the spaCy pipeline ``nlp``.
      4. Drop whitespace tokens and stop words.

    Returns the remaining lower-cased lemmas joined by single spaces.
    """
    text = remove_punctuation(text)
    text = remove_nonwords(text)
    stop_words = nlp.Defaults.stop_words
    # Lower-case the lemma BEFORE the stop-word test: the stop-word set is
    # lower-case, so a lemma returned with different casing would otherwise
    # pass the filter and only then be lower-cased into a stop word.
    # Whitespace tokens (left behind by the substitutions above) are skipped
    # so they do not show up as empty "lemmas" in the output.
    lemmas = (token.lemma_.lower() for token in nlp(text) if not token.is_space)
    lemma_text = ' '.join(lemma for lemma in lemmas if lemma not in stop_words)
    return lemma_text

In [10]:
import wordninja
from nltk.tokenize import word_tokenize

# Splitting words
# Join all scraped pages with ".", turn newlines into ". " and drop backslashes
# before tokenising.
joined_policies = ".".join(raw_policies).replace("\n", ". ").replace("\\", "")
list_of_words = word_tokenize(joined_policies)

clean_words = []
for word in list_of_words:
    # Tokens longer than 5 characters are handed to wordninja, which may break
    # them into several words; shorter tokens are kept unchanged.
    pieces = wordninja.split(word) if len(word) > 5 else [word]
    clean_words.extend(pieces)

In [11]:
# Re-assemble the (possibly split) tokens into one space-separated string.
clean_text = " ".join(clean_words)

In [13]:
# To handle abbreviations in a word
# Two clean-ups are applied token by token:
#   * a stand-alone "." whose following token does not start with an
#     upper-case letter is blanked, so it is not taken as a sentence end;
#   * a token containing dots where some dot-separated part after the first
#     has at most one character (e.g. "U.S.") is treated as an abbreviation
#     and passed through remove_punctuation (which also lower-cases it).
remove_unwanted_punctuations_words = []
tokenize_text = word_tokenize(clean_text)
last_index = len(tokenize_text) - 1
for i, word in enumerate(tokenize_text):
    if word == ".":
        # Only look ahead when a next token exists; this replaces the former
        # "stop 10 tokens before the end" guard, which dropped the tail of the
        # text. The old `strip()[0] == " "` branch is gone because a stripped
        # token can never start with a space.
        if i < last_index:
            next_word = tokenize_text[i + 1].strip()
            if next_word and next_word[0] == next_word[0].lower():
                word = " "
    elif "." in word:
        # Handling abbreviations
        parts_after_first_dot = word.strip().split(".")[1:]
        if any(len(part) <= 1 for part in parts_after_first_dot):
            word = remove_punctuation(word)

    remove_unwanted_punctuations_words.append(word.strip())

In [14]:
# Rebuild the text from the filtered tokens. This rebinds `clean_text`, replacing
# whatever value it held before this cell ran.
clean_text = " ".join(remove_unwanted_punctuations_words)

In [15]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Get sentences
# The Punkt tokenizer is built from a freshly constructed PunktParameters object;
# nothing (e.g. abbreviations) is registered on it in this notebook.
punkt_param = PunktParameters()
tokenizer = PunktSentenceTokenizer(punkt_param)
# clean_policy holds the sentences that the tokenizer extracts from clean_text.
clean_policy = tokenizer.tokenize(clean_text)

In [16]:
# Re-tokenise every sentence and join its tokens with single spaces.
removed_spaces_sentences = [
    " ".join(word_tokenize(sentence)) for sentence in clean_policy
]

In [17]:
# Keep only sentences that look like regular prose. A sentence is kept when it
#   * contains fewer than 5 upper-case characters,
#   * has more than 15 space-separated tokens,
#   * starts with a character that equals its own upper-case form, and
#   * ends with a full stop.
valid_lines = []

for sentence in removed_spaces_sentences:
    if sum(map(str.isupper, sentence)) >= 5:
        continue
    if len(sentence.split(" ")) <= 15:
        continue
    if sentence[0] != sentence[0].upper():
        continue
    if sentence[-1] != ".":
        continue
    valid_lines.append(sentence)

            

In [18]:
valid_lines

['Sign up or log in to customize your list more stack exchange communities company blog .',
 'Connect and share knowledge within a single location that is structured and easy to search .',
 'Stack Overflow welcomes you to the Network , the largest community of developers in the world , and invites you to participate in the community by sharing knowledge with your peers and colleagues .',
 'Like all communities , we ask that you participate in a manner that respects your fellow community members .',
 'By accessing or using the Services or the Network in any manner , you represent and warrant that you are at least 13 years of age .',
 'By accessing or using the Services or the Network in any manner , you represent and warrant that you are at least 16 years of age .',
 'Where feasible , we may , in our sole discretion , make efforts to in form you about any outages and report on the nature and reason for any outages that may occur in an open and transparent manner , though we are under no

### Similarity Scores
### Find all scraped sentences that are similar to the good and bad privacy examples

#### Reading the data

In [19]:
# Read the "good" reference statements: one per line, stripped of trailing
# whitespace, lower-cased and run through text_preprocessing.
with open("./Data/good_privacy.txt", "r", encoding="utf8") as f:
    good_privacy = [text_preprocessing(line.rstrip().lower()) for line in f]

In [20]:
# Read the "bad" reference statements: one per line, stripped of trailing
# whitespace, lower-cased and run through text_preprocessing.
with open("./Data/bad_privacy.txt", "r", encoding="utf8") as f:
    bad_privacy = [text_preprocessing(line.rstrip().lower()) for line in f]

In [21]:
# Pool both reference lists (good statements first, then bad ones).
all_privacy = good_privacy + bad_privacy

#### Finding similarity score

In [22]:
# Program to measure the similarity between 
# two sentences using cosine similarity.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [23]:
# A scraped sentence is kept when its cosine similarity to at least one
# reference statement exceeds this value.
SIMILARITY_THRESHOLD = 0.4


def _binary_cosine(tokens_a, tokens_b):
    """
    Cosine similarity of two token sets viewed as 0/1 vectors over their union:
    |A & B| / sqrt(|A| * |B|). Returns 0 when either set is empty.
    """
    denominator = float((len(tokens_a) * len(tokens_b)) ** 0.5)
    if denominator == 0:
        return 0
    return len(tokens_a & tokens_b) / denominator


# Tokenise the reference statements once instead of once per scraped sentence.
reference_token_sets = [set(word_tokenize(Y)) for Y in all_privacy]

sim_sentences = []
for X_raw in valid_lines:
    X_set = set(word_tokenize(text_preprocessing(X_raw)))
    # any() stops at the first reference above the threshold. It is also
    # well-defined when there are no reference statements, where the previous
    # code read an unbound or stale `cosine` value.
    is_similar = any(
        _binary_cosine(X_set, Y_set) > SIMILARITY_THRESHOLD
        for Y_set in reference_token_sets
    )
    if is_similar:
        sim_sentences.append(X_raw.strip())
        

In [24]:
# final_sentences is a new list holding the same sentences, in the same order,
# as sim_sentences.
final_sentences = list(sim_sentences)

In [25]:
final_sentences

['You are solely responsible for ensuring that your payment obligations , if any , remain current and not in arrears .',
 'Depending on how you interact with us , we will collect and process your personal information as shown in the table below .',
 'We may collect your individual contact information in order to communicate with you and may use other information that we need in order to manage our account with your business .',
 'Often , you can choose what information to provide , but sometimes we require personal information from you to carry out certain activities such as account verification or registration .',
 'We use cookies , which are small text files that collect and track certain technical information , and similar technologies to help us operate and provide our Services to you .',
 'Information about the systems and device ( s ) you use to access our Services , including , IP address , browser type and version , time zone setting , operating system and platform , device typ