## Getting Privacy Text

In [41]:
import re
import string
from urllib.parse import urljoin

import nltk
import spacy
# Load the spaCy "en_core_web_sm" pipeline once; text_preprocessing() below
# uses it for lemmas and for its default stop-word list.
nlp = spacy.load("en_core_web_sm")

### Web scraping

In [42]:
from bs4 import BeautifulSoup
import requests

In [43]:
# Landing page of the site to scrape; its <a> links are searched for
# privacy/legal pages, and site-relative links are resolved against it.
web_url = "https://www.linkedin.com/" 

In [44]:
# Fetch the landing page. A timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status() surfaces HTTP errors instead of
# silently parsing an error page.
response = requests.get(web_url, timeout=10)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [45]:
a_tag = soup.find_all("a") # every <a> element found in the parsed landing page

In [46]:
# Words that mark a link's visible text as pointing to a privacy/legal page.
POLICY_LINK_TERMS = ["privacy", "terms", "conditions", "policy", "legal"]
# Seconds to wait for each policy page before giving up on it.
REQUEST_TIMEOUT_SECONDS = 10

# Collect every matching link exactly once. A link whose text contains several
# of the terms (e.g. "Privacy Policy") must not be queued, and later fetched,
# more than once.
urls = []
for link in a_tag:
    href = link.get("href")
    # Anchors without a target, or pure in-page fragments, lead to no document.
    if not href or href.startswith("#"):
        continue
    link_text = link.text.lower()
    if not any(term in link_text for term in POLICY_LINK_TERMS):
        continue
    # urljoin resolves site-relative links against web_url without producing a
    # double slash, and leaves absolute URLs untouched.
    absolute_url = urljoin(web_url, href)
    if absolute_url not in urls:
        urls.append(absolute_url)

raw_policies = []

for url in urls:
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: skip pages that cannot be downloaded (bad scheme,
        # network failure, HTTP error) and keep going with the rest.
        continue

    page = BeautifulSoup(response.text, 'lxml')
    policies = page.find('html')
    if policies is None:
        # Nothing parseable as an HTML document; no text to keep.
        continue

    raw_policies.append(policies.text)

In [47]:
raw_policies

['\n\n\n\n\n\n\n\n\n\n\n\n\n\n408,000+ Legal jobs in United States (17,457 new)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to main content\n\n\n\nLinkedIn\n \n\n\n\n\n              Legal in United States\n \nExpand search\n\n\n\n\n\n\n\n\n\n\n              Jobs\n            \n\n\n\n              People\n            \n\n\n\n              Learning\n            \n\n\n\n\n\n\n\n\n\n\n\n\nDismiss\n\n\n\n\n\n\nDismiss\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDismiss\n\n\n \n\n\n\n\n\n\n\n\n\n\n\nDismiss\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDismiss\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n          Join now\n        \nSign in\n \n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        \n        Most relevant\n      \n\n\n\n\nSort By filter options\n\n\n\n\n        Most relevant\n    \n\n\n\n\n        Most recent\n    \n\n\n\n\n    Done\n  \n\n \n\n\n\n\n\n\n        \n        Any Time\n      \n\n\n\n\nDate Posted filter options\n\n\n\n\n        Past 24 hours (17,457)\n    \n\n\n\n\n        Past Week (101,645)\n    \

### Text Preprocessing

In [49]:
def remove_punctuation(text):
    """Return ``text`` (coerced to ``str``) lower-cased, with every character
    listed in ``string.punctuation`` removed."""
    strip_table = str.maketrans('', '', string.punctuation)
    return str(text).translate(strip_table).lower()

In [50]:
def remove_nonwords(str_):
    """Replace non-word chunks of ``str_`` with a single space.

    A chunk starts at a character that is neither an ASCII letter nor a space,
    continues over at least one following word character, and then swallows
    any trailing non-letter characters. A lone non-letter character with no
    word character after it is left untouched.
    """
    # Raw string: a backslash-w inside a normal string literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    return re.sub(r"[^A-Za-z ]\w+[^A-Za-z]*", ' ', str_)

In [51]:
# Lemmatization and Removing stop words and non words
def text_preprocessing(text):
    """Normalise ``text`` for similarity matching.

    Steps: strip punctuation and lower-case (remove_punctuation), blank out
    non-word chunks (remove_nonwords), lemmatise with the spaCy pipeline
    ``nlp``, drop stop words, and join the remaining lemmas with single
    spaces.

    Returns the processed text as one string.
    """
    text = remove_punctuation(text)
    text = remove_nonwords(text)
    stop_words = nlp.Defaults.stop_words
    # Lower-case each lemma BEFORE the stop-word test: the stop-word list is
    # lower-case, so a lemma that comes back capitalised would otherwise slip
    # through the filter and only be lower-cased afterwards.
    lemmas = (token.lemma_.lower() for token in nlp(text))
    kept_lemmas = [lemma for lemma in lemmas if lemma not in stop_words]
    return ' '.join(kept_lemmas)

Split run-together words into their parts, e.g. `toSections` -> `to sections`.

In [52]:
import wordninja
from nltk.tokenize import word_tokenize

# Tokenise all scraped pages as one text, then let wordninja break up every
# token longer than five characters; shorter tokens are kept unchanged.
list_of_words = word_tokenize(". ".join(raw_policies))
clean_words = []
for word in list_of_words:
    clean_words.extend(wordninja.split(word) if len(word) > 5 else [word])

In [53]:
# Glue the split tokens back into one space-separated string.
clean_text = " ".join(clean_words)

In [None]:
# Tokenise once up front: re-tokenising the whole text for every "." token
# made this loop quadratic in the length of the text.
policy_tokens = word_tokenize(clean_text)

remove_unwanted_punctuations_words = []
for i, word in enumerate(policy_tokens):
    if word == ".":
        # A period is treated as a sentence end only when the next token does
        # not start with a lower-case (or caseless) character; otherwise it is
        # blanked out. A period that is the very last token has no successor
        # and is kept as is.
        if i + 1 < len(policy_tokens):
            next_token = policy_tokens[i + 1]
            if next_token.strip()[0] == next_token.lower()[0]:
                word = " "
    elif "." in word:
        # Handling abbreviations: when any dot-separated piece after the first
        # has at most one character (e.g. "U.S."), strip the punctuation from
        # the token; remove_punctuation also lower-cases it.
        if any(len(part) <= 1 for part in word.strip().split(".")[1:]):
            word = remove_punctuation(word)

    remove_unwanted_punctuations_words.append(word.strip())

In [None]:
# Rebuild the text from the filtered tokens; this overwrites the clean_text
# value assigned earlier in the notebook.
clean_text = " ".join(remove_unwanted_punctuations_words)

In [None]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Build a Punkt sentence tokenizer from a freshly created PunktParameters
# object (nothing is added to the parameters here), then split the cleaned
# policy text into sentences.
punkt_param = PunktParameters()
tokenizer = PunktSentenceTokenizer(punkt_param)
clean_policy = tokenizer.tokenize(clean_text)

In [None]:
clean_policy

### Find policy sentences similar to known good and bad privacy statements

#### Reading the data

In [22]:
# One preprocessed entry per line of the "good" reference file.
with open("./Data/good_privacy.txt", "r", encoding="utf8") as f:
    good_privacy = [text_preprocessing(line.rstrip().lower()) for line in f]

In [23]:
# One preprocessed entry per line of the "bad" reference file.
with open("./Data/bad_privacy.txt", "r",  encoding="utf8") as f:
    bad_privacy = [text_preprocessing(line.rstrip().lower()) for line in f]

In [24]:
# Single reference list: the good statements followed by the bad ones.
all_privacy = good_privacy + bad_privacy

#### Finding similarity score

In [25]:
# Program to measure the similarity between 
# two sentences using cosine similarity.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [37]:
# A policy sentence is kept when its similarity to any reference statement
# exceeds this value.
SIMILARITY_THRESHOLD = 0.35


def _binary_cosine(tokens_a, tokens_b):
    """Cosine similarity of two token sets viewed as 0/1 vectors.

    With binary vectors the dot product is the size of the intersection and
    each squared norm is the size of the set. Returns 0 when either set is
    empty.
    """
    denominator = float((len(tokens_a) * len(tokens_b)) ** 0.5)
    if denominator == 0:
        return 0
    return len(tokens_a & tokens_b) / denominator


# Tokenise every reference statement once instead of once per policy sentence.
reference_token_sets = [set(word_tokenize(Y)) for Y in all_privacy]

sim_sentences = []
for X_raw in clean_policy:
    X_set = set(word_tokenize(text_preprocessing(X_raw)))
    # any() stops at the first reference above the threshold, and is simply
    # False when there are no reference statements at all.
    if any(
        _binary_cosine(X_set, Y_set) > SIMILARITY_THRESHOLD
        for Y_set in reference_token_sets
    ):
        sim_sentences.append(X_raw.strip())
        

In [38]:
# Keep only the matched sentences that do not contain a question mark.
final_sentences = [sentence for sentence in sim_sentences if "?" not in sentence]

In [40]:
final_sentences

['To provide the Facebook Products , we must process information about you .',
 'The types of information we collect depend on how you use our Products .',
 'Things you and others do and provide Information and content you provide .',
 'We collect the content , communications and other information you provide when you use our Products , including when you sign up for an account , create or share content , and message or communicate with others .',
 'This can include information in or about the content you provide ( like metadata ) , such as the location of a photo or the date a file was created .',
 'Data with special protections : You can choose to provide information in your Facebook profile fields or Life Events about your religious views , political views , who you are `` interested in , `` or your health .',
 'We collect information about the people , Pages , accounts , hash tags and groups you are connected to and how you interact with them across our Products , such as people yo