#### Load Packages

In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup

#### Load Annotated Data Practices

* Read in the annotation CSV files that show the output of the annotators

In [2]:
tmp_dir = '../opp-115/annotations/' # Point to the directory

file_names = os.listdir(tmp_dir)    # Grab the file names

# Read each file in as a list of DataFrames, then combine into master DataFrame
annotated_data_practices = pd.concat(
    [
        pd.read_csv(
            tmp_dir + file_name,
            names=[
                'annotation_id',
                'batch_id',
                'annotator_id',
                'policy_id',
                'segment_id',
                'category_name',
                'attribute_value_pairs',
                'date',
                'policy_url'
            ],
            index_col=False
        ).assign(source=file_name)
        for file_name in file_names
    ]
); del tmp_dir

#### Parse the Attributes to Flag Segments

##### 1. Segments Allowing Comapnies to Collect and Use Your Data

Find the annotations marking the segments stating that the company will collect and use personal data on the user for the purposes of:
* Advertising, 
* Marketing, 
* Future mergers and acquisitions
* Any other miscellaneous or vague purposes
    
I decided to omit data collection for mandatory or "good" purposes for the user:

* To facilitate basic and additional services/features
* For analytics and research
* To fulfill a legal requirement
* For personalizing the user experience
* For service operation and security

In [3]:
first_party_col_use = annotated_data_practices.query('category_name=="First Party Collection/Use"')

annotated_data_practices = annotated_data_practices.merge(
    pd.DataFrame({
        'annotation_id': first_party_col_use.annotation_id, 
        'first_party_ads': [
            (eval(x)['Does/Does Not']['value'] == 'Does') &
            (eval(x)['Purpose']['value'] in ['Advertising', 'Marketing', 'Merger/Acquisition', 'Other', 'Unspecified'])
            for x in first_party_col_use.attribute_value_pairs
        ]
    }),
    how='outer', on='annotation_id'
); del first_party_col_use

annotated_data_practices.first_party_ads = annotated_data_practices.first_party_ads.fillna(False).astype('int')

annotated_data_practices.groupby('first_party_ads').size()


first_party_ads
0    19311
1     3883
dtype: int64

In [4]:
annotated_data_practices

Unnamed: 0,annotation_id,batch_id,annotator_id,policy_id,segment_id,category_name,attribute_value_pairs,date,policy_url,source,first_party_ads
0,20137,test_category_labeling_highlight_fordham_aaaaa,121,3905,0,Other,"{""Other Type"": {""selectedText"": ""Sci-News.com ...",Not specified,http://www.sci-news.com/privacy-policy.html,1017_sci-news.com.csv,0
1,20324,test_category_labeling_highlight_fordham_aaaaa,121,3905,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati...",Not specified,http://www.sci-news.com/privacy-policy.html,1017_sci-news.com.csv,0
2,20325,test_category_labeling_highlight_fordham_aaaaa,121,3905,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati...",Not specified,http://www.sci-news.com/privacy-policy.html,1017_sci-news.com.csv,0
3,20326,test_category_labeling_highlight_fordham_aaaaa,121,3905,2,Data Retention,"{""Personal Information Type"": {""selectedText"":...",Not specified,http://www.sci-news.com/privacy-policy.html,1017_sci-news.com.csv,0
4,20327,test_category_labeling_highlight_fordham_aaaaa,121,3905,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele...",Not specified,http://www.sci-news.com/privacy-policy.html,1017_sci-news.com.csv,1
...,...,...,...,...,...,...,...,...,...,...,...
23189,5977,test_category_labeling_highlight,88,3722,5,Other,"{""Other Type"": {""endIndexInSegment"": 220, ""sta...",Not specified,http://mohegansun.com/about-mohegan-sun/privac...,995_mohegansun.com.csv,0
23190,6753,test_category_labeling_highlight,82,3722,5,Other,"{""Other Type"": {""endIndexInSegment"": 220, ""sta...",Not specified,http://mohegansun.com/about-mohegan-sun/privac...,995_mohegansun.com.csv,0
23191,4949,test_category_labeling_highlight,84,3722,6,Other,"{""Other Type"": {""endIndexInSegment"": 190, ""sta...",Not specified,http://mohegansun.com/about-mohegan-sun/privac...,995_mohegansun.com.csv,0
23192,5976,test_category_labeling_highlight,88,3722,6,Other,"{""Other Type"": {""endIndexInSegment"": 151, ""sta...",Not specified,http://mohegansun.com/about-mohegan-sun/privac...,995_mohegansun.com.csv,0


##### 2. Segments Allowing Companies to Share Your Data with Third Parties

Find the annotations marking the segments stating that the company will share personal data with a third party, using the same purposes as above.

In [5]:
third_party_sharing = annotated_data_practices.query('category_name=="Third Party Sharing/Collection"')

annotated_data_practices = annotated_data_practices.merge(
    pd.DataFrame({
        'annotation_id': third_party_sharing.annotation_id, 
        'bad_sharing': [
            (eval(x)['Does/Does Not']['value'] == 'Does') &
            (eval(x)['Purpose']['value'] in ['Advertising', 'Marketing', 'Merger/Acquisition', 'Other', 'Unspecified'])
            for x in third_party_sharing.attribute_value_pairs
        ]
    }),
    how='outer', on='annotation_id'
); del third_party_sharing

annotated_data_practices.bad_sharing = annotated_data_practices.bad_sharing.fillna(False).astype('int')

annotated_data_practices.groupby('bad_sharing').size()

bad_sharing
0    20607
1     2587
dtype: int64

##### 3. Segments Notifying User of Choices Available to Them Re: First Party Data Collection and Use

Find the annotations marking the segments notifying the user about choices or controls available to them about first party collection and use of their data (i.e., by the company itself)

In [6]:
user_choice = annotated_data_practices.query('category_name=="User Choice/Control"')

annotated_data_practices = annotated_data_practices.merge(
    pd.DataFrame({
        'annotation_id': user_choice.annotation_id, 
        'first_party_choice': [
            (eval(x)['Choice Scope']['value'] in ['First party collection', 'First party use'])
            for x in user_choice.attribute_value_pairs
        ]
    }),
    how='outer', on='annotation_id'
); del user_choice

annotated_data_practices.first_party_choice = annotated_data_practices.first_party_choice.fillna(False).astype('int')

annotated_data_practices.groupby('first_party_choice').size()


first_party_choice
0    22263
1      931
dtype: int64

##### 4. Segments Notifying User of Choices Available to Them Re: Third Party Data Sharing

Find the annotations marking the segments notifying the user about choices or controls available to them about third party sharing of their data (e.g., shared with another another company, an affiliate, other users, the public)

In [7]:
user_choice = annotated_data_practices.query('category_name=="User Choice/Control"')

annotated_data_practices = annotated_data_practices.merge(
    pd.DataFrame({
        'annotation_id': user_choice.annotation_id, 
        'third_party_choice': [
            (eval(x)['Choice Scope']['value'] in ['Third party sharing/collection', 'Third party use'])
            for x in user_choice.attribute_value_pairs
        ]
    }),
    how='outer', on='annotation_id'
); del user_choice

annotated_data_practices.third_party_choice = annotated_data_practices.third_party_choice.fillna(False).astype('int')

annotated_data_practices.groupby('third_party_choice').size()

third_party_choice
0    22756
1      438
dtype: int64

#### Aggregate the Data Using Majority Vote (i.e., Agreement between at least 2 Annotators)

Each privacy policy was reviewed by 3 annotators, and the paper suggests treating the agreement of at least 2 as indicating the validity of a flag being applied to a particular segment

In [8]:
consensus = (
    annotated_data_practices[['policy_id', 'segment_id', 'annotator_id', 'first_party_ads', 'bad_sharing', 'first_party_choice', 'third_party_choice']]
    .drop_duplicates()
    .drop('annotator_id', axis=1)
    .groupby(['policy_id', 'segment_id'])
    .sum()
    .apply(lambda var: (var >= 2).astype('int'))
    .reset_index()
)

#### Parse the Text of the Policies

The actual text of the policies have been sanitized, given delimiters between segments, and saved locally as HTML files. Here, I read them in and parse them into its predefined segments.

In [9]:
tmp_dir = '../opp-115/sanitized_policies/'

file_names = os.listdir(tmp_dir)

segment_texts = [None] * len(file_names)

for i in range(len(file_names)):
    with open(tmp_dir + file_names[i], 'r') as html:
        segment_texts[i] = pd.DataFrame(BeautifulSoup(html, 'html.parser').get_text().split('|||'), columns=['segment_text']).reset_index(drop=False, names='segment_id').assign(source = file_names[i].replace('.html', ''))

segment_texts = pd.concat(segment_texts).merge(
    annotated_data_practices[['policy_id', 'source']].drop_duplicates().assign(source = lambda df: df.source.str.replace('.csv', '')), 
    on='source'
).drop(['source'], axis=1)[['policy_id', 'segment_id', 'segment_text']].sort_values(['policy_id', 'segment_id'])

  annotated_data_practices[['policy_id', 'source']].drop_duplicates().assign(source = lambda df: df.source.str.replace('.csv', '')),


Let's clean the data by stripping any whitespace at the start or end of string and converting everything to lowercase

In [10]:
segment_texts.segment_text = segment_texts.segment_text.str.strip()

In [11]:
segment_texts

Unnamed: 0,policy_id,segment_id,segment_text
1955,3621,0,"reddit privacy policy effective Apr 14, 20..."
1956,3621,1,here's how we manage and use your data. We t...
1957,3621,2,Please read this Privacy Policy carefully. You...
1958,3621,3,about this policy reddit is an open platfor...
1959,3621,4,We do take efforts to make sure the platform k...
...,...,...,...
3481,3908,6,Review Your Child's Data and Security Sally Ri...
3482,3908,7,To (a) initiate a review of your son's or daug...
3483,3908,8,To protect your daughter's or son's privacy an...
3484,3908,9,Sally Ride Science has taken steps to ensure t...


#### Merge the Annotation Flags with the Segments

In [12]:
data = segment_texts.merge(consensus)

In [13]:
data

Unnamed: 0,policy_id,segment_id,segment_text,first_party_ads,bad_sharing,first_party_choice,third_party_choice
0,3621,0,"reddit privacy policy effective Apr 14, 20...",0,0,0,0
1,3621,1,here's how we manage and use your data. We t...,0,0,0,0
2,3621,2,Please read this Privacy Policy carefully. You...,0,0,0,0
3,3621,3,about this policy reddit is an open platfor...,0,0,0,0
4,3621,4,We do take efforts to make sure the platform k...,0,0,0,0
...,...,...,...,...,...,...,...
3787,3908,6,Review Your Child's Data and Security Sally Ri...,0,0,1,0
3788,3908,7,To (a) initiate a review of your son's or daug...,0,0,1,0
3789,3908,8,To protect your daughter's or son's privacy an...,0,0,0,0
3790,3908,9,Sally Ride Science has taken steps to ensure t...,0,0,0,0


#### Set Baseline Predictions Based on Regex

In [14]:
baseline_regex = [
    '(we .* (collect|keep|use))|(information about you)|(give us information)',
    'third|share',
    'opt',
    'opt'
]

In [15]:
baseline = {
    'first_party_data_output': data.segment_text.str.contains(baseline_regex[0], case=False).astype('int'),
    'third_party_sharing_output': data.segment_text.str.contains(baseline_regex[1], case=False).astype('int'),
    'first_party_choice_output': data.segment_text.str.contains(baseline_regex[2], case=False).astype('int'),
    'third_party_choice_output': data.segment_text.str.contains(baseline_regex[3], case=False).astype('int'),
}

baseline = pd.DataFrame(baseline)

  'first_party_data_output': data.segment_text.str.contains(baseline_regex[0], case=False).astype('int'),


#### Export the Data

In [16]:
data.to_csv('../data.csv', index=False)
baseline.to_csv('../baseline.csv', index=False)