In [None]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup

In [None]:
# Funtions to clean the data
def remove_html_tags(text):
    """Remove HTML tags using BeautifulSoup."""
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text()
    return cleaned_text
def remove_text_in_brackets(text):
    """Remove text within angle brackets."""
    return re.sub(r'<[^>]*>', '', text)

def remove_urls_and_emails(text):
    """Remove URLs and email addresses."""
    #remove urls
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove emails
    text = re.sub(r'\S*@\S*\s?', '', text)
    return text

def remove_control_characters(text):
    """Remove non-printable and control characters."""
    # Replace common control characters with space
    text = re.sub(r'[\r\n\t\f\v]', ' ', text)
    # Remove remaining control characters
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)
    return text

def remove_special_characters(text, remove_digits=False):
    """Remove special characters, optionally including digits."""
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

def preprocess_text(text):
    """Apply all preprocessing steps to the text."""
    text = remove_text_in_brackets(text)
    text = remove_urls_and_emails(text)
    text = remove_control_characters(text)
    text = remove_special_characters(text)
    return text

In [None]:
datapath='gs://legal-terms-data/tosdr-data/clean/final_output2.csv'

## 1. Read in the data and check

In [None]:
df= pd.read_csv(datapath)
print(df.shape)
df.columns = df.columns.str.lower().str.replace(' ', '_')

(34623, 15)


In [None]:
df.head()

Unnamed: 0,unnamed:_0,topic,topic_link,case,case_link,classification,weight,service,title,title_link,source_name,source_link,rating,document_text_shortened,status
0,0,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,Snapchat,Security,https://edit.tosdr.org/points/15613,,,E,,NOT APPROVED
1,1,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,Facebook,This service takes credit for your content,https://edit.tosdr.org/points/8676,,,E,,NOT APPROVED
2,2,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,AliExpress,polkadot magic belgian chocolate bars,https://edit.tosdr.org/points/37011,,,D,,NOT APPROVED
3,3,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,IFTTT,This service takes credit for your content,https://edit.tosdr.org/points/8066,,,C,,NOT APPROVED
4,4,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,TwitPic,Twitpic takes credit for your content,https://edit.tosdr.org/points/1079,,,,,NOT APPROVED


Identify services where we have no texts

In [None]:
no_text= df.groupby('service')['document_text_shortened'].apply(lambda x:x.isna().any()).reset_index()

In [None]:
no_text.document_text_shortened.value_counts()

Unnamed: 0_level_0,count
document_text_shortened,Unnamed: 1_level_1
False,1576
True,876


In [None]:
print("shape of df: ", df.shape)

shape of df:  (34623, 15)


In [None]:
#drop the nulls
new_df= df.dropna(subset=["document_text_shortened"], how= 'any').reset_index()

In [None]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30073 entries, 0 to 30072
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    30073 non-null  int64  
 1   unnamed:_0               30073 non-null  int64  
 2   topic                    30073 non-null  object 
 3   topic_link               30073 non-null  object 
 4   case                     30073 non-null  object 
 5   case_link                30073 non-null  object 
 6   classification           30073 non-null  object 
 7   weight                   30024 non-null  float64
 8   service                  30073 non-null  object 
 9   title                    30073 non-null  object 
 10  title_link               30073 non-null  object 
 11  source_name              30073 non-null  object 
 12  source_link              30073 non-null  object 
 13  rating                   28348 non-null  object 
 14  document_text_shortene

In [None]:
new_df.head()

Unnamed: 0,index,unnamed:_0,topic,topic_link,case,case_link,classification,weight,service,title,title_link,source_name,source_link,rating,document_text_shortened,status
0,6,6,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,ShapeShift,"""ShapeShift will be free to use any ideas, con...",https://edit.tosdr.org/points/6611,Terms of Service,https://edit.tosdr.org/services/1548/annotate#...,E,""">Terms of Service</a></h3>\n </div>\n ...",NOT APPROVED
1,10,10,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,FaceApp,"""You grant FaceApp consent to use the User Con...",https://edit.tosdr.org/points/7381,Terms of Use,https://edit.tosdr.org/services/1713/annotate#...,D,""">Terms of Use</a></h3>\n </div>\n ...",NOT APPROVED
2,20,20,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,Steam,"""<p>If you provide Valve with any feedback or ...",https://edit.tosdr.org/points/13352,Steam Subscriber Agreement,https://edit.tosdr.org/services/180/annotate#d...,D,""">Steam Subscriber Agreement</a></h3>\n ...",NOT APPROVED
3,21,21,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,YouTube,"""Service""",https://edit.tosdr.org/points/27364,Terms of Service,https://edit.tosdr.org/services/274/annotate#d...,E,""">Terms of Service</a></h3>\n </div>\n ...",NOT APPROVED
4,22,22,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,YouTube,"""And""",https://edit.tosdr.org/points/27365,Privacy Policy,https://edit.tosdr.org/services/274/annotate#d...,E,""">Privacy Policy</a></h3>\n </div>\n ...",NOT APPROVED


### Apply data cleaning

In [23]:
new_df["full_text_clean"]= new_df["document_text_shortened"].apply(preprocess_text)

In [24]:
new_df["full_text_clean"][0]

'Terms of Service                  CareersTestimonialsRelease NotesNewsPressLearnBackBlogBitcoinCrypto 101Crypto ProEthereumProductsBackShapeShift PlatformShapeShift ClassicHow It WorksFOX TokensFree TradingCoinCapKeepKeyResourcesBackDeveloper PortalBrand AssetsHelp CenterMiner FeesBuy CryptoRainfallLog inTerms of Service Last Modified September 2020 Welcome to ShapeShiftTHESE TERMS CONSTITUTE A LEGALLY BINDING AGREEMENT BETWEEN YOU AND US PLEASE READ THESE TERMS CAREFULLY TO ENSURE THAT YOU UNDERSTAND AND AGREE TO EVERY PORTION OF THESE TERMS BEFORE USING ANY PART OF THE SERVICE These terms of service these Terms or the Terms govern your relationship with ShapeShift Global Limited with its subsidiaries parents or affiliates collectively ShapeShift we our or us and use or access of 1 ShapeShiftcom ShapeShiftio and any other website maintained or published by ShapeShift each a Website 2 our software platform which enables you to hold transfer buy sell or enable the sale of certain digit

clean the titles

In [25]:
new_df["support_text_clean"]= new_df["title"].apply(preprocess_text)

Final_modeling dataset

In [37]:
new_df = new_df.rename(columns={'case': 'privacy_issue',
                                'topic': 'parent_privacy_issue',
                                'status': 'review_status'
                                })

In [38]:
#Save our modeling data
save_path='gs://legal-terms-data/tosdr-data/modeling/df_mod_v1.csv'
new_df.to_csv(save_path)

In [39]:
new_df.head()

Unnamed: 0,index,unnamed:_0,parent_privacy_issue,topic_link,privacy_issue,case_link,classification,weight,service,title,title_link,source_name,source_link,rating,document_text_shortened,review_status,full_text_clean,support_text_clean
0,6,6,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,ShapeShift,"""ShapeShift will be free to use any ideas, con...",https://edit.tosdr.org/points/6611,Terms of Service,https://edit.tosdr.org/services/1548/annotate#...,E,""">Terms of Service</a></h3>\n </div>\n ...",NOT APPROVED,Terms of Service CareersTesti...,ShapeShift will be free to use any ideas conce...
1,10,10,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,FaceApp,"""You grant FaceApp consent to use the User Con...",https://edit.tosdr.org/points/7381,Terms of Use,https://edit.tosdr.org/services/1713/annotate#...,D,""">Terms of Use</a></h3>\n </div>\n ...",NOT APPROVED,Terms of Use You can see our ...,You grant FaceApp consent to use the User Cont...
2,20,20,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,Steam,"""<p>If you provide Valve with any feedback or ...",https://edit.tosdr.org/points/13352,Steam Subscriber Agreement,https://edit.tosdr.org/services/180/annotate#d...,D,""">Steam Subscriber Agreement</a></h3>\n ...",NOT APPROVED,Steam Subscriber Agreement T...,If you provide Valve with any feedback or sugg...
3,21,21,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,YouTube,"""Service""",https://edit.tosdr.org/points/27364,Terms of Service,https://edit.tosdr.org/services/274/annotate#d...,E,""">Terms of Service</a></h3>\n </div>\n ...",NOT APPROVED,Terms of Service Whats in the...,Service
4,22,22,Ownership,https://edit.tosdr.org/topics/27,This service takes credit for your content,https://edit.tosdr.org/cases/179,bad,50.0,YouTube,"""And""",https://edit.tosdr.org/points/27365,Privacy Policy,https://edit.tosdr.org/services/274/annotate#d...,E,""">Privacy Policy</a></h3>\n </div>\n ...",NOT APPROVED,Privacy Policy When you use o...,And
