In [None]:
import spacy
import os
import pandas as pd
from spacy.matcher import Matcher
from google.colab import files

In [None]:
# Vague-term vocabularies, one list per vagueness category. Multi-word
# entries are whole phrases that must match as a unit.

# Category "C": conditional wording.
Condition = [
    "depending",
    "necessary",
    "appropriate",
    "inappropriate",
    "as needed",
    "as applicable",
    "otherwise reasonably",
    "sometimes",
    "from time to time",
]

# Category "G": generalizing wording.
Generalization = [
    "generally",
    "mostly",
    "widely",
    "general",
    "commonly",
    "usually",
    "normally",
    "typically",
    "largely",
    "often",
    "primarily",
    "among other things",
]

# Category "M": modal wording.
Modality = [
    "may",
    "might",
    "can",
    "could",
    "would",
    "likely",
    "possible",
    "possibly",
]

# Category "N": imprecise quantities.
Numeric_quantifier = [
    "anyone",
    "certain",
    "everyone",
    "numerous",
    "some",
    "most",
    "few",
    "much",
    "many",
    "various",
    "including but not limited to",
]

In [None]:
# Directory that holds the policy text files (one ".txt" file per policy).
Path = "/content/input/"

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of everything derived from these lists reproducible between runs.
filelist = sorted(os.listdir(Path))

txt_files = []   # full text of each policy, parallel to file_names
file_names = []  # file name of each policy, ".txt" suffix included
for entry in filelist:
    if entry.endswith(".txt"):
        file_names.append(entry)
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding.
        with open(os.path.join(Path, entry), 'r', encoding="utf-8") as f:
            txt_files.append(f.read())

## Bradley-Terry Coefficients

In [None]:
# Coefficient assigned to each combination of vagueness categories.
# A key is the concatenation of the category initials present in a sentence,
# always in the order C, G, M, N (Condition, Generalization, Modality,
# Numeric_quantifier). Entries are listed in ascending coefficient order and
# that insertion order is preserved.
# NOTE(review): the values are presumably fitted Bradley-Terry coefficients
# obtained outside this notebook -- confirm their provenance.
bt_coef = dict([
    ("CN", 1.619),
    ("C", 1.783),
    ("CM", 1.864),
    ("CMN", 2.125),
    ("CG", 2.345),
    ("CGN", 2.443),
    ("MN", 2.569),
    ("N", 2.710),
    ("M", 2.865),
    ("CGMN", 2.899),
    ("CGM", 2.968),
    ("GN", 3.281),
    ("GMN", 3.506),
    ("G", 3.550),
    ("GM", 4.045),
])

## Policy String

In [None]:
# One row per loaded policy: its file name next to its full text.
Policies = pd.DataFrame(
    list(zip(file_names, txt_files)),
    columns=["File_name", "text_content"],
)

In [None]:
# Show the assembled table (columns: File_name, text_content).
Policies

Unnamed: 0,File_name,text_content
0,Telegram.txt,Telegram Privacy Policy EnglishBahasa Indonesi...
1,ShareMe-India'sbestfiletransferapp.txt,"Mi Global HomeDue to COVID-19, the ongoing h..."
2,RhythmwithTabla&TanpuraPREMIUM.txt,PSS Labs | Rhythm with Tabla & Tanpura - Priva...
3,AJIOOnlineShopping-HandpickedCuratedFashion.txt,Reliance RetailPrivacy PolicyHomePrivacy Polic...
4,Vido:LyricalVideoStatusMaker.txt,Welcome to VidoYour privacy is important to Vi...
...,...,...
71,Moj-SnackonIndianShortVideos|MadeinIndia.txt,MOJPrivacy PolicyLast updated: 10th March 2021...
72,InternetOptimizerPro|No-Ads.txt,Privacy PolicyPrivacy Policy ...
73,FacebookLite.txt,FacebookJump toSections of this pageAccessibil...
74,Snapchat.txt,Privacy Policy - Snap Inc.HomeCareersNewsInves...


## Index Tracking


In [None]:
def len_str(x):
  """Return ``len(x)``; usable as a ``key`` function when sorting by length."""
  size = len(x)
  return size

In [None]:
# Master phrase list: the four category vocabularies concatenated in a fixed
# order (Condition, Generalization, Modality, Numeric_quantifier).
keepWords = Condition + Generalization + Modality + Numeric_quantifier

# Position of every phrase in the *unsorted* concatenation. Count vectors are
# indexed through this mapping, so it must be built before the sort below.
index_tracker = {phrase: position for position, phrase in enumerate(keepWords)}

# Order phrases longest-first (by character count) in place; the sort is
# stable, so equally long phrases keep their relative order.
# list.sort() returns None, so its result is deliberately not bound to `_`,
# which would shadow IPython's last-output variable.
keepWords.sort(key=len, reverse=True)

## Matcher

In [None]:
# Load the English pipeline by its package name. The bare 'en' shortcut only
# exists in spaCy v2 and raises on v3.
# NOTE(review): assumes 'en' was linked to en_core_web_sm in the original
# environment -- confirm if a different model was intended.
nlp = spacy.load('en_core_web_sm')

matcher = Matcher(nlp.vocab)
for phrase in keepWords:
  # One token spec per whitespace-separated word, compared on the lowercased
  # token text so matching is case-insensitive.
  rule = [{"LOWER": word} for word in phrase.split()]
  # Patterns go in a list as the second argument. This form is accepted from
  # spaCy 2.2.2 onwards and is the only one accepted by v3; the old
  # (key, on_match, *patterns) signature fails on v3.
  matcher.add(phrase, [rule])

## Functions

In [None]:
def generateVec(sentence):
  """Count vague phrases in one sentence and classify the sentence.

  Args:
    sentence: Text of a single sentence.

  Returns:
    ``None`` when the matcher finds nothing. Otherwise a list holding one
    occurrence count per phrase (positions taken from ``index_tracker``),
    followed by the category string (the initials among C, G, M, N that were
    seen, in that order) and the matching ``bt_coef`` value.
  """
  doc = nlp(sentence)

  counts = [0] * len(keepWords)
  seen = {"C": 0, "G": 0, "M": 0, "N": 0}

  # Index of the last token covered by the most recently visited match; a
  # match is only counted when it starts after that token.
  last_token = -1
  for match_id, start, end in matcher(doc):
    if start > last_token:
      phrase = nlp.vocab.strings[match_id]
      counts[index_tracker[phrase]] += 1
      if phrase in Condition:
        seen["C"] = 1
      elif phrase in Modality:
        seen["M"] = 1
      elif phrase in Numeric_quantifier:
        seen["N"] = 1
      else:
        # Anything not in the three lists above is treated as Generalization.
        seen["G"] = 1
    # Updated for every match, including ones that were skipped.
    last_token = end - 1

  # Still at the initial value: the matcher yielded no matches at all.
  if last_token == -1:
    return None

  label = "".join(initial for initial, flag in seen.items() if flag)
  counts.append(label)
  counts.append(bt_coef[label])

  return counts

In [None]:
def generateMatrix(text_string):
  """Build one vagueness vector per sentence that contains a vague phrase.

  Args:
    text_string: Full text to analyse; it is split into sentences by the
      ``nlp`` pipeline.

  Returns:
    List of the non-``None`` results of ``generateVec`` in sentence order.
    Sentences without any match are left out, so the list may be empty.
  """
  doc = nlp(text_string)
  vectors = (generateVec(sentence.text) for sentence in doc.sents)
  # Identity test: None is the explicit "no match" marker from generateVec.
  return [vector for vector in vectors if vector is not None]

In [None]:
def make_df(intext):
  """Tabulate the vague sentences of a text.

  Args:
    intext: Full text to analyse.

  Returns:
    A DataFrame with one row per sentence returned by ``generateMatrix`` and
    one column per phrase (in Condition, Generalization, Modality,
    Numeric_quantifier order) plus "Category" and "BT Coeff". When there are
    no such sentences the *string* ``"None"`` is returned instead; callers
    rely on that sentinel.
  """
  rows = generateMatrix(intext)
  if not rows:
    return "None"

  # Local header list; kept separate from the module-level keepWords.
  header = [
      *Condition,
      *Generalization,
      *Modality,
      *Numeric_quantifier,
      "Category",
      "BT Coeff",
  ]
  table = pd.DataFrame(rows)
  table.columns = header
  return table

## Generating Outputs

In [None]:
# Destination for the per-policy CSV files. DataFrame.to_csv does not create
# missing directories, so make sure it exists before the loop.
output_dir = "/content/output/"
os.makedirs(output_dir, exist_ok=True)

coefs = []                                   # mean coefficient per policy, or "NA"
vagueness_cats = { "cat":[], "score":[] }    # one entry per vague sentence, all policies
for row_label, policy in Policies.iterrows():
  # Access the row by column label; integer keys on a label-indexed Series
  # depend on a positional fallback that pandas has deprecated.
  file_name = policy["File_name"][:-4]       # drop the ".txt" suffix
  file_path = output_dir + file_name + ".csv"
  temp_df = make_df(policy["text_content"])
  # make_df returns the string "None" when no sentence matched.
  if isinstance(temp_df, pd.DataFrame):
    temp_df.to_csv(file_path)
    coefs.append(temp_df["BT Coeff"].mean())
    vagueness_cats["score"].extend(temp_df["BT Coeff"].tolist())
    vagueness_cats["cat"].extend(temp_df["Category"].tolist())
  else:
    # Placeholder for policies without any vague sentence.
    coefs.append("NA")
Policies["vague_score"] = coefs

In [None]:
# Progress marker only; it does not check that any file was actually written.
print("CSVs GENERATED")

CSVs GENERATED


In [None]:
# !zip -r result.zip output

## Manual Testing

In [None]:
# Smoke test: the only listed phrase in this sentence is "may" (Modality).
make_df("i may be hahahehe")

Unnamed: 0,depending,necessary,appropriate,inappropriate,as needed,as applicable,otherwise reasonably,sometimes,from time to time,generally,mostly,widely,general,commonly,usually,normally,typically,largely,often,primarily,among other things,may,might,can,could,would,likely,possible,possibly,anyone,certain,everyone,numerous,some,most,few,much,many,various,including but not limited to,Category,BT Coeff
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,M,2.865


In [None]:
# Save the policy table (all columns, index included) as Policies.csv.
# The path is relative, so the file lands in the current working directory.
Policies.to_csv("Policies.csv")

In [None]:
# One line per policy: file name followed by its vague_score.
for policy_name, policy_score in zip(Policies["File_name"], Policies["vague_score"]):
  print(policy_name, ": ", policy_score)

# Summary statistics of vague_score over all policies.
Policies["vague_score"].describe()

Telegram.txt :  2.751292307692308
ShareMe-India'sbestfiletransferapp.txt :  2.633935064935067
RhythmwithTabla&TanpuraPREMIUM.txt :  2.6896428571428572
AJIOOnlineShopping-HandpickedCuratedFashion.txt :  2.7960851063829772
Vido:LyricalVideoStatusMaker.txt :  2.7198
PLAYit-ANewAll-in-OneVideoPlayer.txt :  2.6894615384615386
SDMaidPro-Unlocker.txt :  2.8136
NavigationPro:GoogleMapsNavionSamsungWatch.txt :  2.6896428571428572
CrayonIconPack.txt :  2.6896428571428572
Josh-SnackonShortVideoswithTopIndianApp.txt :  2.742947368421053
PGT+🔧:ProGFX&Optimizer(withadvancesetting).txt :  2.566222222222222
PowerampFullVersionUnlocker.txt :  2.7073191489361683
ShareKaro-Share&FileTransferApp,Shareit.txt :  2.3687857142857145
JioSaavnMusic&Radio–JioTunes,Podcasts,Songs.txt :  2.7749210526315817
WynkMusic-NewMP3HindiTamilSong&PodcastApp.txt :  2.7548148148148153
SAXVideoPlayer-AllinoneHdFormatpro2021.txt :  2.865
FreeVPNTomato|FastestFreeHotspotVPNProxy.txt :  2.865
Instagram.txt :  2.7879375000000004
A

count    76.000000
mean      2.713307
std       0.096532
min       2.368786
25%       2.681618
50%       2.713304
75%       2.772303
max       2.920571
Name: vague_score, dtype: float64

In [None]:
# Turn the per-sentence records into the relative frequency of each category
# combination: count the rows per "cat", then divide by the total row count.
# NOTE: this rebinds vagueness_cats from a dict of lists to a DataFrame
# indexed by "cat", so the cell is not idempotent -- rebuild the dict before
# running it a second time.
category_counts = pd.DataFrame(vagueness_cats).groupby(["cat"]).count()
category_counts.columns = ["Probablity"]
vagueness_cats = category_counts / category_counts["Probablity"].sum()

In [None]:
# Coefficient table indexed by category combination, in bt_coef's key order.
plot_data = pd.DataFrame([bt_coef]).T
plot_data.columns = ["bt_coef"]

# Look each combination up by label. Series.get returns the default for a
# combination that never occurred, so no exception handling is needed and a
# genuine lookup problem can no longer be silently turned into 0.
observed_share = vagueness_cats["Probablity"]
temp = [observed_share.get(combo, 0) for combo in bt_coef]
plot_data["Probablity"] = temp

In [None]:
# Show the coefficient of each category combination next to its "Probablity".
plot_data

Unnamed: 0,bt_coef,Probablity
CN,1.619,0.007133
C,1.783,0.057405
CM,1.864,0.052649
CMN,2.125,0.004076
CG,2.345,0.000679
CGN,2.443,0.00034
MN,2.569,0.09409
N,2.71,0.112772
M,2.865,0.634511
CGMN,2.899,0.0


In [None]:
# Save the coefficient/probability table (index included) as coef_prob_plot.csv.
# The path is relative, so the file lands in the current working directory.
plot_data.to_csv("coef_prob_plot.csv")