In [29]:
import spacy
import os
import pandas as pd
from spacy.matcher import Matcher
from google.colab import files

In [30]:
# Vague-term vocabularies, one list per vagueness category. The order of the
# entries matters: it fixes the keyword positions used for the count vectors
# and the column order of the per-policy tables built later on.

# Category "C": conditional wording.
Condition = [
    'depending',
    'necessary',
    'appropriate',
    'inappropriate',
    'as needed',
    'as applicable',
    'otherwise reasonably',
    'sometimes',
    'from time to time',
]

# Category "G": generalizing wording.
Generalization = [
    'generally',
    'mostly',
    'widely',
    'general',
    'commonly',
    'usually',
    'normally',
    'typically',
    'largely',
    'often',
    'primarily',
    'among other things',
]

# Category "M": modal wording.
Modality = [
    'may',
    'might',
    'can',
    'could',
    'would',
    'likely',
    'possible',
    'possibly',
]

# Category "N": imprecise quantities.
Numeric_quantifier = [
    'anyone',
    'certain',
    'everyone',
    'numerous',
    'some',
    'most',
    'few',
    'much',
    'many',
    'various',
    'including but not limited to',
]

In [31]:
# Read every ".txt" privacy policy found in the input directory.
# `file_names[k]` and `txt_files[k]` always describe the same file.
Path = "/content/input/"
filelist = os.listdir(Path)
txt_files = []
file_names = []
for entry in filelist:
    if not entry.endswith(".txt"):
        continue
    # The policies contain non-ASCII characters, so decode explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open(os.path.join(Path, entry), 'r', encoding="utf-8") as f:
        content = f.read()
    # Append only after a successful read so the two lists stay aligned.
    file_names.append(entry)
    txt_files.append(content)

## Bradley-Terry Coefficients

In [32]:
# Bradley-Terry coefficient for every non-empty combination of the category
# letters C, G, M and N. Keys are the letters in that fixed order, which is
# the form in which generateVec builds its category code.
bt_coef = dict(
    CN=1.619,
    C=1.783,
    CM=1.864,
    CMN=2.125,
    CG=2.345,
    CGN=2.443,
    MN=2.569,
    N=2.710,
    M=2.865,
    CGMN=2.899,
    CGM=2.968,
    GN=3.281,
    GMN=3.506,
    G=3.550,
    GM=4.045,
)

## Policy String

In [33]:
# One row per policy file: its file name and its full text.
Policies = pd.DataFrame({"File_name": file_names, "text_content": txt_files})

In [34]:
# Show the assembled frame (columns "File_name" and "text_content").
Policies

Unnamed: 0,File_name,text_content
0,GamersGLToolProwithGameTurbo&PingBooster.txt,Privacy PolicyYour privacy is important to us....
1,Tasker.txt,Tasker for AndroidTaskerHomeDownloadTaskerNetU...
2,PandaGamepadPro(BETA).txt,Privacy Policy of Panda Gaming StudioSearch th...
3,GaanaMusicHindiSongFreeTamilTeluguMP3App.txt,Privacy PolicyThis Privacy Policy explains ou...
4,WynkMusic-NewMP3HindiTamilSong&PodcastApp.txt,Privacy PolicyData CollectionAt the time you (...
...,...,...
71,SAXVideoPlayer-AllinoneHdFormatpro2021.txt,HomeSearch this sitePrivacy PolicyProtecting y...
72,"KineMaster-VideoEditor,VideoMaker.txt",Privacy Policy (Android) – HELP CENTER ...
73,RAM&GameBoosterbyAugustro(67%OFF).txt,Augustro Privacy PolicyPrivacy PolicyAugustro ...
74,RhythmwithTabla&TanpuraPREMIUM.txt,PSS Labs | Rhythm with Tabla & Tanpura - Priva...


## Index Tracking


In [35]:
def len_str(x):
  """Return the length of ``x``; used as the sort key for the keyword list."""
  size = len(x)
  return size

In [36]:
keepWords = Condition + Generalization + Modality + Numeric_quantifier

# Record each keyword's position BEFORE sorting: this is the slot the keyword
# occupies in every count vector, and it matches the column order that make_df
# rebuilds from the four category lists.
index_tracker = {keyword: position for position, keyword in enumerate(keepWords)}

# Longest phrases first; keepWords is iterated in this order when the rules are
# registered with the matcher. list.sort() works in place and returns None, so
# its result is deliberately not bound to any name.
keepWords.sort(key=len, reverse=True)

## Matcher

In [37]:
# Load the English pipeline by package name; the 'en' shortcut link only
# exists on spaCy 2.x, so it is kept purely as a fallback.
try:
  nlp = spacy.load("en_core_web_sm")
except OSError:
  nlp = spacy.load("en")

matcher = Matcher(nlp.vocab)
for phrase in keepWords:
  # One token pattern per whitespace-separated word of the keyword; "LOWER"
  # compares against the lowercased token text.
  rule = [{"LOWER": word} for word in phrase.split()]
  # Patterns are passed as a list (accepted by spaCy >= 2.2.2 and required by
  # spaCy 3); the keyword itself is the match key that generateVec reads back.
  matcher.add(phrase, [rule])

## Functions

In [38]:
def generateVec(sentence):
  """Count vague-keyword hits in one sentence and derive its category code.

  Args:
    sentence: Sentence text; it is parsed with the module-level ``nlp``.

  Returns:
    ``None`` if the matcher reports no keyword in the sentence. Otherwise a
    list with one hit count per keyword (positions given by
    ``index_tracker``), followed by the category code (the letters among
    C, G, M, N whose keywords occurred, in that order) and the ``bt_coef``
    value stored for that code.
  """
  text = nlp(sentence)

  # Order hits by start token, longest first at equal start, so a phrase is
  # always considered before any shorter keyword lying inside it, whatever
  # order the matcher reports its hits in.
  matches = sorted(matcher(text), key=lambda m: (m[1], m[1] - m[2]))
  if not matches:
    return None

  final = [0] * len(keepWords)
  category_vaguesness = {"C": 0, "G": 0, "M": 0, "N": 0}

  # Index of the first token not yet claimed by a counted match. It only ever
  # moves forward, so a skipped hit can never re-open an already counted span.
  covered_until = 0
  for word, match_start, match_end in matches:
    if match_start < covered_until:
      continue  # overlaps a hit that has already been counted
    covered_until = match_end

    keyword = nlp.vocab.strings[word]  # match key -> keyword string
    final[index_tracker[keyword]] += 1
    if keyword in Condition:
      category_vaguesness["C"] = 1
    elif keyword in Modality:
      category_vaguesness["M"] = 1
    elif keyword in Numeric_quantifier:
      category_vaguesness["N"] = 1
    else:
      # Every remaining keyword comes from the Generalization list.
      category_vaguesness["G"] = 1

  temp = "".join([letter for letter in category_vaguesness if category_vaguesness[letter]])
  final.append(temp)
  final.append(bt_coef[temp])

  return final

In [39]:
def generateMatrix(text_string):
  """Return the generateVec rows of all sentences in ``text_string``.

  The text is split into sentences with the module-level ``nlp``; sentences
  for which generateVec returns ``None`` are left out.
  """
  doc = nlp(text_string)
  per_sentence = (generateVec(sent.text) for sent in doc.sents)
  return [row for row in per_sentence if row is not None]

In [40]:
def make_df(intext):
  """Build the per-sentence keyword table for one policy text.

  Returns a DataFrame with one column per keyword (in category-list order)
  plus "Category" and "BT Coeff", or the string "None" when no sentence of
  ``intext`` produced a row.
  """
  rows = generateMatrix(intext)
  if not rows:
    return "None"
  visualization = pd.DataFrame(rows)
  # Unsorted keyword order, i.e. the order index_tracker was built from.
  visualization.columns = [
      *Condition, *Generalization, *Modality, *Numeric_quantifier,
      "Category", "BT Coeff",
  ]
  return visualization

## Generating Outputs

In [41]:
# Write one CSV per policy and collect each policy's mean Bradley-Terry
# coefficient as its vagueness score.
OUTPUT_DIR = "/content/output/"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # to_csv does not create missing directories

coefs = []
for _row_label, policy in Policies.iterrows():
  file_name = policy["File_name"][:-4]  # drop the ".txt" suffix
  file_path = OUTPUT_DIR + file_name + ".csv"
  temp_df = make_df(policy["text_content"])
  # make_df returns the string "None" when the policy has no vague sentence.
  if isinstance(temp_df, pd.DataFrame):
    temp_df.to_csv(file_path)
    coefs.append(temp_df["BT Coeff"].mean())
  else:
    # NaN keeps the score column numeric, so describe() still reports
    # numeric statistics when some policy has no match.
    coefs.append(float("nan"))

Policies["vague_score"] = coefs

In [42]:
# Completion marker for the CSV export step.
print("CSVs GENERATED")

CSVs GENERATED


In [43]:
# Bundle the "output" directory, resolved relative to the current working
# directory, into result.zip.
# NOTE(review): the CSVs are written to /content/output/, so this presumably
# relies on the working directory being /content; confirm.
!zip -r result.zip output

  adding: output/ (stored 0%)
  adding: output/JioSaavnMusic&Radio–JioTunes,Podcasts,Songs.csv (deflated 91%)
  adding: output/FacebookLite.csv (deflated 90%)
  adding: output/InstagramLite.csv (deflated 90%)
  adding: output/GameBooster4xFasterPro-GFXTool&LagFix.csv (deflated 84%)
  adding: output/1DM+[formerlyIDM+]:Video,TorrentDownloader.csv (deflated 70%)
  adding: output/MXTakaTakShortVideoApp|MadeinIndiaforYou.csv (deflated 82%)
  adding: output/InternetSpeedMeter.csv (deflated 61%)
  adding: output/Facebook.csv (deflated 90%)
  adding: output/Dailyhunt-100%IndianAppforNews&Videos.csv (deflated 90%)
  adding: output/Moj-SnackonIndianShortVideos|MadeinIndia.csv (deflated 88%)
  adding: output/PGT+🔧:ProGFX&Optimizer(withadvancesetting).csv (deflated 72%)
  adding: output/PowerampFullVersionUnlocker.csv (deflated 88%)
  adding: output/DiskDiggerProfilerecovery.csv (deflated 54%)
  adding: output/SnackyTakatak.csv (deflated 82%)
  adding: output/AJIOOnlineShopping-HandpickedCuratedFa

# Manual Testing

In [44]:
# Smoke test: a sentence containing the keyword "may" must yield a DataFrame,
# which has a "may" column. When nothing matches, make_df returns the string
# "None", for which the membership test below is False.
in_text = "I may am so smart"
temp_df = make_df(in_text)
if "may" in temp_df:
  print("ahha")
# A hand-written average over temp_df["BT Coeff"] was sketched here earlier;
# the export loop computes that value with temp_df["BT Coeff"].mean().

ahha


In [45]:
# Save the whole Policies frame, including the full policy texts, to
# Policies.csv in the current working directory.
Policies.to_csv("Policies.csv")

In [46]:
# List every policy with its vagueness score, then summarise the scores.
for policy_file, score in zip(Policies["File_name"], Policies["vague_score"]):
  print(policy_file, ": ", score)

Policies["vague_score"].describe()

GamersGLToolProwithGameTurbo&PingBooster.txt :  2.566222222222222
Tasker.txt :  2.714666666666666
PandaGamepadPro(BETA).txt :  2.855882352941178
GaanaMusicHindiSongFreeTamilTeluguMP3App.txt :  2.673979166666666
WynkMusic-NewMP3HindiTamilSong&PodcastApp.txt :  2.7548148148148153
Truecaller:CallerID,SpamBlocking&Chat.txt :  2.7043153153153168
MXTakaTakShortVideoApp|MadeinIndiaforYou.txt :  2.7416666666666676
MXPlayerPro.txt :  2.6862037037037036
AmazonShopping,UPI,MoneyTransfer,BillPayment.txt :  2.7600204081632644
JioTV–News,Movies,Entertainment,LIVETV.txt :  2.792305555555555
AJIOOnlineShopping-HandpickedCuratedFashion.txt :  2.7960851063829772
InternetOptimizerPro|No-Ads.txt :  2.7611428571428576
Meesho-Resell,WorkFromHome,EarnMoneyOnline.txt :  2.6548888888888893
PGT+🔧:ProGFX&Optimizer(withadvancesetting).txt :  2.566222222222222
SnackyTakatak.txt :  2.7416666666666676
VideoEditor&VideoMaker-InShot.txt :  2.791
SimpleGalleryPro:Video&PhotoManager&Editor.txt :  2.3906666666666667
Game

count    76.000000
mean      2.713307
std       0.096532
min       2.368786
25%       2.681618
50%       2.713304
75%       2.772303
max       2.920571
Name: vague_score, dtype: float64