**Environment Setting** 

In [1]:
!pip install pyspark



# Dataset Preprocessing

## Initialising Spark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Comp5349 assignment1_luna") \
    .getOrCreate()

## Load dataset

In [3]:
# Governing law
input_path1 = 'data/Governing_Law.csv'
# Change of control, anti-assignment
input_path2 = 'data/Anti_assignment_CIC_g3.csv'

rdd_1 = spark.read.csv(input_path1,header=True).rdd
rdd_2 = spark.read.csv(input_path2,header=True).rdd

# check if the dataset is complete, the rdds shoud contain 510 and 376 rows respectively
print(rdd_1.count())
print(rdd_2.count())

510
376


Row to tuple, pair category with documents

In [4]:
category_list = ['Governing Law', 'Change of Control', 'Anti-assignment']

def govern_KV (row):
  row_dict = row.asDict()
  return (row_dict["Filename"], (category_list[0], row_dict["Governing Law"]))
  #return (category, (row_dict["Filename"], row_dict["Governing Law"]))

def control_KV (row):
  row_dict = row.asDict()
  return (row_dict["Filename"], (category_list[1], row_dict["Change of Control"]))
  #return (category, (row_dict["Filename"], row_dict["Change of Control"]))

def anti_KV (row):
  row_dict = row.asDict()
  return (row_dict["Filename"], (category_list[2], row_dict["Anti-assignment"]))
  #return (category, (row_dict["Filename"], row_dict["Anti-assignment"]))

Remove nan values in each category

In [5]:
# Governing RDD
g_rdd = rdd_1.map(govern_KV).filter(lambda rec: rec[1][1] != 'nan')
# Change of Control RDD
c_rdd = rdd_2.map(control_KV).filter(lambda rec: rec[1][1] is not None)
# Anti_assignment RDD
a_rdd = rdd_2.map(anti_KV).filter(lambda rec: rec[1][1] is not None)
print(g_rdd.count())
print(c_rdd.count())
print(a_rdd.count())

437
121
374


Combine three rdd to one rdd for quick computation and lowercase all contents

In [6]:
# lowercase
def lowercase (rec):
  return (rec[0], (rec[1][0], rec[1][1].lower()))

# combine three rdd into one rdd
dataset_lower = g_rdd.union(c_rdd.union(a_rdd)).map(lowercase)
dataset_lower.take(1)

[('CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf',
  ('Governing Law',
   'this agreement is accepted by company in the state of nevada and shall be governed by and construed in accordance with the laws thereof, which laws shall prevail in the event of any conflict. (page 13)'))]

# RAKE Algorithm

## Candidate phrase identification

In [7]:
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Adding page and number into stopwords
roman_num_list = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
stopwords += roman_num_list
stopwords.append('page')
stopwords[-10:]

['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'page']

In [9]:
def isNum(s):
  try:
    float(s) if '.' in s else int(s)
    return True
  except ValueError:
    return False

def sentenceSplit (rec):
  sentenceList = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\:|\;|\!|\)|\,)\s',rec)
  return sentenceList

def wordSplit (sent):
  word_splitter = sent.split(' ')
  word_list =[]
  for word in word_splitter:
    if (len(word) >=3) and (word != '') and (not isNum(word)):
      word_list.append(word)
  return word_list

# punctuation removal
def punctuationRemoval (doc):
  clean_docs = []
  for i in doc:
    # remove punctuation
    # make sure numbers can be seperate from words
    i = i.replace(')', ' ')
    clean_doc = re.sub(r'[^\w\s]','',i)
    clean_docs.append(clean_doc)
  return clean_docs

def generateCandidateKeywords(sent_list):
  # generage regularised expression pattern of stopwords
  stopword_reglist = []
  for wrd in stopwords:
    wrdregex = '\\b' + wrd + '\\b'
    stopword_reglist.append(wrdregex)
  stopword_pattern = re.compile('|'.join(stopword_reglist))
  
  # stopwords removal
  phrase_list = []
  for sent in sent_list:
    tmp = re.sub(stopword_pattern, '|', sent.strip())
    phrases = tmp.split("|")
    for phrase in phrases:
      phrase = phrase.strip()
      # phrase cannot be null and maximum length of phrase is 4
      if (phrase != "") & (len(phrase) > 2) & (len(phrase.split()) <= 4):
        phrase_list.append(phrase)
  return phrase_list 

In [10]:
def phraseIdentification (rec):
  sent_list = sentenceSplit(rec[1])
  clean_sent = punctuationRemoval(sent_list)
  phrase_list = generateCandidateKeywords(clean_sent)
  return (rec[0], phrase_list)

## RAKE score computation

In [11]:
def calculateWordFrequency (phrase_list):
  word_freq = {}
  for phrase in phrase_list:
    # separate phrases into words
    # word_list = phrase.split(' ')
    word_list = wordSplit(phrase)
    for word in word_list:
      # initialise word frequency with 0
      word_freq.setdefault(word, 0)
      word_freq[word] += 1
  return word_freq

def calculateWordDegree (phrase_list):
  word_deg = {}
  for phrase in phrase_list:
    # separate phrases into words
    # word_list = phrase.split(' ')
    word_list = wordSplit(phrase)
    wordlist_deg = len(word_list) - 1
    for word in word_list:
      word_deg.setdefault(word, 0)
      word_deg[word] += 1
      word_deg[word] += wordlist_deg
  return word_deg

In [12]:
def wordScore (rec):  
  word_score = {}
  word_freq = calculateWordFrequency(rec)
  word_deg = calculateWordDegree(rec)
  for word in word_freq:
    word_score.setdefault(word, 0)
    # Word score = word degree/word frequency
    word_score[word] = word_deg[word] / (word_freq[word] * 1.0) 
  return (rec, word_score)

In [13]:
def candidateScore(phrase_list, word_score):
  phrase_score = {}
  candidate_score = {}
  # word_score = wordScore(phrase_list)
  for phrase in phrase_list:
      # word_list = phrase.split(' ')
      word_list = wordSplit(phrase)
      phrase = ' '.join(word_list)
      phrase_score.setdefault(phrase, 0)
      c_score = 0
      for word in word_list:
          c_score += word_score[word]
      phrase_score[phrase] = c_score
  # Combine word_score with phrase_score
  candidate_score.update(word_score)
  candidate_score.update(phrase_score)
  return candidate_score

# Workload 1
Take each clause as a single document and each ategory as a corpus. Rank keywords of each corpus based on ESS score.

### Phrase identification
Clause identification

In [14]:
def clauseSplit (rec):
  clause_list = rec[1][1].split("(page ")
  for clause in clause_list:
    if len(clause) < 10:
      clause_list.remove(clause)
  return [(rec[1][0], clause) for clause in clause_list]

In [15]:
# (category, clause)
w1_doc = dataset_lower.flatMap(clauseSplit)
w1_doc.take(5)

[('Governing Law',
  'this agreement is accepted by company in the state of nevada and shall be governed by and construed in accordance with the laws thereof, which laws shall prevail in the event of any conflict. '),
 ('Governing Law',
  'this agreement shall be governed by laws of the province of ontario and the federal laws of canada applicable therein. '),
 ('Governing Law',
  '9)  this agreement is subject to all laws, regulations, license conditions and decisions of the canadian radio-television and telecommunications commission (‚äúcrtc‚äù) municipal, provincial and federal governments or other authorities which are applicable to rogers and/or licensor, and which are now in force or hereafter adopted (‚äúapplicable law‚äù). '),
 ('Governing Law',
  'all questions with respect to the construction of this agreement, and the rights and liabilities of the parties hereto, shall be governed by the laws of the state of florida. '),
 ('Governing Law',
  'this agreement shall be governed

Phrase identification for each clause

In [16]:
# (category, phrase_list) of each clause
w1_clause_phrase_list = w1_doc.map(phraseIdentification)
w1_clause_phrase_list.take(5)

[('Governing Law',
  ['agreement',
   'accepted',
   'company',
   'state',
   'nevada',
   'shall',
   'governed',
   'construed',
   'accordance',
   'laws thereof',
   'laws shall prevail',
   'event',
   'conflict']),
 ('Governing Law',
  ['agreement shall',
   'governed',
   'laws',
   'province',
   'ontario',
   'federal laws',
   'canada applicable therein']),
 ('Governing Law',
  ['agreement',
   'subject',
   'laws',
   'regulations',
   'license conditions',
   'decisions',
   'canadian radiotelevision',
   'telecommunications commission äúcrtcäù',
   'municipal',
   'provincial',
   'federal governments',
   'authorities',
   'applicable',
   'rogers andor licensor',
   'force',
   'hereafter adopted äúapplicable lawäù']),
 ('Governing Law',
  ['questions',
   'respect',
   'construction',
   'agreement',
   'rights',
   'liabilities',
   'parties hereto',
   'shall',
   'governed',
   'laws',
   'state',
   'florida']),
 ('Governing Law',
  ['agreement shall',
   'governed

### Word score caculation

In [17]:
# (category, (phrase_list ,dict(word, score))) of each clause
w1_clause_wordScore = w1_clause_phrase_list.mapValues(wordScore)
w1_clause_wordScore.values().values().take(5)

[{'accepted': 1.0,
  'accordance': 1.0,
  'agreement': 1.0,
  'company': 1.0,
  'conflict': 1.0,
  'construed': 1.0,
  'event': 1.0,
  'governed': 1.0,
  'laws': 2.5,
  'nevada': 1.0,
  'prevail': 3.0,
  'shall': 2.0,
  'state': 1.0,
  'thereof': 2.0},
 {'agreement': 2.0,
  'applicable': 3.0,
  'canada': 3.0,
  'federal': 2.0,
  'governed': 1.0,
  'laws': 1.5,
  'ontario': 1.0,
  'province': 1.0,
  'shall': 2.0,
  'therein': 3.0},
 {'adopted': 4.0,
  'agreement': 1.0,
  'andor': 3.0,
  'applicable': 1.0,
  'authorities': 1.0,
  'canadian': 2.0,
  'commission': 3.0,
  'conditions': 2.0,
  'decisions': 1.0,
  'federal': 2.0,
  'force': 1.0,
  'governments': 2.0,
  'hereafter': 4.0,
  'laws': 1.0,
  'lawäù': 4.0,
  'license': 2.0,
  'licensor': 3.0,
  'municipal': 1.0,
  'provincial': 1.0,
  'radiotelevision': 2.0,
  'regulations': 1.0,
  'rogers': 3.0,
  'subject': 1.0,
  'telecommunications': 3.0,
  'äúapplicable': 4.0,
  'äúcrtcäù': 3.0},
 {'agreement': 1.0,
  'construction': 1.0,
  'f

### Candidate score caculation

In [18]:
def candidateScore1(rec):
  phrase_list = rec[0]
  word_score = rec[1]
  candidate_score = candidateScore(phrase_list, word_score)
  return candidate_score

In [19]:
# (category, dict(clause_candidate, score)) of each clause
w1_clause_candidateScore = w1_clause_wordScore.mapValues(candidateScore1)
w1_clause_candidateScore.take(3)

[('Governing Law',
  {'accepted': 1.0,
   'accordance': 1.0,
   'agreement': 1.0,
   'company': 1.0,
   'conflict': 1.0,
   'construed': 1.0,
   'event': 1.0,
   'governed': 1.0,
   'laws': 2.5,
   'laws shall prevail': 7.5,
   'laws thereof': 4.5,
   'nevada': 1.0,
   'prevail': 3.0,
   'shall': 2.0,
   'state': 1.0,
   'thereof': 2.0}),
 ('Governing Law',
  {'agreement': 2.0,
   'agreement shall': 4.0,
   'applicable': 3.0,
   'canada': 3.0,
   'canada applicable therein': 9.0,
   'federal': 2.0,
   'federal laws': 3.5,
   'governed': 1.0,
   'laws': 1.5,
   'ontario': 1.0,
   'province': 1.0,
   'shall': 2.0,
   'therein': 3.0}),
 ('Governing Law',
  {'adopted': 4.0,
   'agreement': 1.0,
   'andor': 3.0,
   'applicable': 1.0,
   'authorities': 1.0,
   'canadian': 2.0,
   'canadian radiotelevision': 4.0,
   'commission': 3.0,
   'conditions': 2.0,
   'decisions': 1.0,
   'federal': 2.0,
   'federal governments': 4.0,
   'force': 1.0,
   'governments': 2.0,
   'hereafter': 4.0,
   'he

Get candidate list of each clause and store them in a list. Each item in the list rpresents a candidate list ectrcted from one clause of corresponding category.

In [20]:
def clauseCandidate (rec):
  c_list = list(rec.keys())
  return c_list

In [21]:
# candidate list of each clause
clause_clist_gov = w1_clause_candidateScore.mapValues(clauseCandidate)\
                  .filter(lambda rec: rec[0] == category_list[0])\
                  .values().collect()
clause_clist_change = w1_clause_candidateScore.mapValues(clauseCandidate)\
                  .filter(lambda rec: rec[0] == category_list[1])\
                  .values().collect()
clause_clist_anti = w1_clause_candidateScore.mapValues(clauseCandidate)\
                  .filter(lambda rec: rec[0] == category_list[2])\
                  .values().collect()
len(clause_clist_gov)

455

### Keywords identification
Choose top 4 candidates as keywords from each clause.

In [22]:
# extract keywords from each clause
def clauseKeywordsExtraction (rec):
  klist = []
  sorted_dic = dict(sorted(rec.items(), key = lambda kv:(kv[1], kv[0])))
  klist = list(sorted_dic.keys())
  if len(klist) > 4:
    return klist[:4]
  else:
    return klist

In [23]:
# (category, list(keywords))
clause_keywords = w1_clause_candidateScore.mapValues(clauseKeywordsExtraction)
clause_keywords.take(1)

[('Governing Law', ['accepted', 'accordance', 'agreement', 'company'])]


Get keywords from each clause and store them in a list. Each item in the list rpresents a keyword list ectrcted from one clause of corresponding category.


In [24]:
# keywords list of each clause
clause_klist_gov = clause_keywords.filter(lambda rec: rec[0] == category_list[0])\
                  .values().collect()
clause_klist_change = clause_keywords.filter(lambda rec: rec[0] == category_list[1])\
                  .values().collect()
clause_klist_anti = clause_keywords.filter(lambda rec: rec[0] == category_list[2])\
                  .values().collect()
len(clause_klist_gov)

455

Pair each keyword with corresponding corpus.

In [25]:
def corpusKeywords(rec):
  new_set = set()
  for l in rec[1]:
    new_set.update(set(l))
  new_list = list(filter(None, list(new_set))) 
  return [(rec[0], kw) for kw in new_list]

In [26]:
# (category, keyword)
w1_corpusKeywords = clause_keywords.groupByKey().flatMap(corpusKeywords)
w1_corpusKeywords.take(2)

[('Governing Law', 'arising'), ('Governing Law', 'acts')]

### ESS caculation
RDF caculation

In [27]:
# referenced document frequency of a keyword
# in candidate list
def caculateRDF (rec):
  word_rdf = 0
  # to check which candidate list should be used
  if rec[0] == category_list[0]:
    clause_clist = clause_clist_gov
  elif rec[0] == category_list[1]:
    clause_clist = clause_clist_change
  else:
    clause_clist = clause_clist_anti

  for c in clause_clist:
    if rec[1] in c:
      word_rdf += 1
  return (rec[0], (rec[1], word_rdf))

In [28]:
# RDF
# (category, (keyword, rdf))
corpus_rdf = w1_corpusKeywords.map(caculateRDF)
corpus_rdf.take(2)

[('Governing Law', ('arising', 51)), ('Governing Law', ('acts', 2))]

EDF caculation

In [29]:
# extracted document frequency of a keyword
# in keyword list
def caculateEDF (rec):
  word_edf = 0
  # to check which candidate list should be used
  if rec[0] == category_list[0]:
    clause_klist = clause_klist_gov
  elif rec[0] == category_list[1]:
    clause_klist = clause_klist_change
  else:
    clause_klist = clause_klist_anti
  #    
  for k in clause_klist:
    if rec[1][0] in k:
      word_edf += 1
  return (rec[0], (rec[1][0], [rec[1][1], word_edf]))

In [30]:
# EDF
# (category, (keyword, [rdf, corpus_edf]))
corpus_edf = corpus_rdf.map(caculateEDF)
corpus_edf.take(2)

[('Governing Law', ('arising', [51, 7])), ('Governing Law', ('acts', [2, 2]))]

In [31]:
# essentiality of a keyword
def caculateESS (rec):
  word_rdf = rec[1][0]
  word_edf = rec[1][1]
  word_ess = (word_edf/word_rdf) * word_edf
  return (word_ess, (rec[0], [('RDF', word_rdf), ('EDF', word_edf)]))

In [32]:
# ESS
# (category, (ess, (keyword, [rdf, corpus_edf])))
corpus_ess = corpus_edf.mapValues(caculateESS)
corpus_ess.take(2)

[('Governing Law',
  (0.9607843137254903, ('arising', [('RDF', 51), ('EDF', 7)]))),
 ('Governing Law', (2.0, ('acts', [('RDF', 2), ('EDF', 2)])))]

### Top 20 keywords

In [33]:
# Governing
w1_ess_gov = corpus_ess.filter(lambda rec: rec[0] == category_list[0])\
                  .values().sortByKey(False)
w1_ess_gov.take(20)

[(285.0034843205575, ('accordance', [('RDF', 287), ('EDF', 286)])),
 (167.48188405797103, ('construed', [('RDF', 276), ('EDF', 215)])),
 (132.25797872340425, ('governed', [('RDF', 376), ('EDF', 223)])),
 (95.40833333333335, ('conflict', [('RDF', 120), ('EDF', 107)])),
 (86.73451327433628, ('conflicts', [('RDF', 113), ('EDF', 99)])),
 (61.134433962264154, ('agreement', [('RDF', 424), ('EDF', 161)])),
 (34.56818181818182, ('choice', [('RDF', 44), ('EDF', 39)])),
 (24.5, ('application', [('RDF', 32), ('EDF', 28)])),
 (23.757575757575758, ('construction', [('RDF', 33), ('EDF', 28)])),
 (19.53066037735849, ('laws', [('RDF', 424), ('EDF', 91)])),
 (19.173913043478258, ('commonwealth', [('RDF', 23), ('EDF', 21)])),
 (18.77586206896552, ('california', [('RDF', 58), ('EDF', 33)])),
 (14.0, ('america', [('RDF', 14), ('EDF', 14)])),
 (7.895522388059701, ('interpreted', [('RDF', 67), ('EDF', 23)])),
 (7.142857142857143, ('claims', [('RDF', 14), ('EDF', 10)])),
 (6.722222222222223, ('florida', [('R

In [34]:
# Change of control
w1_ess_change = corpus_ess.filter(lambda rec: rec[0] == category_list[1])\
                  .values().sortByKey(False)
w1_ess_change.take(20)

[(64.0, ('change', [('RDF', 121), ('EDF', 88)])),
 (37.61940298507462, ('agreement', [('RDF', 134), ('EDF', 71)])),
 (20.081818181818182, ('control', [('RDF', 110), ('EDF', 47)])),
 (16.53125, ('assignment', [('RDF', 32), ('EDF', 23)])),
 (12.6, ('assets', [('RDF', 35), ('EDF', 21)])),
 (10.31578947368421, ('acquisition', [('RDF', 19), ('EDF', 14)])),
 (10.0, ('accordance', [('RDF', 10), ('EDF', 10)])),
 (8.066666666666666, ('event', [('RDF', 60), ('EDF', 22)])),
 (6.76, ('assign', [('RDF', 25), ('EDF', 13)])),
 (6.050000000000001, ('consolidation', [('RDF', 20), ('EDF', 11)])),
 (6.0, ('addition', [('RDF', 6), ('EDF', 6)])),
 (5.761904761904762, ('affiliate', [('RDF', 21), ('EDF', 11)])),
 (5.142857142857142, ('acquired', [('RDF', 7), ('EDF', 6)])),
 (5.142857142857142, ('controlled', [('RDF', 7), ('EDF', 6)])),
 (4.571428571428571, ('case', [('RDF', 14), ('EDF', 8)])),
 (4.263157894736842, ('connection', [('RDF', 19), ('EDF', 9)])),
 (4.083333333333334, ('assigned', [('RDF', 12), ('E

In [35]:
# Anti-assignment
w1_ess_anti = corpus_ess.filter(lambda rec: rec[0] == category_list[2])\
                  .values().sortByKey(False)
w1_ess_anti.take(20)

[(150.55679287305122, ('agreement', [('RDF', 449), ('EDF', 260)])),
 (98.84745762711864, ('assigned', [('RDF', 118), ('EDF', 108)])),
 (82.814696485623, ('assign', [('RDF', 313), ('EDF', 161)])),
 (49.42285714285715, ('assignment', [('RDF', 175), ('EDF', 93)])),
 (46.666666666666664, ('assets', [('RDF', 105), ('EDF', 70)])),
 (33.75, ('affiliate', [('RDF', 60), ('EDF', 45)])),
 (27.272727272727273, ('assignable', [('RDF', 33), ('EDF', 30)])),
 (24.532258064516128, ('delegate', [('RDF', 62), ('EDF', 39)])),
 (23.09160305343511, ('except', [('RDF', 131), ('EDF', 55)])),
 (21.35294117647059, ('null', [('RDF', 51), ('EDF', 33)])),
 (20.930232558139537, ('delayed', [('RDF', 43), ('EDF', 30)])),
 (18.892857142857142, ('benefit', [('RDF', 28), ('EDF', 23)])),
 (17.36111111111111, ('affiliates', [('RDF', 36), ('EDF', 25)])),
 (16.65765765765766, ('void', [('RDF', 111), ('EDF', 43)])),
 (16.0, ('change', [('RDF', 25), ('EDF', 20)])),
 (15.210526315789474, ('acquisition', [('RDF', 19), ('EDF', 1

# Workload 2
Take all files in each category as a long single document and rank candidates based on RAKE score.

### Phrase identification

In [36]:
# (filename, (category, phrase_list)) of each file
w2_phrase_list = dataset_lower.mapValues(phraseIdentification)
w2_phrase_list.take(2)

[('CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf',
  ('Governing Law',
   ['agreement',
    'accepted',
    'company',
    'state',
    'nevada',
    'shall',
    'governed',
    'construed',
    'accordance',
    'laws thereof',
    'laws shall prevail',
    'event',
    'conflict'])),
 ('EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B(01)_525118_EX-10.B(01)_Content License Agreement.pdf',
  ('Governing Law',
   ['agreement shall',
    'governed',
    'laws',
    'province',
    'ontario',
    'federal laws',
    'canada applicable therein',
    'agreement',
    'subject',
    'laws',
    'regulations',
    'license conditions',
    'decisions',
    'canadian radiotelevision',
    'telecommunications commission äúcrtcäù',
    'municipal',
    'provincial',
    'federal governments',
    'authorities',
    'applicable',
    'rogers andor licensor',
    'force',
    'hereafter adopted äúapplicable lawäù']))]

Combine phraselists from different files based on their categories

In [37]:
# (category, phrase_list) of each category
w2_combined_phraseList = w2_phrase_list.values().reduceByKey(lambda w1,w2: w1+w2)
w2_combined_phraseList.count()

3

### Word score caculation

In [38]:
# (category, (phrase_list ,dict(word, score))) of each category
w2_wordScore = w2_combined_phraseList.mapValues(wordScore)
w2_wordScore.values().values().take(1)

[{'abide': 1.0,
  'accept': 2.0,
  'acceptance': 1.0,
  'accepted': 1.0,
  'accordance': 1.0033898305084745,
  'according': 1.9285714285714286,
  'accordingly': 1.0,
  'acknowledgments': 2.0,
  'act': 2.8181818181818183,
  'action': 2.272727272727273,
  'actions': 2.6666666666666665,
  'acts': 1.0,
  'actual': 2.0,
  'addendum': 1.0,
  'adopted': 4.0,
  'affecting': 2.0,
  'affiliate': 2.0,
  'affiliates': 1.0,
  'africa': 2.0,
  'agent': 2.0,
  'agree': 2.2666666666666666,
  'agreed': 2.5,
  'agreement': 1.631027253668763,
  'agreements': 2.0,
  'agrees': 2.0,
  'airsopure': 1.0,
  'alabama': 1.0,
  'alexandria': 1.0,
  'also': 3.0,
  'amended': 1.5,
  'amendment': 1.5555555555555556,
  'amendments': 2.25,
  'america': 1.0,
  'american': 3.0,
  'among': 3.0,
  'andor': 2.75,
  'angeles': 3.0,
  'another': 2.1875,
  'antonio': 2.0,
  'appellate': 2.0,
  'applicability': 1.0,
  'applicable': 2.1923076923076925,
  'application': 1.1764705882352942,
  'applied': 1.125,
  'applies': 3.0,
 

### Candidate score caculation

candidateScore2 to get the corresponding rdd KV pairs.

In [39]:
def candidateScore2(rec):  
  phrase_list = rec[1][0]
  word_score = rec[1][1]
  candidate_score = candidateScore(phrase_list, word_score)
  return [(rec[0], candidate) for candidate in candidate_score.items()]

In [40]:
# (category, (candidate, rake_score))
w2_rake = w2_wordScore.flatMap(candidateScore2)
w2_rake.take(10)

[('Governing Law', ('agreement', 1.631027253668763)),
 ('Governing Law', ('accepted', 1.0)),
 ('Governing Law', ('company', 1.5)),
 ('Governing Law', ('state', 1.1023809523809525)),
 ('Governing Law', ('nevada', 1.7)),
 ('Governing Law', ('shall', 2.0615384615384613)),
 ('Governing Law', ('governed', 1.0334190231362468)),
 ('Governing Law', ('construed', 1.053763440860215)),
 ('Governing Law', ('accordance', 1.0033898305084745)),
 ('Governing Law', ('laws', 1.3042071197411003))]

### Top 20 keywords

Top 20 keywords from each category based on RAKE score.

In [41]:
# Governing
# (candidate, rake_score)
w2_rake_gov = w2_rake.filter(lambda rec: rec[0] == category_list[0]).values().sortBy(lambda x:x[1], False)
w2_rake_gov.take(20)

[('hereafter adopted äúapplicable lawäù', 16.0),
 ('franchisoräôs thencurrent headquarters currently', 16.0),
 ('either party herein initiate', 13.75),
 ('met independently without reference', 13.410185185185185),
 ('intellectual property right applies', 12.238095238095237),
 ('german private international law', 12.049915397631134),
 ('parties hereto expressly attorns', 11.961502347417841),
 ('transactions contemplated hereby andor', 11.936538461538461),
 ('agreement shall become valid', 11.692565715207223),
 ('transactions contemplated herein shall', 11.623076923076923),
 ('agreement takes effect upon', 11.30856348555282),
 ('maryland without giving effect', 11.308476134050377),
 ('transactions contemplated hereby shall', 11.248076923076923),
 ('without omitted giving effect', 11.225142800717041),
 ('issues collateral thereto shall', 11.168681318681319),
 ('united states trademark act', 11.073937573937574),
 ('massachusetts without giving effect', 11.058476134050377),
 ('pennsylvania 

In [42]:
# Change of control
# (candidate, rake_score)
w2_rake_change = w2_rake.filter(lambda rec: rec[0] == category_list[1]).values().sortBy(lambda x:x[1], False)
w2_rake_change.take(20)

[('reasonable detail based upon', 13.444444444444445),
 ('enable sellerexxonmobil selling affiliate', 13.109677419354838),
 ('posttermination royalty term therefor', 13.083333333333334),
 ('smiths prior written approval', 12.85562119584676),
 ('without thereby becoming liable', 12.804545454545455),
 ('golf instruction related products', 12.566666666666666),
 ('ehave companion solution within', 12.441176470588236),
 ('spinco house marks whether', 12.385964912280702),
 ('agents prior written consent', 12.333763272349492),
 ('providing ebix written notice', 12.326698867421719),
 ('dova hereunder whether accruing', 12.271381578947368),
 ('sellerexxonmobil selling affiliates shall', 12.227113641155386),
 ('upon sending written notice', 12.146143311866163),
 ('vss outstanding voting securities', 12.106280193236715),
 ('first refusal shall cease', 12.087021475256769),
 ('janssens confidential information hereunder', 12.052083333333334),
 ('maintenance services performed prior', 12.03544575725

In [43]:
# Anti-assignment
# (candidate, rake_score)
w2_rake_anti = w2_rake.filter(lambda rec: rec[0] == category_list[1]).values().sortBy(lambda x:x[1], False)
w2_rake_anti.take(20)

[('reasonable detail based upon', 13.444444444444445),
 ('enable sellerexxonmobil selling affiliate', 13.109677419354838),
 ('posttermination royalty term therefor', 13.083333333333334),
 ('smiths prior written approval', 12.85562119584676),
 ('without thereby becoming liable', 12.804545454545455),
 ('golf instruction related products', 12.566666666666666),
 ('ehave companion solution within', 12.441176470588236),
 ('spinco house marks whether', 12.385964912280702),
 ('agents prior written consent', 12.333763272349492),
 ('providing ebix written notice', 12.326698867421719),
 ('dova hereunder whether accruing', 12.271381578947368),
 ('sellerexxonmobil selling affiliates shall', 12.227113641155386),
 ('upon sending written notice', 12.146143311866163),
 ('vss outstanding voting securities', 12.106280193236715),
 ('first refusal shall cease', 12.087021475256769),
 ('janssens confidential information hereunder', 12.052083333333334),
 ('maintenance services performed prior', 12.03544575725