In [4]:
import os
import string
import operator
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup

In [5]:
# Collect the case files under ./3.
# The macOS Finder metadata file is spelled '.DS_Store' (capital S, no trailing
# space); it is not an HTML case, so drop it. os.listdir returns entries in
# arbitrary order, so sort to keep every downstream result reproducible.
filesList = sorted(name for name in os.listdir('./3') if name != '.DS_Store')
fileNameList = ['./3/' + name for name in filesList]

### Functions

In [6]:
def readHtml(fileName):
    """Parse the HTML file at `fileName` and return the BeautifulSoup tree.

    The file is opened with the platform default encoding and is closed as
    soon as the parser has consumed it.
    """
    with open(fileName) as html:
        # BeautifulSoup reads the handle inside its constructor, so the tree
        # stays usable after the `with` block has closed the file.
        soup = BeautifulSoup(html, 'html.parser')
    return soup

In [7]:
# Sample case used for the single-document exploration below. `soup` is the
# module-level parse tree that class_contents, list_classes and found_in_class read.
fileName = './3/02_OF_3.htm'
soup = readHtml(fileName)

In [8]:
def read_stop_words(stopFileName):
    """Return the stop words listed one per line in `stopFileName`.

    Only the line terminator is removed from each line, so the final word is
    kept intact even when the file does not end with a newline.
    """
    with open(stopFileName) as file:
        return [line.rstrip('\n') for line in file]

In [9]:
# Stop-word list; clean_soup and clean_text consult it through the module-level `stopWords`.
stopFileName = 'stopWords.txt'
stopWords = read_stop_words(stopFileName)

In [10]:
def class_contents(className):
    """Return the text of every element carrying CSS class `className`.

    Reads the module-level `soup`. The text nodes of each matching element are
    concatenated, and every element's text is followed by a single space.
    """
    pieces = []
    for element in soup.find_all(class_=className):
        pieces.append(''.join(element.findAll(text=True)))
        pieces.append(' ')
    return ''.join(pieces)

In [11]:
def list_classes():
    """Return every CSS class value used in the module-level `soup`.

    Values appear once per element that carries them, in document order, so
    the result contains duplicates.
    """
    classes = []
    for element in soup.find_all(class_=True):
        classes.extend(element["class"])
    return classes

In [12]:
def found_in_class(className, value):
    """Tell whether an element with class `className` has a text node equal to `value`.

    Reads the module-level `soup`. The comparison is against whole text nodes,
    not substrings of them.
    """
    for element in soup.find_all(class_=className):
        if value in element.findAll(text=True):
            return True
    return False

In [13]:
def found_in_text(lst, value):
    """Return how far from the END of `lst` the last occurrence of `value` sits.

    0 means `value` is the final element, 1 the second-to-last, and so on.
    Returns None when `value` is absent. Because 0 is a valid answer, callers
    must test the result with `is not None` rather than by truthiness.
    """
    for offset, item in enumerate(reversed(lst)):
        if item == value:
            return offset
    return None

In [14]:
def clean_soup(soup):
    """Return the filtered, lower-cased word tokens of a parsed document.

    The document text is split on whitespace and a token is kept only if it
    is 5 to 20 characters long, contains no punctuation other than
    parentheses, is not in the module-level `stopWords`, and has as many '('
    as ')'.

    NOTE(review): the stop-word test runs on the token before it is
    lower-cased, so capitalised stop words pass if `stopWords` is all
    lower-case. Confirm whether that is intended.
    """
    # Every ASCII punctuation mark except the parentheses, plus the typographic apostrophe.
    puncs = ''.join(ch for ch in string.punctuation if ch not in '()') + '’'
    cleaned = []
    for token in soup.get_text().split():
        if not 4 < len(token) <= 20:
            continue
        if any(p in token for p in puncs):
            continue
        if token in stopWords:
            continue
        if token.count('(') != token.count(')'):
            continue
        cleaned.append(token.lower())
    return cleaned

In [15]:
def clean_text(lst):
    """Return the filtered, lower-cased tokens of `lst`.

    Applies the same rules as clean_soup to an already-split token list: keep
    tokens that are 5 to 20 characters long, contain no punctuation other
    than parentheses, are not in the module-level `stopWords`, and have as
    many '(' as ')'.
    """
    # Every ASCII punctuation mark except the parentheses, plus the typographic apostrophe.
    puncs = string.punctuation.translate({ord('('): None, ord(')'): None}) + '’'
    # Filter the argument itself, not the notebook-global `case`.
    return [
        token.lower()
        for token in lst
        if 4 < len(token) <= 20
        and not any(p in token for p in puncs)
        and token not in stopWords
        and token.count('(') == token.count(')')
    ]

In [16]:
def get_classes(classesList):
    """Return the set of class names in `classesList` that contain 'Judg-Heading-'."""
    marker = 'Judg-Heading-'
    return {name for name in classesList if marker in name}

# A. Section of the law

In [17]:
print(class_contents('txt-body'))

Tribunal Appeal No 12 of 2015 29 March 2016 High Court  George Wei J   Prithipal Singh and Chow Jian Hong (Mirandah Law LLP) for the appellant; Melvin Pang (Amica Law LLC) for the respondent.   CAESARSTONE SDOT-YAM LTD — CERAMICHE CAESAR SpA  Evidence – Proof of evidence – Judicial notice – Applicant tendered Accounting and Corporate Regulatory Authority report showing number of entities whose names contained common denominator “Caesar” – Whether judicial notice should be taken of such fact Trade Marks and Trade Names – Registration criteria – Conflicts with earlier marks – Registration of trade mark opposed based on earlier trade mark – Whether trade mark similar – Whether likelihood of confusion arising – Whether trade mark well known – Sections 2(7), 2(8), 8(2)(b) and 8(4)(b)(i) Trade Marks Act (Cap 332, 2005 Rev Ed) 


# B. Conclusion exists?

In [18]:
# Does any 'Judg-Heading-1' element have a text node that is exactly 'Conclusion'?
className = 'Judg-Heading-1'
value = 'Conclusion'
found_in_class(className, value)

False

# Clean the text

In [19]:
# Whitespace-split every token of the document text, before any filtering.
case = soup.get_text().split()
print(len(case))

14151


In [20]:
# Replace the raw token list with the filtered, lower-cased tokens.
case = clean_soup(soup)
print(len(case))

4036


In [21]:
# Find the last 'conclusion' token and show the words that follow it.
lst = case
# clean_soup lower-cases every token, so the search term must be lower-case too.
value = 'conclusion'
idx = found_in_text(lst, value)
if idx is not None:
    # idx counts back from the end of the list; slicing from len(case) - idx
    # gives the words after the match, and an empty list when the match is last.
    print(case[len(case) - idx:])

# Make Dataset

In [22]:
# Pool the cleaned tokens of every case file and count them.
wordsBag = []
# Loop-local names on purpose: rebinding `fileName` / `soup` here would silently
# swap the sample case that the helper functions and later cells refer to.
for caseFileName in fileNameList:
    wordsBag += clean_soup(readHtml(caseFileName))
wordsBagHist = dict(Counter(wordsBag))
# (word, count) pairs, most frequent first.
sortedBag = sorted(wordsBagHist.items(), key=operator.itemgetter(1), reverse=True)

In [23]:
len(wordsBagHist)

10270

In [24]:
# Two candidate vocabularies: the 650 highest-count words, or every word seen more than 43 times.
mostFreqWords1 = sortedBag[:650]
mostFreqWords2 = [item for item in wordsBagHist.items() if item[1] > 43]

In [34]:
# A bare expression is only displayed when it is the last line of a cell, so print the length explicitly.
print(len(mostFreqWords1))
print(mostFreqWords1[0:10])
print(mostFreqWords2[0:10])

[('marks', 3359), ('trade', 3169), ('goods', 2188), ('application', 1666), ('court', 1473), ('earlier', 1420), ('similarity', 1397), ('evidence', 1255), ('singapore', 1102), ('section', 1046)]
[('access', 50), ('sgipos', 181), ('print', 47), ('number', 168), ('trade', 3169), ('decision', 486), ('intellectual', 161), ('property', 177), ('office', 112), ('singapore', 1102)]


In [35]:
# Keep only the words of the top-650 (word, count) pairs; this becomes the vocabulary.
wordsBag = [pair[0] for pair in mostFreqWords1]

In [36]:
# Drop citation / enumeration artefacts from the vocabulary. Filtering instead of
# list.remove keeps the cell safe to re-run and tolerant of a token being absent.
junkTokens = {'(refd)', '(iii)', 'slr(r)'}
wordsBag = [word for word in wordsBag if word not in junkTokens]

In [37]:
len(wordsBag)

647

In [38]:
def tf_vectorizer(cleanSoup, wordsBag, numLabels=3):
    """Build a term-frequency vector for one document.

    Parameters
    ----------
    cleanSoup : list of str
        Cleaned tokens of the document.
    wordsBag : list of str
        Vocabulary; entry i of the result is the share of `cleanSoup` tokens
        equal to wordsBag[i].
    numLabels : int, default 3
        Number of extra trailing slots, left at 0 for the caller to fill.

    Returns
    -------
    numpy.ndarray
        Float vector of length len(wordsBag) + numLabels. The frequencies are
        all zero when `cleanSoup` is empty.
    """
    # Size follows the vocabulary instead of a hard-coded 647.
    vec = np.zeros(len(wordsBag) + numLabels)
    length = len(cleanSoup)
    if length == 0:
        return vec
    # Count once up front rather than rescanning the document per vocabulary word.
    counts = Counter(cleanSoup)
    for idx, word in enumerate(wordsBag):
        vec[idx] = counts[word] / length
    return vec

In [39]:
# Term-frequency vector for the sample case; the trailing label slots are still zero here.
vec = tf_vectorizer(case, wordsBag)

In [40]:
def case_class(fileName):
    """Return the single character just before the last '.' of `fileName`.

    For './3/02_OF_3.htm' this is '3'. Raises ValueError when the name has no
    '.', and IndexError when nothing precedes the last '.'.
    """
    stem, dot, _extension = fileName.rpartition('.')
    if not dot:
        raise ValueError('substring not found')
    return stem[-1]

In [41]:
def has_conclusion(headingsList):
    """Tell whether any heading class in `headingsList` has a 'Conclusion' text node.

    Delegates each check to found_in_class and stops at the first hit.
    """
    return any(found_in_class(heading, 'Conclusion') for heading in headingsList)

### Add labels

In [43]:
# Fill the trailing label slots of `vec`. The vec[-3] "has conclusion" flag is
# disabled; `headingsList` is not defined in the cells shown above.
# vec[-3] = int(has_conclusion(headingsList))
vec[-2] = 1 if 'OS' in fileName else 0  # 1 when the module-level file path contains 'OS'
vec[-1] = case_class(fileName)  # case_class returns a one-character str; presumably a digit NumPy can convert — TODO confirm