In [96]:
import io
import csv
import os
import pickle
import string
import operator
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from bs4 import BeautifulSoup

### Functions

In [2]:
def file_name_list(cate):
    """Return the relative paths of the files inside the ``./<cate>`` directory.

    Parameters
    ----------
    cate : str
        Name of a directory located in the current working directory.

    Returns
    -------
    list of str
        One ``'./<cate>/<entry>'`` string per directory entry, in the order
        reported by ``os.listdir``. macOS Finder metadata files
        (``.DS_Store``, matched case-insensitively) are left out.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the directory cannot be listed.
    """
    # Filter instead of list.remove(): removing a name that is not present
    # raises ValueError, and the metadata file is spelled '.DS_Store' on macOS,
    # so the comparison is done case-insensitively.
    return ['./{}/{}'.format(cate, name)
            for name in os.listdir('./{}'.format(cate))
            if name.lower() != '.ds_store']

In [3]:
def read_html(fileName):
    """Parse the UTF-8 HTML file at ``fileName`` into a BeautifulSoup tree.

    Parameters
    ----------
    fileName : str
        Path of the HTML file to read.

    Returns
    -------
    BeautifulSoup
        The document parsed with the built-in ``'html.parser'`` backend.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    # The parser reads the whole handle while the soup is being constructed,
    # so the file can be closed as soon as the constructor returns. The
    # context manager guarantees that even if parsing raises.
    with io.open(fileName, mode="r", encoding="utf-8") as html:
        soup = BeautifulSoup(html, 'html.parser')
    return soup

In [4]:
def read_stop_words(stopFileName):
    """Read a stop-word list stored one word per line in a UTF-8 text file.

    Parameters
    ----------
    stopFileName : str
        Path of the stop-word file.

    Returns
    -------
    list of str
        The lines of the file, in order, with the line terminator removed.
        Blank lines are kept as empty strings.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(stopFileName, mode="r", encoding="utf-8") as handle:
        # Strip only the newline: slicing off the last character
        # unconditionally would truncate the final word when the file does
        # not end with a newline.
        return [line.rstrip('\n') for line in handle]

In [5]:
def clean_text(s, stopWords):
    """Split ``s`` on single spaces and return the lower-cased tokens worth keeping.

    A token is kept when all of the following hold:

    * its length is between 5 and 16 characters inclusive;
    * it contains no ASCII punctuation other than parentheses, and none of
      the curly quotes ``’ “ ”``;
    * it has as many ``(`` as ``)``;
    * its lower-cased form is not a stop word (compared case-insensitively,
      so a capitalised stop word at the start of a sentence is removed too).

    Parameters
    ----------
    s : str
        Raw text to tokenise.
    stopWords : iterable of str
        Words to discard.

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order.
    """
    # All punctuation except '(' and ')', plus typographic quotes.
    puncs = string.punctuation.translate({ord('('): None, ord(')'): None}) + '’' + '“' + '”' + '\\'
    forbidden = set(puncs)
    # A set gives constant-time membership tests; lower-casing both sides
    # makes the stop-word comparison case-insensitive.
    stop = {word.lower() for word in stopWords}

    cleaned = []
    # NOTE(review): only ' ' is treated as a separator, so tokens joined by a
    # newline or tab stay glued together — confirm whether that is intended.
    for token in s.split(' '):
        if not 4 < len(token) <= 16:
            continue
        if not forbidden.isdisjoint(token):
            continue
        if token.count('(') != token.count(')'):
            continue
        lowered = token.lower()
        if lowered in stop:
            continue
        cleaned.append(lowered)
    return cleaned

In [18]:
def class_contents(soup, className):
    """Concatenate the text found inside every element carrying a CSS class.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document to search.
    className : str
        CSS class to look for.

    Returns
    -------
    str
        For each matching element, all of its text nodes joined together and
        followed by a single space; the per-element strings are concatenated
        in document order. Empty string when nothing matches.
    """
    elements = soup.find_all(class_=className)
    # `find_all(string=True)` is the current spelling of the deprecated
    # `findAll(text=True)`; both return every text node below the element.
    return ''.join(''.join(element.find_all(string=True)) + ' ' for element in elements)

In [84]:
def word_counter_vectorizer(text, wordsBag):
    """Count how often each entry of ``wordsBag`` occurs in ``text``.

    ``text.count`` does the counting, so a list of tokens yields exact-token
    counts while a plain string yields non-overlapping substring counts.

    Returns a list with one integer per ``wordsBag`` entry, in the same order.
    """
    return [text.count(term) for term in wordsBag]

In [112]:
def create_dataset(fileNameList, wordsBag, className='txt-body', stopWords='./stopWords.txt'):
    """Turn a list of HTML case files into labelled word-count rows.

    For every file the text inside elements of CSS class ``className`` is
    extracted, cleaned with ``clean_text`` and counted against ``wordsBag``.

    Parameters
    ----------
    fileNameList : list of str
        Paths of the HTML files, expected in the ``'./<category>/<name>'``
        form produced by ``file_name_list``.
    wordsBag : list of str
        Vocabulary; one count column is produced per entry.
    className : str, optional
        CSS class whose text content is analysed.
    stopWords : str, optional
        Path of the stop-word file (one word per line).

    Returns
    -------
    list of list of int
        One row per file: the ``len(wordsBag)`` counts followed by the label,
        which is 1 when the character at index 2 of the path is ``'A'`` and
        0 otherwise.
    """
    if not fileNameList:
        return []

    # The stop-word file does not change between documents, so read it once
    # rather than once per file.
    stopWordList = read_stop_words(stopWords)

    dataSet = []
    for file in fileNameList:
        soup = read_html(file)
        text = class_contents(soup, className)
        cleanText = clean_text(text, stopWordList)
        countVec = word_counter_vectorizer(cleanText, wordsBag)
        # In './A/...' index 2 is the first character of the category folder.
        label = 1 if file[2] == 'A' else 0
        dataSet.append(countVec + [label])
    return dataSet

### Load the HTML files

In [104]:
# Trade mark cases: every file found under ./A (create_dataset labels these 1).
cateA = 'A'
fileNameListA = file_name_list(cateA)

# Non-trade-mark cases: every file found under ./B (create_dataset labels these 0).
cateB = 'B'
fileNameListB = file_name_list(cateB)

# All case files in one list, category A first, then category B.
fileNameList = fileNameListA + fileNameListB

### StopWords

In [120]:
# Stop words, one per line of ./stopWords.txt. The create_dataset call below
# does not receive this list; it reads the same file through its default
# `stopWords` path argument.
stopWords = read_stop_words('./stopWords.txt')

### Bag of words

In [121]:
# Vocabulary used as feature columns: create_dataset emits one count per entry,
# in this order. The first entry is spelled 'likelihood' (not 'likelyhood') so
# that it can match the word as it is normally written.
wordsBag = [
    'likelihood', 'breach', 'trade', 'evidence', 'property',
    'public', 'opponents', 'constitution', 'fiduciary', 'confusion',
    'crimes', 'copyright', 'intellectual', 'misuse', 'death',
    'marks', 'constitutional', 'unregistered', 'contract', 'drugs',
    'proprietor', 'similar', 'penalty', 'company', 'criminal',
]

In [125]:
# Build one row per case file: a count for each wordsBag entry followed by the
# 0/1 class label (1 for files under ./A, 0 otherwise).
dataSet = create_dataset(fileNameList, wordsBag)

In [126]:
# Write the rows to dataNew.csv as comma-separated integers ('%i' format);
# the last column of each row is the class label.
np.savetxt("dataNew.csv", dataSet, fmt='%i', delimiter=",")