In [1]:
import json
import pickle
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd

#### Functions to prepare dataset ####

In [None]:
# cleaning, stemming, tagging, removing stopwords
def clean_up(s):
    """Normalise raw text into a space-separated string of lowercase words.

    URLs are removed first, then the text is split into runs of word
    characters that are not digits (letters and underscores).  Tokens of two
    characters or fewer are discarded.

    Parameters
    ----------
    s : object
        Text to clean; it is coerced with ``str`` so non-string values are
        accepted.

    Returns
    -------
    str
        The surviving tokens, lowercased and joined by single spaces.
    """
    # Strip URLs before tokenising: the token pattern can never yield ':' or
    # '/', so a per-token ``startswith('http://')`` test never matches and URL
    # fragments ('http', 'example', 'com') would leak into the output.
    text = re.sub(r'https?://\S+', ' ', str(s))
    # Raw string so that \d and \W reach the regex engine unchanged.
    tokens = re.findall(r'[^\d\W]+', text)
    return ' '.join(token.lower() for token in tokens if len(token) > 2)

def remove_stopwords(l):
    """Drop Portuguese stopwords and numbering leftovers from a token list.

    Parameters
    ----------
    l : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens of ``l``, in order, that are neither in the Portuguese
        stopword list nor one of the extra markers added below.
    """
    # NOTE(review): `stopwords` is not defined in this cell; presumably it is
    # NLTK's stopword corpus reader imported elsewhere in the notebook -- confirm.
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    stop_words = set(stopwords.words('portuguese'))
    # Roman numerals and ordinal markers that survive the cleaning step.
    stop_words.update(('ii', 'iii', 'iv', 'º', 'ª', 'nº'))
    return [word for word in l if word not in stop_words]

# here we stem each word and POS-tag it; lemmatization does not work well for Portuguese
## the POS tagger is trained below as 'tagger', using an algorithm specific to the Portuguese language
def tag_stem(l):
    """POS-tag a list of tokens and stem every tagged word.

    The tokens are joined with spaces, re-tokenised with
    ``nltk.word_tokenize`` and tagged with the notebook-level ``tagger``;
    each tagged word is then stemmed with ``RSLPStemmer``.

    Parameters
    ----------
    l : list of str
        Tokens of one document.

    Returns
    -------
    list of tuple
        ``(stem, tag)`` pairs in token order; an empty list when there is
        nothing to tag.
    """
    # An empty document (e.g. everything was a stopword) used to crash on the
    # ``zip(*tagged)`` unpacking; return an empty result instead.
    if len(l) == 0:
        return []
    tagged = tagger.tag(nltk.word_tokenize(' '.join(l)))
    if not tagged:
        return []
    # Build the stemmer once per document rather than once per word.
    stemmer = RSLPStemmer()
    return [(stemmer.stem(word), tag) for word, tag in tagged]

# applying all
def set_up(x):
    """Run every preprocessing step, element-wise and in order, over ``x``.

    ``x`` must expose ``.apply`` (presumably a pandas Series of raw texts --
    confirm against the caller).  The steps are: ``clean_up``,
    ``word_tokenize``, ``remove_stopwords`` and ``tag_stem``; the result of
    the last ``.apply`` is returned.
    """
    pipeline = (clean_up, word_tokenize, remove_stopwords, tag_stem)
    result = x
    for step in pipeline:
        result = result.apply(step)
    return result

# this function cleans up the raw tags, mapping them onto universal tag names (NOUN, VERB, ...) so they are Portuguese-friendly
def convert_to_universal_tag(t, reverse=False):
    """Map a raw POS tag onto a universal tag name.

    The tag is reduced to its core label, lowercased and looked up in a fixed
    table.  A label missing from the table becomes "." when it consists only
    of punctuation characters (this includes the empty string); otherwise the
    lowercased label itself is returned.

    Parameters
    ----------
    t : str
        Raw tag; may contain "+", "|" or "#" separated fields.
    reverse : bool, default False
        When False, keep the second "+"-separated field, then the second
        "|"-separated field, then the text before the first "#".  When True,
        only keep the text before the first "|".

    Returns
    -------
    str
        The universal tag, ".", or the lowercased label.
    """
    # Source labels grouped by the universal tag they map to.
    universal_groups = (
        ("NOUN", ('n', 'prop', 'nprop', 'est', 'npro')),
        ("NUM", ('num', 'dad', 'tel', 'ap', 'hor', 'dat')),
        ("VERB", ('v-fin', 'v-inf', 'v-ger', 'v-pcp', 'vaux', 'v', 'vp', 'pcp')),
        ("PRON", ('pron-det', 'pron-indp', 'pron-pers', 'pro-ks-rel',
                  'proadj', 'propess', 'prosub', 'pro-ks')),
        ("DET", ('art',)),
        ("ADV", ('adv', 'adv-ks', 'pden', 'adv-ks-rel')),
        ("CONJ", ('conj-s', 'conj-c', 'conj-p')),
        ("ADJ", ('adj',)),
        ("PRT", ('ec',)),
        ("ADP", ('pp', 'prp', 'prep', 'prp-', 'kc', 'ks')),
        ("X", ('in', 'cur')),
    )
    tagdict = {
        source: universal
        for universal, sources in universal_groups
        for source in sources
    }

    # Compound numeric tags are collapsed before any splitting happens.
    numeric_compounds = ("N|AP", "N|DAD", "N|DAT", "N|HOR", "N|TEL")
    label = "NUM" if t in numeric_compounds else t

    if reverse:
        field_rules = (("|", 0),)
    else:
        field_rules = (("+", 1), ("|", 1), ("#", 0))
    for separator, position in field_rules:
        if separator in label:
            label = label.split(separator)[position]

    label = label.lower()
    # `punctuation` is read from the notebook scope (presumably
    # string.punctuation -- TODO confirm where it is imported).
    only_punctuation = all(char in punctuation for char in label)
    fallback = "." if only_punctuation else label
    return tagdict.get(label, fallback)

In [3]:
# bag of words
def bow(x, n=5000):
    """Build a one-row bag-of-words frame holding the ``n`` most frequent items.

    Parameters
    ----------
    x : iterable of iterables
        Documents, each an iterable of hashable items.  Items whose ``len``
        is 1 or less are ignored.
    n : int, default 5000
        Maximum number of distinct items to keep.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'values'`` with one column per kept item,
        ordered by decreasing count; the cell is the item's total count.
    """
    # Count in one pass; calling list.count() once per item is quadratic.
    counts = Counter(w for doc in x for w in doc if len(w) > 1)
    # sorted() is stable, so items with equal counts keep first-seen order.
    top = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:n]
    return pd.DataFrame(dict(top), index=['values'])

# check if words are in text
def find_features(document, vocabulary=None):
    """Flag which vocabulary entries occur in ``document``.

    Parameters
    ----------
    document : iterable
        Hashable items of one document.
    vocabulary : iterable, optional
        Entries to test.  When omitted, the columns of the notebook-level
        ``words`` object are used, which is the original behaviour.

    Returns
    -------
    dict
        ``{entry: bool}`` for every vocabulary entry, True when the entry is
        present in ``document``.
    """
    if vocabulary is None:
        # NOTE(review): the global `words` must have been rebound to an object
        # with `.columns` before this runs; the same name is also used by a
        # helper function in this notebook, so passing `vocabulary`
        # explicitly is the safer call style.
        vocabulary = words.columns
    present = set(document)
    return {entry: (entry in present) for entry in vocabulary}

# multiply two lists
def mult(a, b, c=None):
    """Multiply ``a`` and ``b`` position by position.

    Parameters
    ----------
    a, b : indexable
        Operands; ``b`` is indexed at every position ``0 .. len(a) - 1``.
    c : list, optional
        Existing list to append the products to.  A fresh list is created
        when omitted.

    Returns
    -------
    list
        ``c`` (or the new list) with ``a[i] * b[i]`` appended for each ``i``.
    """
    # A literal ``[]`` default is created once and shared by every call, so
    # products from earlier calls would pile up in later results.
    if c is None:
        c = []
    for i in range(len(a)):
        c.append(a[i] * b[i])
    return c

# separating words from tags
def words(l):
    """Return the first element of every two-item pair in ``l``.

    Parameters
    ----------
    l : iterable of 2-item sequences
        The pairs to split (``(word, tag)`` pairs, going by the local names).

    Returns
    -------
    list
        The first items, in order; an empty list for empty input (the
        previous ``zip(*l)`` unpacking raised ``ValueError`` there).
    """
    return [word for word, _tag in l]

def tags(l):
    """Return the lowercased second element of every two-item pair in ``l``.

    Parameters
    ----------
    l : iterable of 2-item sequences
        The pairs to split (``(word, tag)`` pairs, going by the local names);
        the second item must support ``.lower()``.

    Returns
    -------
    list
        The lowercased second items, in order; an empty list for empty input
        (the previous ``zip(*l)`` unpacking raised ``ValueError`` there).
    """
    return [tag.lower() for _word, tag in l]

# adding weights to each
def weight(l, d):
    """Look up each item of ``l`` in ``d``, using 0 for items without an entry.

    Parameters
    ----------
    l : iterable
        Keys to look up.
    d : mapping or other subscriptable
        Source of the weights, accessed as ``d[key]``.

    Returns
    -------
    list
        One value per item of ``l``: ``d[item]`` when the lookup succeeds,
        otherwise 0.
    """
    res = []
    for key in l:
        try:
            res.append(d[key])
        # Only failed lookups fall back to 0; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        except (KeyError, IndexError, TypeError):
            res.append(0)
    return res

# assign weights for vectors
def assign_weights(a, b, c=None):
    """Scale every entry of ``b`` by the matching entry of ``a``.

    Parameters
    ----------
    a : indexable
        Multipliers, one per position.
    b : indexable
        Array-likes; ``b[i]`` is converted with ``np.array`` and multiplied
        by ``a[i]``.
    c : optional
        Unused; kept only so existing calls that pass it keep working.

    Returns
    -------
    list
        ``(a[i] * np.array(b[i])).tolist()`` for every ``i`` in
        ``range(len(a))``; an empty list when ``a`` is empty.
    """
    # The previous version returned from inside the loop, so only position 0
    # was ever processed and empty input yielded None.
    return [(a[i] * np.array(b[i])).tolist() for i in range(len(a))]

In [4]:
# importing dataset and keeping only the unique, non-null judgement texts
df = pd.read_csv('decisions.csv', index_col='Unnamed: 0')
# Drop missing values on the frame itself: assigning `column.dropna()` back to
# the column re-aligns on the index and silently restores the NaN row.
uniq = (
    pd.DataFrame({'main': df['Main Judgement'].unique()})
    .dropna(subset=['main'])
    .assign(len=lambda frame: frame['main'].apply(str).apply(len))
)
# keep only texts longer than 50 characters
uniq = uniq[uniq['len'] > 50]