# paice.py

"""This module contains an object that implements the Paice-Husk stemming
algorithm.
If you just want to use the standard Paice-Husk stemming rules, use the
module's ``stem()`` function::
    stemmed_word = stem(word)
If you want to use a custom rule set, read the rules into a string where the
rules are separated by newlines, and instantiate the object with the string,
then use the object's stem method to stem words::
    stemmer = PaiceHuskStemmer(my_rules_string)
    stemmed_word = stemmer.stem(word)
"""

from nltk import word_tokenize
import re
from collections import defaultdict


class PaiceHuskStemmer(object):
    """Implements the Paice-Husk stemming algorithm.

    The rule table is parsed once, at construction time, into ``self.rules``:
    a dictionary mapping the last letter of a suffix to the list of rules for
    suffixes ending in that letter, kept in the order they appear in the
    table. Each rule is a tuple ``(ending, intact, num, append, cont)``:

    * ``ending`` -- the suffix the rule applies to (e.g. ``"ing"``).
    * ``intact`` -- if true, the rule only fires on a word that no earlier
      rule has touched.
    * ``num`` -- number of characters removed from the end of the word
      (``0`` leaves the word as it is, i.e. "protects" the ending).
    * ``append`` -- text added after the removal (may be empty).
    * ``cont`` -- if true, stemming carries on with the new stem; if false,
      stemming stops as soon as the rule has fired.

    For example, with the rules ``gni3>`` and ``pp1.`` the word "hopping"
    first loses "ing" (giving "hopp", and stemming continues), then the
    double "pp" is reduced to "p" (giving "hop", and stemming stops).
    """

    # Matches one rule: the suffix written backwards, an optional "*"
    # ("intact only"), the number of characters to remove, an optional string
    # to append, and "." (stop) or ">" (continue). The pattern is not anchored
    # at the end, so anything after the rule on the same line is ignored.
    rule_expr = re.compile(r"""
    ^(?P<ending>\w+)
    (?P<intact>[*]?)
    (?P<num>\d+)
    (?P<append>\w*)
    (?P<cont>[.>])
    """, re.UNICODE | re.VERBOSE)

    # Matches the leading run of word characters of the input; only that part
    # is stemmed. Written as a raw string so "\w" is not an invalid escape.
    stem_expr = re.compile(r"^\w+", re.UNICODE)

    def __init__(self, ruletable):
        """
        :param ruletable: a string containing the rule data, separated
            by newlines.
        """
        self.rules = defaultdict(list)
        self.read_rules(ruletable)

    def read_rules(self, ruletable):
        """Parses ``ruletable`` and adds its rules to ``self.rules``.

        :param ruletable: a string containing one rule per line. Blank lines
            are skipped.
        :raises ValueError: if a non-blank line is not a valid rule.
        """
        rule_expr = self.rule_expr
        rules = self.rules

        for line in ruletable.split("\n"):
            line = line.strip()
            if not line:
                continue

            match = rule_expr.match(line)
            if not match:
                raise ValueError("Bad rule: %r" % line)

            # Rules spell the suffix backwards; flip it so that it can be
            # compared with str.endswith().
            ending = match.group("ending")[::-1]
            intact = match.group("intact") == "*"
            num = int(match.group("num"))
            append = match.group("append")
            cont = match.group("cont") == ">"

            # Index the rule by the final letter of the suffix, which is also
            # the final letter of any word the rule can apply to.
            rules[ending[-1]].append((ending, intact, num, append, cont))

    def first_vowel(self, word):
        """Returns the position of the first vowel in ``word``, or -1 if
        there is none.

        The first "y" in the word counts as a vowel unless it is the first
        letter of the word.
        """
        found = [p for p in (word.find(v) for v in "aeiou") if p > -1]
        vp = min(found) if found else -1

        yp = word.find("y")
        if yp > 0 and (vp == -1 or yp < vp):
            return yp
        return vp

    def strip_prefix(self, word):
        """Returns ``word`` with one leading prefix (such as "kilo" or
        "micro") removed, or ``word`` unchanged if it has none of the known
        prefixes.
        """
        for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega",
                       "nano", "pico", "pseudo"):
            if word.startswith(prefix):
                return word[len(prefix):]
        return word

    def stem(self, word):
        """Returns a stemmed version of the argument string.

        Only the leading run of word characters is considered; if the string
        does not start with a word character it is returned unchanged.
        """
        rules = self.rules

        match = self.stem_expr.match(word)
        if not match:
            # Nothing that looks like a word: nothing to stem.
            return word

        matched = match.group(0)
        # Drop a leading "kilo", "micro", etc. If the prefix is the whole word
        # keep the word itself, so that there is always something to stem.
        stem = self.strip_prefix(matched) or matched

        # True until the first rule fires; rules flagged "intact" may only be
        # applied while this is still true.
        is_intact = True
        continuing = True
        while continuing:
            pfv = self.first_vowel(stem)

            # Candidate rules are looked up by the last letter of the stem.
            rulelist = rules.get(stem[-1])
            if not rulelist:
                break

            # Stop unless a rule fires and asks to continue.
            continuing = False
            for ending, intact, num, append, cont in rulelist:
                if not stem.endswith(ending):
                    continue
                if intact and not is_intact:
                    continue

                # Length the stem would have after applying this rule.
                newlen = len(stem) - num + len(append)

                # Never reduce a word to nothing. Beyond that, if the word
                # starts with a vowel the minimum stem length is 2, and if it
                # starts with a consonant the minimum stem length is 3.
                if (newlen < 1
                        or (pfv == 0 and newlen < 2)
                        or (pfv > 0 and newlen < 3)):
                    continue

                is_intact = False
                # Slice by an explicit end position: "stem[:-num]" would
                # yield an empty string for the num == 0 "protect" rules.
                stem = stem[:max(len(stem) - num, 0)] + append

                continuing = cont
                break

        return stem


# The default rules for the Paice-Husk stemming algorithm, one rule per line.
#
# Each rule has the form matched by PaiceHuskStemmer.rule_expr:
#
#     <ending spelled backwards>[*]<num>[<append>]<"." or ">">
#
# - the ending is written in reverse (e.g. "gni" stands for "-ing");
# - an optional "*" marks a rule that is only applied to a word which no
#   earlier rule has changed;
# - <num> is the number of trailing characters to remove (the rules annotated
#   "protect" below use 0);
# - <append> is optional text to add after the removal;
# - "." stops stemming once the rule has fired, ">" lets it continue.
#
# The "{ ... }" text after each rule is a human-readable note. The rule regex
# is not anchored at the end of the line, so the note is ignored when parsing.
defaultrules = """
ai*2.     { -ia > -   if intact }
a*1.      { -a > -    if intact }
bb1.      { -bb > -b   }
city3s.   { -ytic > -ys }
ci2>      { -ic > -    }
cn1t>     { -nc > -nt  }
dd1.      { -dd > -d   }
dei3y>    { -ied > -y  }
deec2ss.  { -ceed > -cess }
dee1.     { -eed > -ee }
de2>      { -ed > -    }
dooh4>    { -hood > -  }
e1>       { -e > -     }
feil1v.   { -lief > -liev }
fi2>      { -if > -    }
gni3>     { -ing > -   }
gai3y.    { -iag > -y  }
ga2>      { -ag > -    }
gg1.      { -gg > -g   }
ht*2.     { -th > -   if intact }
hsiug5ct. { -guish > -ct }
hsi3>     { -ish > -   }
i*1.      { -i > -    if intact }
i1y>      { -i > -y    }
ji1d.     { -ij > -id   --  see nois4j> & vis3j> }
juf1s.    { -fuj > -fus }
ju1d.     { -uj > -ud  }
jo1d.     { -oj > -od  }
jeh1r.    { -hej > -her }
jrev1t.   { -verj > -vert }
jsim2t.   { -misj > -mit }
jn1d.     { -nj > -nd  }
j1s.      { -j > -s    }
lbaifi6.  { -ifiabl > - }
lbai4y.   { -iabl > -y }
lba3>     { -abl > -   }
lbi3.     { -ibl > -   }
lib2l>    { -bil > -bl }
lc1.      { -cl > c    }
lufi4y.   { -iful > -y }
luf3>     { -ful > -   }
lu2.      { -ul > -    }
lai3>     { -ial > -   }
lau3>     { -ual > -   }
la2>      { -al > -    }
ll1.      { -ll > -l   }
mui3.     { -ium > -   }
mu*2.     { -um > -   if intact }
msi3>     { -ism > -   }
mm1.      { -mm > -m   }
nois4j>   { -sion > -j }
noix4ct.  { -xion > -ct }
noi3>     { -ion > -   }
nai3>     { -ian > -   }
na2>      { -an > -    }
nee0.     { protect  -een }
ne2>      { -en > -    }
nn1.      { -nn > -n   }
pihs4>    { -ship > -  }
pp1.      { -pp > -p   }
re2>      { -er > -    }
rae0.     { protect  -ear }
ra2.      { -ar > -    }
ro2>      { -or > -    }
ru2>      { -ur > -    }
rr1.      { -rr > -r   }
rt1>      { -tr > -t   }
rei3y>    { -ier > -y  }
sei3y>    { -ies > -y  }
sis2.     { -sis > -s  }
si2>      { -is > -    }
ssen4>    { -ness > -  }
ss0.      { protect  -ss }
suo3>     { -ous > -   }
su*2.     { -us > -   if intact }
s*1>      { -s > -    if intact }
s0.       { -s > -s    }
tacilp4y. { -plicat > -ply }
ta2>      { -at > -    }
tnem4>    { -ment > -  }
tne3>     { -ent > -   }
tna3>     { -ant > -   }
tpir2b.   { -ript > -rib }
tpro2b.   { -orpt > -orb }
tcud1.    { -duct > -duc }
tpmus2.   { -sumpt > -sum }
tpec2iv.  { -cept > -ceiv }
tulo2v.   { -olut > -olv }
tsis0.    { protect  -sist }
tsi3>     { -ist > -   }
tt1.      { -tt > -t   }
uqi3.     { -iqu > -   } 
ugo1.     { -ogu > -og }
vis3j>    { -siv > -j  }
vie0.     { protect  -eiv }
vi2>      { -iv > -    }
ylb1>     { -bly > -bl }
yli3y>    { -ily > -y  }
ylp0.     { protect  -ply }
yl2>      { -ly > -    }
ygo1.     { -ogy > -og }
yhp1.     { -phy > -ph }
ymo1.     { -omy > -om }
ypo1.     { -opy > -op }
yti3>     { -ity > -   }
yte3>     { -ety > -   }
ytl2.     { -lty > -l  }
yrtsi5.   { -istry > - }
yra3>     { -ary > -   }
yro3>     { -ory > -   }
yfi3.     { -ify > -   }
ycn2t>    { -ncy > -nt }
yca3>     { -acy > -   }
zi2>      { -iz > -    }
zy1s.     { -yz > -ys  }
"""