In [806]:
class AbstractStemmer:
    def stem(self, word):
        pass
class PorterStemmer(AbstractStemmer):
    consonants = "bcdfghjklmnpqrstwxvzBCDFGHJKLMNPQRSTWXVZ"
    special_case = "yY"
    vowels = "aeiouAEIOU"
    
    def _divide_into_groups(self, word):
        groups = []
        preceding = ""
        for idx, letter in enumerate(word.lower()):
            if preceding == "":
                preceding = letter
            else:
                if self._compare_same_class(preceding, letter):
                    preceding+= letter
                    if idx == len(word)-1:
                        groups.append(preceding)
                else:
                    groups.append(preceding)
                    preceding = letter
                    if idx == len(word)-1:
                        groups.append(letter)
        return groups
    
    def _compare_same_class(self, l1, l2):
        if l1 in self.consonants and l2 in self.consonants:
            return True
        elif l1 in self.vowels and l2 in self.vowels:
            return True
        else:
            return False
        return False
    
    def _determine_class(self, group):
        if group[0] in self.consonants:
            return 'C'
        return 'V'
    
    def _encode_word(self, word):
        encoded = self._divide_into_groups(word)
        classified = [self._determine_class(group) for group in encoded]
        return classified
    
    def _det_m(self, word):
        classes = self._encode_word(word)
        if len(classes) < 2:
            return 0
        if classes[0] == 'C':
            classes = classes[1:]
        if classes[-1] == 'V':
            classes = classes[:len(classes)-1]
        m = len(classes)//2 if (len(classes)/2) >= 1 else 0
        return m
    
    def _chk_LT(self, stem, lt):
        if stem[-1] in lt:
            return True
        return False
    
    def _chk_v(self, stem):
        for a in stem:
            if a in self.vowels:
                return True
        return False
        
    def _chk_d(self, stem):
        for a in stem[-2:]:
            if a in self.consonants:
                result = True
            else:
                return False
        return result
    def _chk_o(self, stem):
        if (stem[-1] in self.consonants) & (stem[-2] in self.vowels) & (stem[-3] in self.consonants):
            if stem[-1] not in 'wxy':
                return True
        return False
    
    def _porter_step_1(self, word):
        stem = word
        #step 1a
        if word[-4:] == 'sses':
            stem = word[:-4] + 'ss'
        elif word[-3:] == 'ies':
            stem = word[:-3] + 'i'
        elif word[-2:] == 'ss':
            stem = word
        elif (word[-1] == 's') & (len(word) > 3):
            stem = word[:-1]
        #step 1b
        is_true = False
        if stem[-3:] == 'eed':
            tmp = stem[:-3]
            if self._det_m(tmp) > 0:
                stem = stem[:-1]
                is_true = True
        elif stem[-2:] == 'ed':
            tmp = stem[:-2]
            if self._chk_v(tmp):
                stem = stem[:-2]
                is_true = True
        elif stem[-3:] == 'ing':
            tmp = stem[:-3]
            if self._chk_v(tmp):
                stem = stem[:-3]
                is_true = True
        if is_true == True:
            if stem[-2:] == 'at':
                stem = stem[:-2] + 'ate'
            elif stem[-2:] == 'bl':
                stem = stem[:-2] + 'ble'
            elif stem[-2:] == 'iz':
                stem = stem[:-2] + 'ize'
            elif (self._chk_d(stem)):
                if not (self._chk_LT(stem, 'lsz')):
                    stem = stem[:-1]
            elif (self._det_m(stem) == 1) & (self._chk_o(stem)):
                stem = stem + 'e'
        #step 1c
        if stem[-1] == 'y':
            if self._chk_v(stem[:-1]):
                stem = stem[:-1] + 'i'
        return stem
    
    def _porter_step_2(self, stem):
        if stem[-7:] == 'ational':
            if self._det_m(stem[:-7]) > 0:
                stem = stem[:-7] + 'ate'               
        elif stem[-6:] == 'tional':
            if self._det_m(stem[:-6]) > 0:
                stem = stem[:-6] + 'tion'
        elif stem[-4:] == 'enci':
            if self._det_m(stem[:-4]) > 0:
                stem = stem[:-4] + 'ence'
        elif stem[-4:] == 'anci':
            if self._det_m(stem[:-4]) > 0:
                stem = stem[:-4] + 'ance'
        elif stem[-4:] == 'izer':
            if self._det_m(stem[:-4]) > 0:
                stem = stem[:-4] + 'ize'
        elif stem[-4:] == 'abli':
            if self._det_m(stem[:-4]) > 0:
                stem = stem[:-4] + 'able'
        elif stem[-4:] == 'alli':
            if self._det_m(stem[:-4]) > 0:
                stem = stem[:-4] + 'al'
        elif stem[-5:] == 'entli':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] + 'ent'
        elif stem[-3:] == 'eli':
            if self._det_m(stem[:-3]) > 0:
                stem = stem[:-3] + 'e'
        elif stem[-5:] == 'ousli':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] + 'ous'
        elif stem[-7:] == 'ization':
            if self._det_m(stem[:-7]) > 0:
                stem = stem[:-7] + 'ize'
        elif stem[-5:] == 'ation':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] + 'ate'
        elif word[-4:] == 'ator':
            if self._det_m(stem[:-4]) > 0:
                stem = stem[:-4] + 'ate'
        elif stem[-5:] == 'alism':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] + 'al'
        elif stem[-7:] == 'iveness':
            if self._det_m(stem[:-7]) > 0:
                stem = stem[:-7] + 'ive'
        elif stem[-7:] == 'fulness':
            if self._det_m(stem[:-7]) > 0:
                stem = stem[:-7] + 'ful'                
        elif stem[-7:] == 'ousness':
            if self._det_m(stem[:-7]) > 0:
                stem = stem[:-7] + 'ous'
        elif stem[-5:] == 'aliti':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] + 'al'
        elif stem[-5:] == 'iviti':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] + 'ive'
        return stem
    
    def _porter_step_3(self, stem):
        if stem[-5:] == 'icate':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] + 'ic'               
        elif stem[-5:] == 'ative':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] 
        elif stem[-5:] == 'alize':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] + 'al'
        elif stem[-5:] == 'iciti':
            if self._det_m(stem[:-5]) > 0:
                stem = stem[:-5] + 'ic'
        elif stem[-4:] == 'ical':
            if self._det_m(stem[:-4]) > 0:
                stem = stem[:-4] + 'ic'
        elif stem[-3:] == 'ful':
            if self._det_m(stem[:-3]) > 0:
                stem = stem[:-3] 
        elif stem[-4:] == 'ness':
            if self._det_m(stem[:-4]) > 0:
                stem = stem[:-4]
        return stem
    
    def _porter_step_4(self, stem):        
        if stem[-2:] == 'al':
            if self._det_m(stem[:-2]) > 1:                
                stem = stem[:-2]               
        elif stem[-4:] == 'ance':
            if self._det_m(stem[:-4]) > 1:
                stem = stem[:-4] 
        elif stem[-4:] == 'ence':
            if self._det_m(stem[:-4]) > 1:
                stem = stem[:-4]
        elif stem[-2:] == 'er':
            if self._det_m(stem[:-2]) > 1:
                stem = stem[:-2]
        elif stem[-2:] == 'ic':
            if self._det_m(stem[:-2]) > 1:
                stem = stem[:-2]
        elif stem[-4:] == 'able':
            if self._det_m(stem[:-4]) > 1:
                stem = stem[:-4] 
        elif stem[-4:] == 'ible':
            if self._det_m(stem[:-4]) > 1:
                stem = stem[:-4]
        elif stem[-3:] == 'ant':
            if self._det_m(stem[:-3]) > 1:
                stem = stem[:-3] 
        elif stem[-5:] == 'ement':
            if self._det_m(stem[:-5]) > 1:
                stem = stem[:-5]
        elif stem[-4:] == 'ment':
            if self._det_m(stem[:-4]) > 1:
                stem = stem[:-4]
        elif stem[-3:] == 'ent':
            if self._det_m(stem[:-3]) > 1:
                stem = stem[:-3]
        elif stem[-3:] == 'ion':         
            if self._det_m(stem[:-3]) > 1 & (self._chk_LT(stem[:-3], 'st')):
                stem = stem[:-3] 
        elif stem[-2:] == 'ou':
            if self._det_m(stem[:-2]) > 1:
                stem = stem[:-2]
        elif stem[-3:] == 'ism':
            if self._det_m(stem[:-3]) > 1:
                stem = stem[:-3]                
        elif stem[-3:] == 'ate':
            if self._det_m(stem[:-3]) > 1:
                stem = stem[:-3]                
        elif stem[-3:] == 'iti':
            if self._det_m(stem[:-3]) > 1:
                stem = stem[:-3] 
        elif stem[-3:] == 'ous':
            if self._det_m(stem[:-3]) > 1:
                stem = stem[:-3]
        elif stem[-3:] == 'ive':
            if self._det_m(stem[:-3]) > 1:
                stem = stem[:-3] 
        elif stem[-3:] == 'ize':
            if self._det_m(stem[:-3]) > 1:
                stem = stem[:-3] 
        return stem
    
    def _porter_step_5(self, stem):
        if stem[-1:] == 'e':
            if self._det_m(stem[:-1]) > 1:                
                stem = stem[:-1] 
        elif stem[-1:] == 'e':
            if self._det_m(stem[:-1]) == 1:
                if not (self._chk_o(stem[:-1])):
                    stem = stem[:-1]
        if self._det_m(stem[:-1]) > 1:
            if (self._chk_d(stem)) & (self._chk_LT(stem, 'l')):
                stem = stem[:-1]
        return stem
    
    def stem(self, word):
        stem = word.lower().strip()
        stem = self._porter_step_1(stem)
        stem = self._porter_step_2(stem)
        stem = self._porter_step_3(stem)
        stem = self._porter_step_4(stem)
        stem = self._porter_step_5(stem)
        return stem

In [807]:
print('Word monkey is divided in the following groups:', porter_stem._divide_into_groups('monkey'))
print('Word apparatus is divided in the following groups:', porter_stem._divide_into_groups('apparatus'))
print('Word eye is divided in the following groups:', porter_stem._divide_into_groups('eye'))

Word monkey is divided in the following groups: ['m', 'o', 'nk', 'e', 'y']
Word apparatus is divided in the following groups: ['a', 'pp', 'a', 'r', 'a', 't', 'u', 's']
Word eye is divided in the following groups: ['e', 'y', 'e']


In [808]:
print('Word monkey is divided in the following groups:', porter_stem._encode_word('monkey'))
print('Word apparatus is divided in the following groups:', porter_stem._encode_word('apparatus'))
print('Word eye is divided in the following groups:', porter_stem._encode_word('eye'))

Word monkey is divided in the following groups: ['C', 'V', 'C', 'V', 'V']
Word apparatus is divided in the following groups: ['V', 'C', 'V', 'C', 'V', 'C', 'V', 'C']
Word eye is divided in the following groups: ['V', 'V', 'V']


In [809]:
print('Word Tree is of size m:', porter_stem._det_m('Tree'))
print('Word by is of size m:', porter_stem._det_m('by'))
print('Word Trouble is of size m:', porter_stem._det_m('Trouble'))
print('Word oats is of size m:', porter_stem._det_m('oats'))
print('Word trees is of size m:', porter_stem._det_m('trees'))
print('Word ivy is of size m:', porter_stem._det_m('ivy'))
print('Word Troubles is of size m:', porter_stem._det_m('Troubles'))
print('Word private is of size m:', porter_stem._det_m('private'))
print('Word oaten is of size m:', porter_stem._det_m('oaten'))

Word Tree is of size m: 0
Word by is of size m: 0
Word Trouble is of size m: 1
Word oats is of size m: 1
Word trees is of size m: 1
Word ivy is of size m: 1
Word Troubles is of size m: 2
Word private is of size m: 2
Word oaten is of size m: 2


In [810]:
print('Check *V* :', porter_stem._chk_v('Apparatus'))
print('Check *S :', porter_stem._chk_LT('monkey', 'nul'))
print('Check *d :', porter_stem._chk_d('monkeybc'))
print('Check *o :', porter_stem._chk_o('monkek'))

Check *V* : True
Check *S : False
Check *d : True
Check *o : True


In [811]:
import string

text = "I love programming and say computers what has to be done!"
for punct in string.punctuation:
    text = text.replace(punct, '')
    
text = text.replace('\n', '')
stemmer = PorterStemmer()
not_stemmed = set()
stemmed = set()
for word in text.split():
    not_stemmed.add(word)
for word in text.split():
    stemmed.add(stemmer.stem(word))
print(stemmed)

{'to', 'be', 'has', 'done', 'sai', 'program', 'love', 'and', 'what', 'comput', 'i'}
