In [1]:
from nltk.stem import PorterStemmer

In [2]:
word_stemmer = PorterStemmer()

In [3]:
word_stemmer.stem('writing')

'write'

In [4]:
word_stemmer.stem('eating')

'eat'

In [5]:
from nltk.stem import LancasterStemmer

In [6]:
Lanc_stemmer = LancasterStemmer()

In [7]:
Lanc_stemmer.stem('eats')

'eat'

In [8]:
from nltk.stem import RegexpStemmer

In [9]:
# Anchor the pattern with `$` so only a trailing "ing" is stripped.
# RegexpStemmer removes every match of the regex from the word, so the
# unanchored 'ing' would also delete an "ing" in the middle of a word
# (e.g. "singing" -> "s").
Reg_stemmer = RegexpStemmer('ing$')

In [10]:
Reg_stemmer.stem('eating')

'eat'

In [11]:
from nltk.stem import SnowballStemmer

In [12]:
SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [13]:
French_stemmer = SnowballStemmer('french')

In [14]:
French_stemmer.stem('Bonjoura')

'bonjour'

In [15]:
from nltk.stem import WordNetLemmatizer

In [16]:
lemmatizer = WordNetLemmatizer()

In [17]:
lemmatizer.lemmatize('eating')

'eating'

In [18]:
lemmatizer.lemmatize('books')

'book'

In [19]:
lemmatizer.lemmatize('bookes')

'bookes'

In [20]:
lemmatizer.lemmatize('bookies')

'bookie'

In [21]:
import re

In [22]:
from nltk.corpus import wordnet

In [23]:
# Ordered (regex, replacement) pairs used to expand English contractions.
# Order matters: the irregular forms ("won't", "can't") must be handled
# before the generic "(\w+)n't" rule.
# Replacements that reuse the captured word must use the `\g<1>` group
# reference (raw string); without the backslash the literal text "g<1>"
# would be inserted instead of the captured word.
R_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
]

In [24]:
class REReplacer(object):
    """Apply an ordered list of regex substitutions to a piece of text.

    Each entry of ``pattern`` is a ``(regex, replacement)`` pair; the regex
    is compiled once at construction time and the replacement string uses
    ``re.sub`` replacement syntax.
    """

    def __init__(self, pattern=R_patterns):
        """Compile every regex in ``pattern`` and store the compiled pairs."""
        compiled_pairs = []
        for regex, repl in pattern:
            compiled_pairs.append((re.compile(regex), repl))
        self.pattern = compiled_pairs

    def replace(self, text):
        """Return ``text`` after running every substitution in order."""
        result = text
        # Each rule sees the output of the previous one.
        for compiled_regex, replacement in self.pattern:
            result = compiled_regex.sub(replacement, result)
        return result

In [25]:
rep_word = REReplacer()
rep_word.replace("I won't do it")

'I will not do it'

In [26]:
from nltk.tokenize import word_tokenize
rep_word = REReplacer()
word_tokenize("I won't be able to do this now")

['I', 'wo', "n't", 'be', 'able', 'to', 'do', 'this', 'now']

In [27]:
word_tokenize(rep_word.replace("I won't be able to do this now"))

['I', 'will', 'not', 'be', 'able', 'to', 'do', 'this', 'now']

In [28]:
class Rep_word_removal(object):
    """Collapse repeated characters in a word until WordNet recognises it.

    One duplicated character is removed per pass; the process stops as soon
    as ``wordnet.synsets`` returns a non-empty result for the current form,
    or when no doubled character is left to remove.
    """

    def __init__(self):
        # Matches a word containing a doubled character: prefix, the
        # character, the same character again (backreference), suffix.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # Rebuild the word with only one copy of the doubled character.
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return ``word`` with surplus repeated characters removed.

        Implemented as a loop rather than recursion: each pass removes a
        single character, so a recursive version needs one stack frame per
        removed character and raises RecursionError on long runs.
        """
        while not wordnet.synsets(word):
            shortened = self.repeat_regexp.sub(self.repl, word)
            if shortened == word:
                # Nothing left to collapse; give back what we have.
                break
            word = shortened
        return word

In [29]:
rep_word = Rep_word_removal()
rep_word.replace("Hiiiiiiiiiiiiiiiiii")

'Hi'

In [30]:
rep_word.replace('Helloooooooooooooo')

'Hello'

In [31]:
class word_syn_replacer(object):
    """Replace words using a fixed word -> synonym mapping."""

    def __init__(self, word_map):
        """Store ``word_map``, the mapping consulted by ``replace``."""
        self.word_map = word_map

    def replace(self, word):
        """Return the mapped synonym for ``word``, or ``word`` itself if unmapped."""
        synonym = self.word_map.get(word, word)
        return synonym

In [32]:
rep_syn = word_syn_replacer({'bday' : 'birthday'})
rep_syn.replace('bday')

'birthday'

In [44]:
import csv

In [45]:
class CSVword_syn_replacer(word_syn_replacer):
    """Synonym replacer whose word map is read from a two-column CSV file.

    Each non-empty row must contain exactly two fields: the word and its
    replacement. A row with any other number of fields raises ValueError.
    """

    def __init__(self, fname):
        """Load the word -> synonym map from the CSV file at ``fname``."""
        word_map = {}
        # `with` guarantees the file is closed; newline='' is what the csv
        # module expects so embedded newlines are handled by the reader.
        with open(fname, newline='') as csv_file:
            for row in csv.reader(csv_file):
                if not row:
                    # csv.reader yields [] for blank lines; skip them
                    # instead of failing on the two-value unpack below.
                    continue
                word, syn = row
                word_map[word] = syn
        super(CSVword_syn_replacer, self).__init__(word_map)
            

In [46]:
rep_syn = CSVword_syn_replacer('syn.csv')
rep_syn.replace('bday')

'Birthday'

In [83]:
!pip install PyYAML



In [84]:
import yaml

In [87]:
class YAMLword_syn_replacer(word_syn_replacer):
    """Synonym replacer whose word map is read from a YAML file."""

    def __init__(self, fname):
        """Load the word -> synonym map from the YAML file at ``fname``."""
        # `with` guarantees the file handle is closed after parsing.
        with open(fname) as yaml_file:
            word_map = yaml.safe_load(yaml_file)
        if word_map is None:
            # safe_load returns None for an empty document; fall back to an
            # empty map so replace() still works.
            word_map = {}
        super(YAMLword_syn_replacer, self).__init__(word_map)

In [88]:
rep_syn1 = YAMLword_syn_replacer('syn.yaml')
rep_syn1.replace('bday')

'Birthday'

In [89]:
class word_antonym_replacer(object):
    """Swap words for their WordNet antonym when that antonym is unambiguous."""

    def replace(self, word, pos = None):
        """Return the single antonym of ``word``, or None.

        Antonym names are gathered from every lemma of every synset that
        ``wordnet.synsets(word, pos=pos)`` returns. A result is given only
        when exactly one distinct antonym name was found.
        """
        candidates = {
            opposite.name()
            for synset in wordnet.synsets(word, pos = pos)
            for lemma in synset.lemmas()
            for opposite in lemma.antonyms()
        }
        if len(candidates) != 1:
            return None
        return candidates.pop()

    def replace_negations(self, sent):
        """Rewrite ``'not', X`` pairs in ``sent`` as the antonym of X.

        ``sent`` is a sequence of tokens. When a ``'not'`` token is followed
        by a token for which ``replace`` yields an antonym, both tokens are
        replaced by that antonym; every other token is copied unchanged.
        Returns a new list.
        """
        result = []
        total = len(sent)
        idx = 0
        while idx < total:
            token = sent[idx]
            idx += 1  # idx now points at the token after `token`
            if token == 'not' and idx < total:
                opposite = self.replace(sent[idx])
                if opposite:
                    result.append(opposite)
                    idx += 1  # also consume the negated token
                    continue
            result.append(token)
        return result

In [90]:
rep_antonym = word_antonym_replacer()
rep_antonym.replace('uglify')

'beautify'

In [91]:
sentence = ["Let us", "not", "uglify", "our", "country"]
rep_antonym.replace_negations(sentence)

['Let us', 'beautify', 'our', 'country']