<small><i>This notebook was put together by [Alexander Fridman](http://www.rocketscience.ai) and [Volha Hedranovich](http://www.rocketscience.ai) for the Lecture Course. Source and license info is on [GitHub](https://github.com/volhahedranovich/jupyter_lectures).</i></small>

In [40]:
import nltk


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/volha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/volha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/volha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Replacing words matching regular expressions


```python
import re

replacement_patterns = [  
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'let\'s', 'let us'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would')
]

class RegexpReplacer:
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
        
    def replace(self, text):
        s = text
        for pattern, repl in self.patterns:
            s = re.sub(pattern, repl, s)
        return s
            
replacer = RegexpReplacer()
replacer.replace("I should've done that thing I didn't do")
'I should have done that thing I did not do'
```

In [41]:
import re

replacement_patterns = [  
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'let\'s', 'let us'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would')
]

class RegexpReplacer:
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex, re.IGNORECASE), repl) for (regex, repl) in patterns]

    def replace(self, text):
        s = text
        for pattern, repl in self.patterns:
            s = re.sub(pattern, repl, s)
        return s


def replace_by_regexps(text):
    """
    Applies RegexpReplacer to provided text
    :param text: an input text
    :return: result of RegexpReplacer work
    """
    # TODO: your code is here
    

def replace_by_regexps(text):
    """
    Applies RegexpReplacer to provided text
    :param text: an input text
    :return: result of RegexpReplacer work
    """
    return RegexpReplacer().replace(text)


text = "Let's do some NLP staff!"
assert replace_by_regexps(text) == 'let us do some NLP staff!'

## Basic cleaning

For simplicity let's lowercase text and replace all non word characters with space symbol.

TODO: link or short regexp example

In [42]:
def clean_text(text):
    """
    Perfomes a basic text cleaning
    
    :param text: an input text
    :return: a cleaned text
    """
    # TODO: your code is here
    
    
def clean_text(text):
    """
    Perfomes a basic text cleaning
    
    :param text: an input text
    :return: a cleaned text
    """
    import re
    
    text = text.lower()
    text = re.sub('[^\w]', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip()
    return text


text = "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,"
assert clean_text(text) == 'lorem ipsum has been the industry s standard dummy text ever since the 1500s'

## Tokenization


```python
from nltk.tokenize import word_tokenize


sent = 'lorem ipsum has been the industry s standard dummy text ever since the 1500s'
word_tokenize(sent)
['lorem', 'ipsum', 'has', 'been', 'the', 'industry', 's', 'standard', 'dummy', 'text', 'ever', 'since', 'the', '1500s']
```

In [43]:
def tokenize_text(text):
    """
    Tokenizes text using word_tokenize from NLTK
    :param text: an input text
    :return: a list of tokens
    """
    # TODO: your code is here


def tokenize_text(text):
    """
    Tokenizes text using word_tokenize from NLTK
    :param text: an input text
    :return: a list of tokens
    """
    from nltk.tokenize import word_tokenize
    return word_tokenize(text, language='english')


sent = 'lorem ipsum has been the industry s standard dummy text ever since the 1500s'
tokens = tokenize_text(sent)
assert set(tokens) == {'ipsum', '1500s', 'the', 'since', 'text', 'been', 'ever',
                       'has', 'industry', 'lorem', 's', 'standard', 'dummy'}

## Removing repeated characters


```python
import re
from nltk.corpus import wordnet


class RepeatReplacer:
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        return repl_word
    
    
replacer = RepeatReplacer()
replacer.replace('goose')
'goose'
replacer.replace('looooove')
'love'
```

In [44]:
import re
from nltk.corpus import wordnet


class RepeatReplacer:
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        return repl_word
    

def remove_repeated_characters(text_tokens):
    """
    Removes repeated letters from tokens
    
    :param text_tokens: a list of text's tokens
    :return: tokens list
    """
    # TODO: your code is here
    
    
def remove_repeated_characters(text_tokens):
    """
    Removes repeated letters from tokens
    
    :param text_tokens: a list of text's tokens
    :return: tokens list
    """
    replacer = RepeatReplacer()
    return [replacer.replace(t) for t in text_tokens]


text_tokens = ['I', 'wooooould', 'like', 'to', 'showwww', 'you',
               'basic', 'text', 'preprocessing', 'stageeeeees']
assert remove_repeated_characters(text_tokens) == ['I', 'would', 'like', 'to', 'show',
                                            'you', 'basic', 'text', 'preprocesing', 'stagees']

## Stopwords removal


```python
from nltk.corpus import stopwords

en_stopwords = set(stopwords.words('english'))

tokens = ['lorem', 'ipsum', 'has', 'been', 'the', 'industry', 's', 'standard',
          'dummy', 'text', 'ever', 'since', 'the', '1500s']
tokens = [t for t in tokens if t not in en_stopwords]
```

In [45]:
def remove_stopwords(text_tokens):
    """
    Removes stopwords from a given list of tokens and words shorter than 3 chars
    
    :param text_tokens: a list of text's tokens
    :return: filtered tokens list
    """
    # TODO: your code is here
    

def remove_stopwords(text_tokens):
    """
    Removes stopwords from a given list of tokens and words shorter than 3 chars
    
    :param text_tokens: a list of text's tokens
    :return: filtered tokens list
    """
    from nltk.corpus import stopwords

    en_stopwords = set(stopwords.words('english'))
    return [t for t in text_tokens if t not in en_stopwords and len(t) >= 3]


tokens = ['lorem', 'ipsum', 'has', 'been', 'the', 'industry', 's', 'standard',
          'dummy', 'text', 'ever', 'since', 'the', '1500s']
assert remove_stopwords(tokens) == ['lorem', 'ipsum', 'industry', 'standard',
                                    'dummy', 'text', 'ever', 'since', '1500s']

## Spelling correction


```python
import enchant
from nltk.metrics import edit_distance


class SpellingReplacer:
    def __init__(self, dict_name='en', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist
    
    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word
    
    
replacer = SpellingReplacer()
replacer.replace('cookbok')
'cookbook'
```

In [46]:
import enchant
from nltk.metrics import edit_distance


class SpellingReplacer:
    def __init__(self, dict_name='en', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist

    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word

    
def correct_spelling(text_tokens):
    """
    Corrects spelling using enchant package
    :param text_tokens: an input tokens list
    :return: a token list
    """
    # TODO: your code is here
    

def correct_spelling(text_tokens):
    """
    Corrects spelling using enchant package
    :param text_tokens: an input tokens list
    :return: a token list
    """
    replacer = SpellingReplacer()
    return [replacer.replace(w) for w in text_tokens]


tokens = ['cookbokc', 'mother', 'fother', 'pythen']
assert correct_spelling(tokens) == ['cookbook', 'mother', 'other', 'python']

## Lemmatizing


```python
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('cooking', 'v')
'cook'
lemmatizer.lemmatize('texts', 'n')
'text'
```

In [47]:
def lemmatize(text_tokens):
    """
    Lemmatizies provided list of tokens
    :param text_tokens: an input tokens list
    :return: a token list
    """
    # TODO: your code is here


def lemmatize(text_tokens):
    """
    Lemmatizies provided list of tokens
    :param text_tokens: an input tokens list
    :return: a token list
    """
    from nltk.stem import WordNetLemmatizer
    
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(t, 'n') for t in text_tokens]

tokens = ['texts', 'books', 'tables', 'pythons']
assert lemmatize(tokens) == ['text', 'book', 'table', 'python']

## Adding synonyms

```python
from nltk.corpus import wordnet


synset = wordnet.synsets('dummy')[0]
synset.lemma_names()
['dummy', 'silent_person']
```

In [48]:
def add_synonyms(text_tokens, n_synonyms=2):
    """
    Adds synonyms to tokens list
    
    :param text_tokens: an input tokens list
    :param n_synonyms: count of synonyms to add
    :return: a token list
    """
    # TODO: your code is here
    
    
def add_synonyms(text_tokens, n_synonyms=2):
    """
    Adds synonyms to tokens list
    
    :param text_tokens: an input tokens list
    :return: a token list
    """
    import itertools
    from nltk.corpus import wordnet
    
    extended_tokens = []
    
    for token in text_tokens:
        synsets = wordnet.synsets(token)
        
        if synsets:
            synset = synsets[0]
            extended_tokens.extend(synset.lemma_names()[:n_synonyms])
        else:
            extended_tokens.append(token)
            
    return extended_tokens


tokens = ['lorem', 'ipsum', 'industry', 'standard', 'dummy', 'text', 'ever', 'since', '1500s']
assert set(add_synonyms(tokens)) == {'industry', 'lorem', 'since',
                                     'ever', 'of_all_time', 'ipsum',
                                     'text', 'criterion', 'standard',
                                     'textual_matter', 'dummy', 'silent_person',
                                     '1500s'}

## Classifing 20 news groups

#### Loading dataset

In [54]:
from sklearn.datasets import fetch_20newsgroups


dataset = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

X = dataset['data']
y = dataset['target']

#### Applying prepropcessing

In [56]:
def text_preprocessing_pipeline(X):
    from tqdm import tqdm_notebook
    
    X_processed = []
    
    for x in tqdm_notebook(X):
        x = replace_by_regexps(x)
        x = clean_text(x)
        x = tokenize_text(x)
        x = remove_repeated_characters(x)
        x = remove_stopwords(x)
        # x = correct_spelling(x) # disable spelling correction because of slow work
        x = lemmatize(x)
        x = add_synonyms(x)
        x = ' '.join(x)
        X_processed.append(x)
    
    return X_processed

X = text_preprocessing_pipeline(X)




#### Saving preprocessed data

In [1]:
import pickle


# with open('data.p', 'wb') as f:
#     pickle.dump((X, y), f)
    
with open('data.p', 'rb') as f:
    X, y = pickle.load(f)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

#### Train/test splitting

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

#### Building pipeline

In [35]:
def to_dense(x): return x.todense()

pipeline = make_pipeline(
    TfidfVectorizer(max_features=1000),
    FunctionTransformer(to_dense, accept_sparse=True), 
    StandardScaler(),
    RandomForestClassifier()
)

#### Encoding target

In [36]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [37]:
pipeline.steps

[('tfidfvectorizer',
  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=1000, min_df=1,
          ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('functiontransformer', FunctionTransformer(accept_sparse=True,
            func=<function to_dense at 0x7f577998b488>, inv_kw_args=None,
            inverse_func=None, kw_args=None, pass_y='deprecated',
            validate=True)),
 ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('randomforestclassifier',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_

#### Performing grid search cv

In [38]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

param_space = {
    'randomforestclassifier__n_estimators': [10, 100, 1000],
    'randomforestclassifier__max_depth': [5, 10, 20]
}

clf = GridSearchCV(pipeline, param_space, cv=StratifiedKFold(),
                   verbose=8, scoring='f1_weighted', n_jobs=-1)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=5 
[CV] randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=5 
[CV] randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=5 
[CV] randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=5 
[CV]  randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=5, score=0.327964226434883, total=   5.4s
[CV] randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=5 
[CV]  randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=5, score=0.33277935389575203, total=   5.6s
[CV] randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=5 
[CV]  randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=5, score=0.3240515333594717, total=   5.7s


  'precision', 'predicted', average, warn_for)


[CV] randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=5 
[CV]  randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=5, score=0.42969541506801245, total=   8.8s
[CV] randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=5 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=5, score=0.4002843081652222, total=   8.5s
[CV] randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=5 
[CV]  randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=5, score=0.40783087754931574, total=   8.4s
[CV] randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=10 
[CV]  randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=10, score=0.3850509945199527, total=   7.5s
[CV] randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=10 
[CV]  randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=10, score=0.38548455073537624, total=   7.5s
[CV] randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=10 


  'precision', 'predicted', average, warn_for)


[CV]  randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=10, score=0.3917534327675396, total=   5.8s
[CV] randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=10 


  'precision', 'predicted', average, warn_for)


[CV]  randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=5, score=0.4311177821722494, total=  42.2s
[CV] randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=10 


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   57.4s
  'precision', 'predicted', average, warn_for)


[CV]  randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=5, score=0.42529450336683833, total=  42.7s
[CV] randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=10 


  'precision', 'predicted', average, warn_for)


[CV]  randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=5, score=0.41846186513471556, total=  40.1s
[CV] randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=10 
[CV]  randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=10, score=0.4691883306293259, total=  11.1s
[CV] randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=10 
[CV]  randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=10, score=0.46087601731273115, total=  11.3s
[CV] randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=10 


  'precision', 'predicted', average, warn_for)


[CV]  randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=10, score=0.4498546031428102, total=  10.2s
[CV] randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=20 
[CV]  randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=20, score=0.44045995620091377, total=   6.1s
[CV] randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=20 
[CV]  randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=20, score=0.4446308648398147, total=   5.8s
[CV] randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=20 
[CV]  randomforestclassifier__n_estimators=10, randomforestclassifier__max_depth=20, score=0.43351115171103116, total=   5.9s
[CV] randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=20 


  'precision', 'predicted', average, warn_for)


[CV]  randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=20, score=0.5006072469711491, total=  15.4s
[CV] randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=20 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=10, score=0.46707726239575936, total= 1.0min
[CV] randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=20 
[CV]  randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=10, score=0.46221744742121973, total= 1.0min
[CV] randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=20 
[CV]  randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=20, score=0.486571769856957, total=  15.4s
[CV] randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=20 
[CV]  randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=10, score=0.46287162695989237, total= 1.1min
[CV] randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=20 
[CV]  randomforestclassifier__n_estimators=100, randomforestclassifier__max_depth=20, score=0.4868414808264797, total=  14.2s


[Parallel(n_jobs=-1)]: Done  24 out of  27 | elapsed:  2.6min remaining:   19.2s
  'precision', 'predicted', average, warn_for)


[CV]  randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=20, score=0.4980874139585865, total= 1.6min
[CV]  randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=20, score=0.49801481716337986, total= 1.5min
[CV]  randomforestclassifier__n_estimators=1000, randomforestclassifier__max_depth=20, score=0.4883461721278169, total= 1.6min


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  4.0min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'randomforestclassifier__n_estimators': [10, 100, 1000], 'randomforestclassifier__max_depth': [5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=8)

In [41]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.94      0.16      0.27        96
          1       0.66      0.44      0.53       117
          2       0.48      0.56      0.52       118
          3       0.49      0.50      0.50       118
          4       0.75      0.43      0.54       115
          5       0.77      0.57      0.66       119
          6       0.69      0.57      0.63       117
          7       0.71      0.50      0.58       119
          8       0.86      0.49      0.62       120
          9       0.57      0.39      0.46       119
         10       0.66      0.69      0.67       120
         11       0.79      0.63      0.70       119
         12       0.16      0.64      0.25       118
         13       0.30      0.62      0.41       119
         14       0.72      0.58      0.64       119
         15       0.56      0.81      0.66       120
         16       0.67      0.44      0.53       109
         17       0.81      0.65      0.72   