In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk import tokenize as tknz
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

from natasha import Doc, MorphVocab, Segmenter, NewsEmbedding, NewsMorphTagger


nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
class TextProcessor:
    
    def __init__(self,message=None,verbose=False):
        self._message= message
        self._verbose=verbose
        self.tokens=[]

    def set_message(self,msg):
        self._message = msg;
        if self._verbose: print ("  --> The message was set {}".format(self._message))
        return self

    def get_message(self):
        return self._message

    def remove_email_word(self,replace=" "):
        xtr = re.sub(pattern="@[\w]*",repl=replace,string=self._message)
        if self._verbose: print("  --> The @word was removed {} {} ".format(xtr,self._message))
        return self.set_message(msg=xtr)
    
    def remove_punctuation(self,replace=" "):
        xtr = re.sub(pattern=r'[^\w\s]',repl=replace,string=self._message)
        if self._verbose: print("  --> Removing punctuation: {} {}".format(xtr,self._message))
        return self.set_message(msg=xtr)

    def remove_special_sym(self,replace=" "):
        xtr = re.sub(pattern=r'[^a-zA-Z0-9]',repl=replace,string=self._message)
        if self._verbose: print ("  --> Removing special symbols: {} {}".format(xtr,self._message))
        return self.set_message(msg=xtr)
    
    def remove_numbers(self,replace=" "):
        xtr = re.sub(pattern=r'[^a-zA-Z]',repl=replace,string=self._message)
        if self._verbose: print("  --> Removing all numbers: {} {}".format(xtr,self._message))
        return self.set_message(msg=xtr)

    def to_lower (self ):
        xtr = self._message.lower();
        if self._verbose: print ("  --> Making to lower case: {} {}".format(xtr,self._message))
        return self.set_message(msg=xtr)

    def replace_by_dicts (self,dictionary: dict):
        for key in dictionary:
            xtr = self._message.replace(key,dictionary[key])
            #re.sub(pattern=key,repl=dictionary[key],string=self._message)
            if self._verbose:
                print ("  --> Replacing: {}->{} Result: before: {} after: {}".format(key,dictionary[key],self._message,xtr))
            self.set_message(msg=xtr)
        return self

    def escape_single_symbol_words (self):
        xtr=" ".join([word for word in self._message.split() if len(word)>1])
        if self._verbose:
            print ("  --> Escaping single symbol words: {} {}".format(xtr,self._message))
        return self.set_message(msg=xtr)

    def make_tokenization(self,tokenizer):
        self.tokens=tokenizer(self._message)
        if self._verbose:
            print ("  --> custom tokenizer was completed: {}".format(self.tokens))
        return self

    def nltk_word_tokenize(self):
        self.tokens=tknz.word_tokenize(self._message)
        if self._verbose:
            print ("  --> nltk.tokenize.word_tokenize was completed: {}".format(self.tokens))
        return self

    def nltk_word_punc_tokenize(self):
        self.tokens = tknz.wordpunct_tokenize(self._message)
        if self._verbose:
            print ("  --> nltk.tokenize.wordpunct_tokenize was completed: {}".format(self.tokens))
        return self

    def nltk_tok_tok_tokenizer(self):
        self.tokens = tknz.ToktokTokenizer().tokenize(self._message)
        if self._verbose:
            print ("  --> nltk.tokenize.ToktokTokenizer().tokenize was completed: {}".format(self.tokens))
        return self
        
    def nltk_tweet_tokenizer(self):
        self.tokens = tknz.TweetTokenizer().tokenize(self._message)
        if self._verbose:
            print ("  --> nltk.tokenize.TweetTokenizer().tokenize was completed: {}".format(self.tokens))
        return self

    def nltk_with_regexp_tokenizer (self,regexp):
        self.tokens = tknz.RegexpTokenizer(regexp).tokenize(self._message)
        if self._verbose:
            print ("  --> nltk.tokenize.RegexpTokenizer({}).tokenize was completed: {}".format(regexp,self.tokens))
        return self

    def nltk_sentence_tokenizer (self):
        self.tokens = nltk.sent_tokenize(self._message)
        if self._verbose:
            print ("  --> nltk.sent_tokenize() was completed: {}".format(self.tokens))
        return self

    def remove_stopwords_from_tokens (self,lang=None,is_new=True):
        if lang is None:
            sw = set(stopwords.words("english"))
        else :
            sw = set(stopwords.words(lang))
        tks = [token for token in self.tokens if token not in sw]
        if is_new:
            self.tokens_without_stops = tks
        else :
            self.tokens = tks
        return self

    def stemme_tokens (self,stemmer=None,is_new=True):
        if stemmer is None:
            stemmer = PorterStemmer()
        xtr = [stemmer.stem(token) for token in self.tokens]
        if is_new :
            self.stem_tokens = xtr
        else :
            self.tokens = xtr
        return self

    def lematize_tokens (self,lematizer=None,is_new=True):
        if lematizer is None:
            lematizer = WordNetLemmatizer()
        xtr = [lematizer.lemmatize(token) for token in self.tokens]
        if is_new:
            self.lem_tokens = xtr
        else :
            self.tokens = xtr
        return self

In [3]:
import pandas as pd

file_path_1 = "/home/alex/dev/AiLearning/DataSetStore/twitter_messages_2/negative.csv"
df1 = pd.read_csv(file_path_1,header=None,on_bad_lines='skip',sep=";")

file_path_2 = "/home/alex/dev/AiLearning/DataSetStore/twitter_messages_2/positive.csv"
df2 = pd.read_csv(file_path_2,header=None,on_bad_lines='skip',sep=";")

df = df1._append(df2)
df.head()
df_copy = df.copy()[0:300]

In [4]:


def make_natasha_tokens(text,
                        morphology_filter_set=None,
                        token_length_limit=None,
                        empty_token="empty",
                        verbose=False):
    segmenter = Segmenter()
    morph_tagger = NewsMorphTagger(NewsEmbedding())
    morph_vocab = MorphVocab()
    
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    selected_tokens=[]
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if verbose:
            print (" >> Input {} ".format(token))
        new_token = None
        if morphology_filter_set is None:
            new_token = token.lemma
        else:
            if token.pos in morphology_filter_set:
                new_token = token.lemma
        if new_token is not None and token_length_limit is not None:
            if len(new_token)<token_length_limit:
                new_token = None
        if new_token is not None:
            selected_tokens.append(new_token)
        if verbose:
            print (" >> Output {} ".format(new_token))
    if len(selected_tokens)==0 :
        selected_tokens.append(empty_token)
    return selected_tokens

make_natasha_tokens(text="Желаю хорошего полёта и удачной посадки,я буду",morphology_filter_set=('NOUN','VERB'), token_length_limit=3,verbose=True)


 >> Input DocToken(stop=5, text='Желаю', pos='VERB', feats=<Imp,Ind,Sing,1,Pres,Fin,Act>, lemma='желать') 
 >> Output желать 
 >> Input DocToken(start=6, stop=14, text='хорошего', pos='ADJ', feats=<Gen,Pos,Masc,Sing>, lemma='хороший') 
 >> Output None 
 >> Input DocToken(start=15, stop=21, text='полёта', pos='NOUN', feats=<Inan,Gen,Masc,Sing>, lemma='полет') 
 >> Output полет 
 >> Input DocToken(start=22, stop=23, text='и', pos='CCONJ', lemma='и') 
 >> Output None 
 >> Input DocToken(start=24, stop=31, text='удачной', pos='ADJ', feats=<Gen,Pos,Fem,Sing>, lemma='удачный') 
 >> Output None 
 >> Input DocToken(start=32, stop=39, text='посадки', pos='NOUN', feats=<Inan,Gen,Fem,Sing>, lemma='посадка') 
 >> Output посадка 
 >> Input DocToken(start=39, stop=40, text=',', pos='PUNCT', lemma=',') 
 >> Output None 
 >> Input DocToken(start=40, stop=41, text='я', pos='PRON', feats=<Nom,Sing,1>, lemma='я') 
 >> Output None 
 >> Input DocToken(start=42, stop=46, text='буду', pos='AUX', feats=<Imp,I

['желать', 'полет', 'посадка']

In [5]:


def process(txt: str,verbose=False):
    return TextProcessor(verbose=verbose)\
    .set_message(msg=txt)\
    .remove_email_word()\
    .remove_punctuation()\
    .to_lower()\
    .escape_single_symbol_words()\
    .remove_stopwords_from_tokens(lang='russian',is_new=False)\
    .make_tokenization(tokenizer=lambda txt: make_natasha_tokens(text=txt,morphology_filter_set=('NOUN','VERB'),token_length_limit=3))\
    .tokens
    

process(txt="Желаю хорошего полёта и удачной посадки,я буду",verbose=True)

  --> The message was set Желаю хорошего полёта и удачной посадки,я буду
  --> The @word was removed Желаю хорошего полёта и удачной посадки,я буду Желаю хорошего полёта и удачной посадки,я буду 
  --> The message was set Желаю хорошего полёта и удачной посадки,я буду
  --> Removing punctuation: Желаю хорошего полёта и удачной посадки я буду Желаю хорошего полёта и удачной посадки,я буду
  --> The message was set Желаю хорошего полёта и удачной посадки я буду
  --> Making to lower case: желаю хорошего полёта и удачной посадки я буду Желаю хорошего полёта и удачной посадки я буду
  --> The message was set желаю хорошего полёта и удачной посадки я буду
  --> Escaping single symbol words: желаю хорошего полёта удачной посадки буду желаю хорошего полёта и удачной посадки я буду
  --> The message was set желаю хорошего полёта удачной посадки буду
  --> custom tokenizer was completed: ['желать', 'полет', 'посадка']


['желать', 'полет', 'посадка']

In [6]:
df_copy['tokens'] = df_copy[3].apply(lambda txt: process(txt=txt,verbose=False))
df_copy.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,tokens
0,408906762813579264,1386325944,dugarchikbellko,на работе был полный пиддес :| и так каждое за...,-1,0,0,0,8064,111,94,2,"[работа, пиддес, закрытие, месяц, свихнуться]"
1,408906818262687744,1386325957,nugemycejela,"Коллеги сидят рубятся в Urban terror, а я из-з...",-1,0,0,0,26,42,39,0,"[коллега, сидеть, рубятся, винд, мочь]"
2,408906858515398656,1386325966,4post21,@elina_4post как говорят обещаного три года жд...,-1,0,0,0,718,49,249,0,"[говорить, обещаного, год, ждать]"
3,408906914437685248,1386325980,Poliwake,"Желаю хорошего полёта и удачной посадки,я буду...",-1,0,0,0,10628,207,200,0,"[желать, полет, посадка, скучать]"
4,408906914723295232,1386325980,capyvixowe,"Обновил за каким-то лешим surf, теперь не рабо...",-1,0,0,0,35,17,34,0,"[обновить, леший, работать, простоплеер]"


In [7]:
!pip install -U gensim

Collecting gensim
  Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading smart_open-6.4.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m846.6 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-6.4.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip instal

In [8]:
from gensim.models import *
from gensim import corpora

In [9]:
dictionary = corpora.Dictionary(df_copy['tokens'])

dictionary.filter_extremes(no_below = 10, no_above = 0.9, keep_n=None) 
dictionary.save('lenta.dict')

In [10]:
corpus = [dictionary.doc2bow(text) for text in df_copy['tokens']]
corpora.MmCorpus.serialize('lenta.model', corpus) 

In [11]:
%time lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=20, chunksize=50, update_every=1, passes=2)

CPU times: user 47.2 ms, sys: 1.97 ms, total: 49.1 ms
Wall time: 84 ms


In [13]:
lda.show_topics(num_topics=10, num_words=10, formatted=True)

[(8,
  '0.200*"любить" + 0.200*"мочь" + 0.200*"день" + 0.200*"хотеть" + 0.200*"нет"'),
 (12,
  '0.200*"любить" + 0.200*"мочь" + 0.200*"день" + 0.200*"хотеть" + 0.200*"нет"'),
 (16,
  '0.200*"любить" + 0.200*"мочь" + 0.200*"день" + 0.200*"хотеть" + 0.200*"нет"'),
 (10,
  '0.200*"любить" + 0.200*"мочь" + 0.200*"день" + 0.200*"хотеть" + 0.200*"нет"'),
 (2,
  '0.200*"любить" + 0.200*"мочь" + 0.200*"день" + 0.200*"хотеть" + 0.200*"нет"'),
 (15,
  '0.200*"любить" + 0.200*"мочь" + 0.200*"день" + 0.200*"хотеть" + 0.200*"нет"'),
 (6,
  '0.200*"любить" + 0.200*"мочь" + 0.200*"день" + 0.200*"хотеть" + 0.200*"нет"'),
 (19,
  '0.311*"хотеть" + 0.194*"нет" + 0.165*"мочь" + 0.165*"день" + 0.165*"любить"'),
 (17,
  '0.200*"любить" + 0.200*"мочь" + 0.200*"день" + 0.200*"хотеть" + 0.200*"нет"'),
 (9,
  '0.987*"нет" + 0.003*"мочь" + 0.003*"хотеть" + 0.003*"день" + 0.003*"любить"')]

In [14]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting numexpr (from pyLDAvis)
  Downloading numexpr-2.8.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Downloading numexpr-2.8.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (377 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m377.2/377.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: funcy, numexpr, pyLDAvis
Successfully installed funcy-2.0 numexpr-2.8.8 pyLDAvis-3.4.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m

In [16]:
import pyLDAvis
import pyLDAvis.gensim as gensimvis

%time vis_data = gensimvis.prepare(lda, corpus, dictionary, mds='mmds')
pyLDAvis.display(vis_data)

CPU times: user 80.2 ms, sys: 0 ns, total: 80.2 ms
Wall time: 540 ms


