In [1]:
import re
import gensim
import logging
import nltk
import nltk.data
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models import word2vec
from nltk.tokenize import sent_tokenize, RegexpTokenizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import requests

url = "https://raw.githubusercontent.com/ancatmara/data-science-nlp/master/data/w2v/train/unlabeledTrainData.tsv"
response = requests.get(url)

with open('unlabeledTrainData.tsv', 'wb') as file:
    file.write(response.content)

In [3]:
data = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter="\t", quoting=3)

len(data)

50000

In [4]:
data.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [5]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [6]:
def review_to_wordlist(review, remove_stopwords=False ):
    # убираем ссылки вне тегов
    review = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", " ", review)
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = stopwords.words("english")
        words = [w for w in words if not w in stops]
    return(words)

In [7]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []

    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

In [8]:
sentences = []

print("Parsing sentences from training set ...")
for review in data["review"]:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set ...


  review_text = BeautifulSoup(review, "lxml").get_text()


In [9]:
print(len(sentences))
print(sentences[0])

529416
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends']


Обучаем и сохраняем модель. 


Основные параметры:
* данные должны быть итерируемым объектом 
* size — размер вектора, 
* window — размер окна наблюдения,
* min_count — мин. частотность слова в корпусе,
* sg — используемый алгоритм обучения (0 — CBOW, 1 — Skip-gram),
* sample — порог для downsampling'a высокочастотных слов,
* workers — количество потоков,
* alpha — learning rate,
* iter — количество итераций,
* max_vocab_size — позволяет выставить ограничение по памяти при создании словаря (т.е. если ограничение превышается, то низкочастотные слова будут выбрасываться). Для сравнения: 10 млн слов = 1Гб RAM.

**NB!** Обратите внимание, что тренировка модели не включает препроцессинг! Это значит, что избавляться от пунктуации, приводить слова к нижнему регистру, лемматизировать их, проставлять частеречные теги придется до тренировки модели (если, конечно, это необходимо для вашей задачи). Т.е. в каком виде слова будут в исходном тексте, в таком они будут и в модели.

In [10]:
from gensim.models import Word2Vec

In [11]:
print("Model training time ... ")

%time model_en = word2vec.Word2Vec(sentences, workers=4, vector_size=300, min_count=10, window=10, sample=1e-3)

Model training time ... 
CPU times: total: 1min 44s
Wall time: 28.5 s


In [12]:
len(model_en.wv)

28308

In [13]:
model_en.wv.most_similar(positive=["woman", "actor"], negative=["man"], topn=5)

[('actress', 0.7466329336166382),
 ('performer', 0.5464993119239807),
 ('actresses', 0.5036458969116211),
 ('ms', 0.4557781219482422),
 ('performance', 0.45309698581695557)]

In [14]:
model_en.wv.most_similar(positive=["dogs", "man"], negative=["dog"], topn=5)

[('men', 0.6447756886482239),
 ('criminals', 0.499918669462204),
 ('civilians', 0.4834405183792114),
 ('scientists', 0.4689888656139374),
 ('nazis', 0.46129292249679565)]

In [15]:
print(model_en.wv.most_similar("usa", topn=5))

[('europe', 0.733922004699707), ('germany', 0.709866464138031), ('uk', 0.7064135670661926), ('north', 0.6918818354606628), ('greece', 0.6868155598640442)]


In [16]:
print(model_en.wv.doesnt_match("apple banana man".split()))

man


In [17]:
model_en.wv.similarity('lion', 'rabbit')

0.28471678

In [18]:
# ! wget https://raw.githubusercontent.com/ancatmara/data-science-nlp/master/data/w2v/train/alice.txt

In [19]:
with open('alice.txt', 'r', encoding='utf-8') as file:
    text = file.read()

text = re.sub('\n', ' ', text)
sents = sent_tokenize(text)

In [20]:
punct = '!"”#$%&()*+,-./:;<=>?@[\\]^_`{|}~„“«»†*—/\\-‘’'
clean_sents = []

for sent in sents:
    s = [s.lower().strip(punct) for s in sent.split()]
    clean_sents.append(s)

print(clean_sents[:2])

[['through', 'the', 'looking-glass', 'by', 'lewis', 'carroll', 'chapter', 'i', 'looking-glass', 'house', 'one', 'thing', 'was', 'certain', 'that', 'the', 'white', 'kitten', 'had', 'had', 'nothing', 'to', 'do', 'with', 'it', '', 'it', 'was', 'the', 'black', 'kitten’s', 'fault', 'entirely'], ['for', 'the', 'white', 'kitten', 'had', 'been', 'having', 'its', 'face', 'washed', 'by', 'the', 'old', 'cat', 'for', 'the', 'last', 'quarter', 'of', 'an', 'hour', 'and', 'bearing', 'it', 'pretty', 'well', 'considering', 'so', 'you', 'see', 'that', 'it', 'couldn’t', 'have', 'had', 'any', 'hand', 'in', 'the', 'mischief']]


In [21]:
model_path = "movie_reviews.model"

print("Saving model...")
model_en.save(model_path)

Saving model...


In [22]:
model = word2vec.Word2Vec.load(model_path)

model.build_vocab(clean_sents, update=True)
model.train(clean_sents, total_examples=model.corpus_count, epochs=5)

(97526, 150225)

In [23]:
model.wv.similarity('lion', 'rabbit')

0.2906908

In [24]:
model.init_sims(replace=True)
model_path = "movies_alice.bin"

print("Saving model ...")
model_en.wv.save_word2vec_format(model_path, binary=True)

Saving model ...


  model.init_sims(replace=True)


In [25]:
model_en.wv.similarity('london', 'uk')

0.46422276

In [26]:
# ! wget https://raw.githubusercontent.com/ancatmara/data-science-nlp/master/data/w2v/evaluation/ru_analogy_tagged.txt

In [27]:
from nltk import FreqDist
from tqdm import tqdm_notebook as tqdm
from sklearn.manifold import TSNE

top_words = []

fd = FreqDist()
for s in tqdm(sentences):
    fd.update(s)

for w in fd.most_common(1000):
    top_words.append(w[0])

print(top_words[:50:])
top_words_vec = [model_en.wv[word] for word in top_words]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for s in tqdm(sentences):


  0%|          | 0/529416 [00:00<?, ?it/s]

['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this', 'that', 's', 'was', 'as', 'with', 'for', 'movie', 'but', 'film', 'you', 't', 'on', 'not', 'he', 'are', 'his', 'have', 'be', 'one', 'all', 'they', 'at', 'by', 'who', 'an', 'from', 'so', 'like', 'there', 'or', 'her', 'just', 'about', 'out', 'has', 'if', 'what', 'some', 'good', 'can']


In [28]:
%%time
import numpy as np

tsne = TSNE(n_components=2, random_state=0)
top_words_tsne = tsne.fit_transform(np.array(top_words_vec))

CPU times: total: 5.5 s
Wall time: 4.8 s


In [29]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE (eng model, top1000 words)")

source = ColumnDataSource(data=dict(x1=top_words_tsne[:,0],
                                    x2=top_words_tsne[:,1],
                                    names=top_words))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [30]:
from sklearn.decomposition import TruncatedSVD

svd_50 = TruncatedSVD(n_components=50)
top_words_vec_50 = svd_50.fit_transform(top_words_vec)
top_words_tsne2 = TSNE(n_components=2, random_state=0).fit_transform(top_words_vec_50)

In [31]:
output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE (eng model, top1000 words, +SVD)")

source = ColumnDataSource(data=dict(x1=top_words_tsne2[:,0],
                                    x2=top_words_tsne2[:,1],
                                    names=top_words))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

## FastText

FastText использует не только эмбеддинги слов, но и эмбеддинги n-грам. В корпусе каждое слово автоматически представляется в виде набора символьных n-грамм. Скажем, если мы установим n=3, то вектор для слова "where" будет представлен суммой векторов следующих триграм: "<wh", "whe", "her", "ere", "re>" (где "<" и ">" символы, обозначающие начало и конец слова). Благодаря этому мы можем также получать вектора для слов, отсутствующих в словаре, а также эффективно работать с текстами, содержащими ошибки и опечатки.

* [Статья](https://aclweb.org/anthology/Q17-1010)
* [Сайт](https://fasttext.cc/)
* [Тьюториал](https://fasttext.cc/docs/en/support.html)
* [Вектора для 157 языков](https://fasttext.cc/docs/en/crawl-vectors.html)
* [Вектора, обученные на википедии](https://fasttext.cc/docs/en/pretrained-vectors.html) (отдельно для 294 разных языков)
* [Репозиторий](https://github.com/facebookresearch/fasttext)

Есть библиотека `fasttext` для питона (с готовыми моделями можно работать и через `gensim`).

In [32]:
# ! git clone https://github.com/facebookresearch/fastText.git
# ! pip3 install fastText/.

In [46]:
with open('clean_text.txt', 'w', encoding='utf-8') as file:
    for s in sentences:
        file.write(' '.join(s))
        file.write('\n')

In [47]:
import fasttext 

ft_model = fasttext.train_unsupervised('clean_text.txt', minn=3, maxn=4, dim=300)

In [48]:
ft_model.get_word_vector("movie")

array([-2.72923023e-01,  4.07852158e-02, -8.74096230e-02, -1.27159402e-01,
       -5.28752804e-02,  1.04892030e-01, -1.41197070e-02,  9.04834419e-02,
       -2.99937720e-03,  1.23862855e-01,  1.03187457e-01,  2.78505683e-03,
        2.38806054e-01, -2.50266761e-01,  9.68079045e-02,  2.44842973e-02,
       -1.60062671e-01, -5.84143065e-02, -5.47989383e-02,  7.03189000e-02,
       -4.49193828e-02,  1.99977025e-01, -1.57394651e-02,  2.12564617e-02,
       -1.66952923e-01,  1.83455944e-02,  2.70068645e-04,  1.35680243e-01,
        3.89516354e-02, -1.93783883e-02,  1.48363039e-01, -1.11812316e-01,
       -1.37262449e-01,  6.08672015e-02,  1.31749704e-01, -1.82580147e-02,
       -5.18879071e-02, -2.16519400e-01,  1.40219972e-01,  6.50775209e-02,
       -1.35115191e-01,  2.17132375e-01,  7.00165555e-02, -4.19702053e-01,
        1.39736578e-01,  1.90288313e-02, -2.57452816e-01,  1.13450728e-01,
       -1.79584250e-01, -3.93727086e-02, -6.90501928e-02, -9.19841677e-02,
       -9.73226577e-02,  

In [49]:
ft_model.get_nearest_neighbors('actor')

[(0.6604437232017517, 'actors'),
 (0.5741051435470581, 'ctor'),
 (0.5713533759117126, 'tractor'),
 (0.5655781626701355, 'actress'),
 (0.5219190120697021, 'role'),
 (0.5113593339920044, 'reactor'),
 (0.4961804449558258, 'performance'),
 (0.4953829050064087, 'viktor'),
 (0.48770871758461, 'factor'),
 (0.4833109676837921, 'actresses')]

In [50]:
ft_model.get_nearest_neighbors('actr')

[(0.7605119943618774, 'actress'),
 (0.7503945827484131, 'actresses'),
 (0.6604633331298828, 'actors'),
 (0.6310917139053345, 'actor'),
 (0.5418269634246826, 'acte'),
 (0.5185579061508179, 'acting'),
 (0.47738519310951233, 'talented'),
 (0.4593389928340912, 'performers'),
 (0.45686453580856323, 'cast'),
 (0.456167995929718, 'performer')]

In [51]:
ft_model.get_nearest_neighbors('moviegeek')

[(0.6539474725723267, 'geek'),
 (0.6335132122039795, 'moviegoing'),
 (0.5995051264762878, 'movie'),
 (0.5993400812149048, 'moviegoer'),
 (0.5873931646347046, 'moviemaking'),
 (0.569153368473053, 'movies'),
 (0.564493715763092, 'moviegoers'),
 (0.5630878806114197, 'geeks'),
 (0.556845486164093, 'reek'),
 (0.5538786053657532, 'beek')]

In [52]:
ft_model.get_analogies("woman", "man", "actor")

[(0.6795867085456848, 'actress'),
 (0.5566461682319641, 'actresses'),
 (0.5238617658615112, 'actors'),
 (0.49344706535339355, 'seductress'),
 (0.4310092031955719, 'acting'),
 (0.42811039090156555, 'tractor'),
 (0.41991883516311646, 'womanly'),
 (0.4198024868965149, 'winslet'),
 (0.41217902302742004, 'sarsgaard'),
 (0.40355584025382996, 'valenzuela')]

Кроме этого, fastText можно использовать для классификации, для этого нужен следующий формат размеченных данных:

__label_1__  text_1

__label_2__  text_2

...

In [53]:
positive = pd.read_csv('../assets/positive.csv', sep=';', usecols=[3], names=['text'])
positive['label'] = ['positive'] * len(positive)
negative = pd.read_csv('../assets/positive.csv', sep=';', usecols=[3], names=['text'])
negative['label'] = ['negative'] * len(negative)
df = pd.concat([positive, negative], axis=0)
df.head()

Unnamed: 0,text,label
0,"@first_timee хоть я и школота, но поверь, у на...",positive
1,"Да, все-таки он немного похож на него. Но мой ...",positive
2,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,positive
3,"RT @digger2912: ""Кто то в углу сидит и погибае...",positive
4,@irina_dyshkant Вот что значит страшилка :D\nН...,positive


In [54]:
len(df)

229822

In [55]:
! pip install pymorphy2

Defaulting to user installation because normal site-packages is not writeable


In [61]:
import pymorphy2
from functools import lru_cache
from multiprocessing import Pool
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
import re

morphling_analyzer = pymorphy2.MorphAnalyzer()

regex = re.compile("[А-Яа-я:=!\)\()A-z\_\%/|]+")

def words_only(text, regex=regex):
    try:
        return regex.findall(text)
    except:
        return []

  regex = re.compile("[А-Яа-я:=!\)\()A-z\_\%/|]+")


ValueError: too many values to unpack (expected 4)