In [2]:
import re  # For preprocessing
!pip install pandas
import pandas as pd  # For data handling
from time import time  # To time our operations
import spacy
from spacy.lang.ru.examples import sentences 
from collections import defaultdict  # For word frequency
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)





In [3]:
data = pd.read_csv('Сlean.csv')
df=data['description']
df.shape

(100723,)

In [4]:
df.head()

0    Прекрасная однокомнатная квартира в доме с лиф...
1    Светлая, просторная комната 19 кв. метров. Отд...
2    Первомайская, 15 мин. пешкомТёплая уютная квар...
3    в шаговой доступности парк Сокольники. , Разви...
4    Ильинское с.Все коммуникации центральные.Кварт...
Name: description, dtype: object

In [5]:
df.isnull().sum()

0

In [6]:
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.3.0/ru_core_news_sm-3.3.0-py3-none-any.whl (15.3 MB)
     ---------------------------------------- 15.3/15.3 MB 8.7 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('ru_core_news_sm')




In [7]:
nlp = spacy.load('ru_core_news_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

INFO - 01:25:39: Loading dictionaries from c:\Users\Daniel\anaconda3\lib\site-packages\pymorphy2_dicts_ru\data
INFO - 01:25:39: format: 2.4, revision: 417127, updated: 2020-10-11T15:05:51.070345


In [8]:
brief_cleaning = (re.sub("[^А-Яа-я]", ' ', str(row)).lower() for row in data['description'])

In [9]:
t = time()

#txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=2000, n_process=4)]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 26.03 mins


In [10]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(65113, 1)

In [11]:
df_clean.head(20)

Unnamed: 0,clean
0,прекрасный однокомнатный квартира дом лифт мус...
1,светлый просторный комната кв метр о...
2,первомайский мина пешкомт плая уютный к...
3,шаговый доступность парк сокольники развит...
4,ильинский коммуникация центральный квартира хо...
5,продаваться срочно однокомнатный квартира ремо...
6,знамя октябрь пос продаваться светлый уютный...
7,маяковская мина пешкомбаррикадная ...
8,продаваться двухкомнатный квартира общий площа...
9,домодедово успейте купить выгодно ...


In [30]:
df_clean.to_csv('PRE.csv',index=True)

In [12]:
from gensim.models.phrases import Phrases, Phraser

In [13]:
sent = [row.split() for row in df_clean['clean']]

In [14]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 01:56:15: collecting all words and their counts
INFO - 01:56:15: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 01:56:16: PROGRESS: at sentence #10000, processed 623358 words and 173825 word types
INFO - 01:56:17: PROGRESS: at sentence #20000, processed 1187655 words and 273413 word types
INFO - 01:56:18: PROGRESS: at sentence #30000, processed 1733814 words and 343709 word types
INFO - 01:56:20: PROGRESS: at sentence #40000, processed 2866897 words and 443755 word types
INFO - 01:56:22: PROGRESS: at sentence #50000, processed 3884567 words and 528139 word types
INFO - 01:56:23: PROGRESS: at sentence #60000, processed 4895880 words and 598383 word types
INFO - 01:56:24: collected 635089 token types (unigram + bigrams) from a corpus of 5313725 words and 65113 sentences
INFO - 01:56:24: merged Phrases<635089 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 01:56:24: Phrases lifecycle event {'msg': 'built Phrases<635089 vocab, min_count=30, 

In [15]:
bigram = Phraser(phrases)

INFO - 01:56:27: exporting phrases from Phrases<635089 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 01:56:29: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<3862 phrases, min_count=30, threshold=10.0> from Phrases<635089 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 2.09s', 'datetime': '2022-11-17T01:56:29.456241', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [16]:
sentences = bigram[sent]

In [17]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

46478

In [18]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['квартира',
 'дом',
 'кв',
 'этаж',
 'двор',
 'район',
 'школа',
 'комплекс',
 'жилой',
 'кухня']

In [19]:
import multiprocessing
from gensim.models import Word2Vec

In [20]:
cores = multiprocessing.cpu_count() # Количество ядер на компе

In [22]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

INFO - 01:58:33: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2022-11-17T01:58:33.119629', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [23]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 01:58:46: collecting all words and their counts
INFO - 01:58:46: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 01:58:47: PROGRESS: at sentence #10000, processed 519503 words, keeping 17344 word types
INFO - 01:58:48: PROGRESS: at sentence #20000, processed 990655 words, keeping 23260 word types
INFO - 01:58:49: PROGRESS: at sentence #30000, processed 1449491 words, keeping 27016 word types
INFO - 01:58:50: PROGRESS: at sentence #40000, processed 2315237 words, keeping 34080 word types
INFO - 01:58:52: PROGRESS: at sentence #50000, processed 3117085 words, keeping 39644 word types
INFO - 01:58:53: PROGRESS: at sentence #60000, processed 3902562 words, keeping 44006 word types
INFO - 01:58:53: collected 46478 word types from a corpus of 4245081 raw words and 65113 sentences
INFO - 01:58:53: Creating a fresh vocabulary
INFO - 01:58:54: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 9666 unique words (20.80% of original 46478, drops 36812

Time to build vocab: 0.12 mins


In [24]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


In [69]:
#Наиболее похожие слова на:
w2v_model.wv.most_similar(positive=["добираться"])

[('лавка', 0.2105065882205963),
 ('рублёво', 0.20606811344623566),
 ('красный_пресня', 0.20458590984344482),
 ('воспользоваться_программа', 0.20423823595046997),
 ('развивающемся', 0.19505524635314941),
 ('сложный', 0.19070512056350708),
 ('плюсы', 0.1820734292268753),
 ('рекордный_ипотека', 0.17914734780788422),
 ('приятный', 0.17847347259521484),
 ('единый_пространство', 0.1768987625837326)]

In [39]:
#Сходство слов
w2v_model.wv.similarity("дом", 'год',)

0.08323367

In [40]:
#Лишнее слово
w2v_model.wv.doesnt_match(['квартира', 'кухня', 'метро'])

'метро'

In [53]:
#Значимость слов 
w2v_model.wv.most_similar(positive=['этаж', 'квартира'], negative=['школа'], topn=3)

[('клиент', 0.20758584141731262),
 ('окн', 0.2021651715040207),
 ('популярный', 0.20012113451957703)]

In [70]:
w2v_model.wv.index_to_key

['квартира',
 'дом',
 'кв',
 'этаж',
 'двор',
 'район',
 'школа',
 'комплекс',
 'жилой',
 'кухня',
 'москва',
 'ремонт',
 'метро',
 'два',
 'детский_сад',
 'парк',
 'жк',
 'комната',
 'детский',
 'окно',
 'отделка',
 'новый',
 'просторный',
 'находиться',
 'год',
 'минута',
 'продаваться',
 'корпус',
 'хороший',
 'сделка',
 'шаговый_доступность',
 'территория',
 'станция_метро',
 'инфраструктура',
 'центр',
 'магазин',
 'студия',
 'площадь',
 'санузел',
 'комнатный',
 'квартал',
 'продажа',
 'площадка',
 'свободный_продажа',
 'взрослый_собственник',
 'общий_площадь',
 'отличный',
 'минута_пешком',
 'собственник',
 'зона',
 'современный',
 'больший',
 'место',
 'ипотека',
 'планировка',
 'поликлиника',
 'пеший_доступность',
 'уютный',
 'вид',
 'лоджия',
 'светлый',
 'подъезд',
 'подземный_паркинг',
 'спальня',
 'расположен',
 'отдых',
 'балкон',
 'собственность',
 'гостиная',
 'окно_выходить',
 'развитой_инфраструктура',
 'высота_потолок',
 'сторона',
 'бизнес_класс',
 'сделать',
 'боль