# Введение в обработку текста на естественном языке

In [11]:
import pandas as pd
import numpy as np 

## Лабораторная работа 7

### Расстояние редактирования

1.1 Загрузите предобработанные описания рецептов из файла `preprocessed_descriptions.csv`. Получите набор уникальных слов `words`, содержащихся в текстах описаний рецептов (воспользуйтесь `word_tokenize` из `nltk`). 

In [4]:
recipes = pd.read_csv("recipes_sample.csv")
descriptions = recipes['description']
descriptions

0        an original recipe created by chef scott meska...
1        my children and their friends ask for my homem...
2                  these were so go, it surprised even me.
3        my sister-in-law made these for us at a family...
4        i think a fondue is a very romantic casual din...
                               ...                        
29995    this is based on a french recipe but i changed...
29996    this is a traditional fresh plum cake, thought...
29997    this is a traditional late summer early fall s...
29998    this is a delicious soup that i originally fou...
29999    i've heard of the 'cookies by design' company,...
Name: description, Length: 30000, dtype: object

In [5]:
from nltk.tokenize import word_tokenize

In [7]:
words_set = set()
words_set.update(word_ for description_ in descriptions.str.lower().dropna() for word_ in word_tokenize(description_))

In [8]:
words = list(words_set)
words

['381956',
 'changeable',
 'makes:2',
 '//feaston.wordpress.com/2009/10/12/blushing-pork-loin/',
 'slaughtered',
 'shoot',
 'goe',
 'breakfast',
 '100mg',
 'instructions',
 'weekdays',
 'cult',
 'coordinating',
 'azteca',
 '//www.care2.com/greenliving/ten-mothers-garlic-and-spinach-soup.html',
 'cookie-recipe',
 'environment',
 'coolers',
 'watkins',
 'evenrecommend',
 'sym',
 '63g',
 'g.',
 'fuse',
 "'ll",
 'striped',
 'revitalised',
 'scharffen',
 'accustomed',
 'coleslaws',
 '//',
 'ck',
 '7.9',
 '//www.uga.edu/nchfp/how/can_home.html',
 'carries',
 'sophomore',
 'foldovers',
 'burro',
 'rusks',
 'tine',
 'optional-',
 'slug…',
 '07/17/2008',
 'inadvertently',
 'chain',
 'zarr',
 'highlands',
 'southern',
 'sue',
 'palettes',
 'fails',
 'appetiteforchina.com',
 'moistening',
 'constitute',
 '492956',
 'margarine',
 'thousandth',
 'crabs',
 'frying',
 'milk/ice',
 'key',
 '750',
 'hearted',
 '135158',
 'tour-caribbean',
 'lox',
 'cans',
 'saltine-type',
 'warmers',
 '1001',
 'www.man

1.2 Сгенерируйте 5 пар случайно выбранных слов и посчитайте между ними расстояние редактирования.

In [13]:
from nltk.metrics.distance import edit_distance

In [14]:
list_a = np.random.choice(words, 5)
list_b = np.random.choice(words, 5)

for word_1, word_2 in zip(list_a, list_b):
    print(f'{word_1}, {word_2} —> {edit_distance(word_1, word_2)}')

uncles, chronic —> 7
arzola, leveling —> 7
determined, hazelnuts —> 9
godden, praline —> 6
inherent, equivalents —> 8


1.3 Напишите функцию, которая для заданного слова `word` возвращает `k` ближайших к нему слов из списка `words` (близость слов измеряется с помощью расстояния Левенштейна)

In [23]:
def find_nearest(word_, count_):
    return sorted(words, key=lambda word_target: edit_distance(word_, word_target))[:count_]

In [30]:
word_ = 'califonia'
count_ = 7

In [31]:
find_nearest(word_, count_)

['california',
 'californian',
 'catalonia',
 'californians',
 'calzone',
 'caribana',
 'alioli']

In [33]:
import time

In [34]:
# Посмотрим время работы данной ф-ции

startt = time.time()
find_nearest(word_, count_)
endt = time.time()

print(f'Время выполнения ф-ции: {endt - startt}')

Время выполнения ф-ции: 5.9543867111206055


In [36]:
# Можно быстрее

In [35]:
import difflib

In [37]:
def find_nearest_faster(word_, count_):
    return difflib.get_close_matches(word_, words, count_)

In [38]:
# Посмотрим время работы данной ф-ции

startt = time.time()
find_nearest_faster(word_, count_)
endt = time.time()

print(f'Время выполнения ф-ции: {endt - startt}')

Время выполнения ф-ции: 0.3483924865722656


### Стемминг, лемматизация

2.1 На основе результатов 1.1 создайте `pd.DataFrame` со столбцами: 
    * word
    * stemmed_word 
    * normalized_word 

Столбец `word` укажите в качестве индекса. 

Для стемминга воспользуйтесь `SnowballStemmer`, для нормализации слов - `WordNetLemmatizer`. Сравните результаты стемминга и лемматизации.

In [41]:
from nltk import SnowballStemmer, WordNetLemmatizer

In [46]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Alice
[nltk_data]     Klimovich\AppData\Roaming\nltk_data...


True

In [47]:
snb_stemmer = SnowballStemmer('english')
lemmatizer_ = WordNetLemmatizer()

In [48]:
stemmed_words = [snb_stemmer.stem(word_) for word_ in words]
normalized_words = [lemmatizer_.lemmatize(word_) for word_ in words]

In [54]:
df_ = pd.DataFrame()
df_['words'] = words
df_['stemmed_word'] = stemmed_words
df_['normalized_word'] = normalized_words

df_.sample(10)

Unnamed: 0,words,stemmed_word,normalized_word
2653,austalian,austalian,austalian
12390,occupies,occupi,occupies
11871,quiches,quich,quiche
24739,cheeriors,cheerior,cheeriors
28726,hardcore,hardcor,hardcore
23717,oooos,oooo,oooos
12668,refined,refin,refined
15919,noel,noel,noel
23319,325°f,325°f,325°f
25702,butomg-this,butomg-thi,butomg-this


2.2. Удалите стоп-слова из описаний рецептов. Какую долю об общего количества слов составляли стоп-слова? Сравните топ-10 самых часто употребляемых слов до и после удаления стоп-слов.

In [55]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Alice
[nltk_data]     Klimovich\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [57]:
words = list()
words.extend(word_ for description_ in descriptions.str.lower().dropna() for word_ in word_tokenize(description_))

In [59]:
from collections import Counter

In [60]:
Counter(words).most_common(10)

[('.', 66166),
 ('the', 40257),
 (',', 38544),
 ('a', 35030),
 ('and', 30425),
 ('i', 27799),
 ('this', 27132),
 ('to', 23508),
 ('it', 23212),
 ('is', 20501)]

In [63]:
from nltk.corpus import stopwords

In [64]:
stopwords_ = stopwords.words('english')
without_stopwords = [word_ for word_ in words if word_ not in stopwords_]

In [66]:
Counter(without_stopwords).most_common(10)

[('.', 66166),
 (',', 38544),
 ('!', 16054),
 ('recipe', 15122),
 ("'s", 7688),
 ('make', 6367),
 ('time', 5198),
 ("n't", 4798),
 ('use', 4645),
 (')', 4587)]

In [72]:
((len(words)- len(without_stopwords))/len(words)) * 100

40.26511519694858

### Векторное представление текста

3.1 Выберите случайным образом 5 рецептов из набора данных. Представьте описание каждого рецепта в виде числового вектора при помощи `TfidfVectorizer`

In [153]:
five_descrip_ = descriptions.sample(5)
five_descrip_

2229     from cooking light, oct. 1999. \r\nthis health...
630      wow! doesn't taste light, but a 12 serving sli...
7373     from cook's country aug/sept 2007 edition.  th...
8132                   a tasty way to do brussels sprouts.
25984    this is a wonderfully refreshing and tasty sau...
Name: description, dtype: object

In [145]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [154]:
vectorizer_ = TfidfVectorizer()
words_vector_ = vectorizer_.fit_transform(five_descrip_)
print(words_vector_)

  (0, 31)	0.2053098626453843
  (0, 49)	0.2053098626453843
  (0, 52)	0.2053098626453843
  (0, 78)	0.2053098626453843
  (0, 18)	0.2053098626453843
  (0, 5)	0.2053098626453843
  (0, 6)	0.2053098626453843
  (0, 38)	0.33128540889255914
  (0, 60)	0.2053098626453843
  (0, 72)	0.2053098626453843
  (0, 24)	0.16564270444627957
  (0, 65)	0.16564270444627957
  (0, 40)	0.16564270444627957
  (0, 7)	0.13749840780819034
  (0, 35)	0.2053098626453843
  (0, 47)	0.16564270444627957
  (0, 33)	0.2053098626453843
  (0, 39)	0.11566799534091207
  (0, 9)	0.2053098626453843
  (0, 37)	0.2053098626453843
  (0, 69)	0.13749840780819034
  (0, 2)	0.2053098626453843
  (0, 46)	0.2053098626453843
  (0, 41)	0.16564270444627957
  (0, 20)	0.2053098626453843
  :	:
  (2, 32)	0.13822464387173108
  (3, 59)	0.43429718303084847
  (3, 10)	0.43429718303084847
  (3, 25)	0.43429718303084847
  (3, 71)	0.3503882327118585
  (3, 75)	0.43429718303084847
  (3, 66)	0.3503882327118585
  (4, 14)	0.21486636511193843
  (4, 70)	0.429732730223876

In [155]:
words_vector_ = vectorizer_.fit_transform(five_descrip_).toarray()
print(words_vector_)

[[0.         0.         0.20530986 0.         0.         0.20530986
  0.20530986 0.13749841 0.         0.20530986 0.         0.
  0.         0.         0.         0.         0.         0.
  0.20530986 0.         0.20530986 0.         0.         0.
  0.1656427  0.         0.         0.         0.         0.
  0.         0.20530986 0.1656427  0.20530986 0.         0.20530986
  0.         0.20530986 0.33128541 0.115668   0.1656427  0.1656427
  0.         0.         0.         0.         0.20530986 0.1656427
  0.         0.20530986 0.         0.         0.20530986 0.
  0.         0.         0.         0.         0.         0.
  0.20530986 0.         0.         0.         0.         0.1656427
  0.         0.         0.         0.13749841 0.         0.
  0.20530986 0.         0.         0.         0.         0.
  0.20530986]
 [0.24529829 0.24529829 0.         0.         0.24529829 0.
  0.         0.         0.         0.         0.         0.24529829
  0.         0.24529829 0.         0.2452

3.2 Вычислите близость между каждой парой рецептов, выбранных в задании 3.1, используя косинусное расстояние (`scipy.spatial.distance.cosine`) Результаты оформите в виде таблицы `pd.DataFrame`. В качестве названий строк и столбцов используйте названия рецептов.

In [156]:
len_ = len(five_descrip_)
matrix_similar = np.zeros((len_, len_))

In [157]:
from scipy.spatial.distance import cosine
from itertools import product # Декартово произведение

In [158]:
for i, j in product(range(len_), range(len_)):
    matrix_similar[i, j] = cosine(words_vector_[i], words_vector_[j])

In [159]:
similar_df = pd.DataFrame(data=matrix_similar, columns=five_descrip_, index=five_descrip_)
similar_df

description,"from cooking light, oct. 1999. \r\nthis healthful bread is full of goodness, and it tastes delicious, too! store in an airtight container in your refrigerator or freezer.","wow! doesn't taste light, but a 12 serving slice is only about 130 calories. great for chocolate craving fix.","from cook's country aug/sept 2007 edition. this muffin tastes like a coffee cake, it is so good! there is cinnamon sugar streusel in the middle and top of the muffin. very crunchy and sinful!",a tasty way to do brussels sprouts.,this is a wonderfully refreshing and tasty sauce for strawberries. quick and easy to make and very delicious! cook time is chilling time.
description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"from cooking light, oct. 1999. \r\nthis healthful bread is full of goodness, and it tastes delicious, too! store in an airtight container in your refrigerator or freezer.",0.0,0.951234,0.792966,1.0,0.864138
"wow! doesn't taste light, but a 12 serving slice is only about 130 calories. great for chocolate craving fix.",0.951234,0.0,0.973322,1.0,0.932235
"from cook's country aug/sept 2007 edition. this muffin tastes like a coffee cake, it is so good! there is cinnamon sugar streusel in the middle and top of the muffin. very crunchy and sinful!",0.792966,0.973322,0.0,1.0,0.789765
a tasty way to do brussels sprouts.,1.0,1.0,1.0,0.0,0.878518
this is a wonderfully refreshing and tasty sauce for strawberries. quick and easy to make and very delicious! cook time is chilling time.,0.864138,0.932235,0.789765,0.878518,0.0


3.3 Какие рецепты являются наиболее похожими? Прокомментируйте результат (словами).

In [160]:
# Находим наименьшие значения в каждой строке, исключая диагональные элементы
most_similar_recipes = similar_df.apply(lambda x: x[x != 0].min(), axis=1)

# Определяем, какие рецепты являются наиболее похожими
most_similar_pairs_ind = most_similar_recipes.min()
print(most_similar_recipes[most_similar_recipes == most_similar_pairs_ind])

description
from cook's country aug/sept 2007 edition.  this muffin tastes like a coffee cake, it is so good!  there is cinnamon sugar streusel in the middle and top of the muffin.  very crunchy and sinful!    0.789765
this is a wonderfully refreshing and tasty sauce for strawberries. quick and easy to make and very delicious! cook time is chilling time.                                                             0.789765
dtype: float64


Относительно высокое косинусное расстояние указывает на то, что описания содержат различные ключевые слова и фразы. 
Вероятно, небольшое сходство связано с использованием общих тем и структуры текста, таких как описание вкуса блюда, его текстуры, наличие короткого отзыва о самом процессе готовки/вкусе.