## Data preprocessing 食譜資料前處理

### Import necessary libraries and packages 導入必要套件

In [1]:
import matplotlib
import pandas as pd
import numpy as np
import json
import pymongo
import jieba
import string
import re
import os
import time
import random
import googletrans
import nltk
from IPython.display import clear_output
clear_output(wait=True)
from nltk.stem.lancaster import LancasterStemmer
from collections import Counter
pd.set_option('display.max_columns', 110)
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

### Connecting to mongoDB 資料庫連線

In [2]:
# Open a client connection to the MongoDB server.
client = pymongo.MongoClient('192.168.158.128', 27017)

In [3]:
# Select the `tibame` database.
db = client['tibame']
# Select the `recipe_raw` collection.
collection = db['recipe_raw']

In [4]:
# Empty filter: query every document in recipe_raw.
queryArgs = {}
# Only the fields named here are requested from the server.
projectField = {
    field: True
    for field in ('_id', 'url', 'title', 'author', 'ingredient', 'steps', 'category')
}
search_response = db.recipe_raw.find(filter=queryArgs, projection=projectField)

In [5]:
# Drain the cursor so every returned document is held in a list.
recipe_list = list(search_response)

In [6]:
# Total number of recipe documents collected in recipe_list.
len(recipe_list)

58392

# 設定辭典 : 
<!--     選用辭典
    辭典內容調整(切詞、刪除...) -->

In [7]:
# Load the stop-word list, one entry per line (trailing newline removed).
with open('./jieba_data/stopword.txt', 'r', encoding='utf-8') as file:
    stop_word = [line.strip('\n') for line in file.readlines()]

# Load the custom dictionary and drop the weight / part-of-speech suffix
# so that only the word itself remains.
with open('./jieba_data/mydict.txt', 'r', encoding='utf-8') as file:
    word_bank = []
    for line in file.readlines():
        bare_word = line.replace(' 10 n', '').replace(' 20 n', '')
        word_bank.append(bare_word.strip('\n'))

# Register the custom dictionary with jieba.
jieba.load_userdict('./jieba_data/mydict.txt')
# Make sure '嫩' and '豆腐' are cut apart.
jieba.suggest_freq(('嫩', '豆腐'), True)
# Make sure every dictionary entry is kept as a single token.
for word in word_bank:
    jieba.suggest_freq(word, True)
# Remove these words from the dictionary.
for unwanted in ('雞胸肉', '雞肉', '蛋黃', '全蛋', '雞蛋', '雞胸肉片'):
    jieba.del_word(unwanted)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\BIGDAT~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.735 seconds.
Prefix dict has been built successfully.


In [8]:
# Create the googletrans client used for every lookup below.
translator = googletrans.Translator()

# Map each word-bank entry to its lower-cased translation.
word_bank_en = {}
for zh_word in word_bank:
    translation = translator.translate(zh_word)
    word_bank_en[zh_word] = translation.text.lower()

In [9]:
def clear():
    """Clear the terminal screen.

    Runs ``cls`` when ``os.name`` is ``'nt'`` (Windows) and ``clear``
    everywhere else, because ``clear`` is not a Windows shell command.
    """
    os.system('cls' if os.name == 'nt' else 'clear')

In [None]:
word_list = []
total_match = []

# Target collection for the recipe vectors.
db = client.tibame
collection = db.recipe_vector_5

# Built once instead of once per recipe.
word_bank_set = set(word_bank)
# All-zero vector keyed by the translated word-bank entries.  Starting from
# zeros and then switching on the matched words keeps a feature at 1 even when
# another, unmatched word shares the same translation; an if/else pass over
# the whole word bank would let that later word overwrite the 1 with a 0.
zero_vector = dict.fromkeys(word_bank_en.values(), 0)

for n, recipe in enumerate(recipe_list):
    try:
        ingredient_step = recipe['ingredient'] + ',' + recipe['steps']
        url = recipe['url']
        title = recipe['title']
        author = recipe['author']
        category = recipe['category']
    except (KeyError, TypeError) as err:
        # Skip recipes with a missing or non-string field, but report it
        # instead of swallowing every exception (KeyboardInterrupt included).
        print('Skipping recipe', n, '-', repr(err))
        continue

    print('食譜編號: ', n)
    # Word-bank entries that occur in the segmented ingredient + steps text.
    matched = set(jieba.cut(ingredient_step, cut_all=False)) & word_bank_set

    if len(matched) > 4:
        # Food vector: 1 for every matched word-bank entry, 0 otherwise.
        food_vector = dict(zero_vector)
        for each_word in matched:
            food_vector[word_bank_en[each_word]] = 1

        # Form the recipe record.
        vector_dict = {
            'url': url,
            'title': title,
            'author': author,
            'vector': food_vector,
            'category': category,
        }

        # Save the record back to MongoDB.
        try:
            insert_result = collection.insert_one(vector_dict)
        except Exception as err:
            # Best effort: one failed insert must not stop the whole run,
            # but the failure is reported rather than hidden.
            print('Insert failed for recipe', n, '-', repr(err))
            continue

        print(insert_result)
        clear()
    else:
        print('The recipe is not good....')
        clear()

    # extend() appends in place; rebuilding the list with `+` on every
    # iteration copies it each time.
    total_match.extend(matched)

In [None]:
# Rank the distinct matched words from most to least frequent.
# Sorting with the constant key `total_match[1]` leaves the list unsorted and
# raises IndexError on a one-element list, so the counts are used instead.
ingredient_freq_list = [word for word, _ in Counter(total_match).most_common()]
ingredient_freq_list

## 以下是胡亂練習用的

### Translate vector features word-bank  把向量詞庫先翻譯

In [None]:
translator = googletrans.Translator()

# NOTE: word_bank_en is deliberately not rebound to a list here; the
# vector-building loop indexes word_bank_en as a dict keyed by word.

# Read the custom dictionary, keeping only the word on each line: the
# ' 10 n' / ' 20 n' weight and part-of-speech suffix is removed so that it is
# not sent to the translator along with the word.
with open('./jieba_data/mydict.txt', 'r', encoding='utf-8') as file:
    word_bank_list = [
        each.replace(' 10 n', '').replace(' 20 n', '').strip('\n')
        for each in file
    ]

# Append the lower-cased translation of every entry, one per line.
# The output file is opened once rather than once per word.
with open('./jieba_data/word_bank_en.txt', 'a', encoding='utf-8') as f:
    for each in word_bank_list:
        translated_result = translator.translate(each)
        f.write(translated_result.text.lower() + '\n')
    

### 解決單複數及時態變形

In [None]:
# Build a lookup dict whose keys are the lines of word_bank_en.txt
# (trailing newline removed), every key mapped to 0.
with open('./jieba_data/word_bank_en.txt', 'r', encoding='utf-8') as file:
    word_bank_dict = dict.fromkeys((line.strip('\n') for line in file), 0)

In [None]:
# Display the lookup dict built from word_bank_en.txt.
word_bank_dict

### Extracting ingredient and steps 擷取食譜內食材及步驟字串

In [None]:
# Create the googletrans client.
translator = googletrans.Translator()
# One PorterStemmer instance is enough for the whole run.
stemmer = nltk.PorterStemmer()
# Word-bank keys as a set, built once for the intersections below.
word_bank_keys = set(word_bank_dict)

for each_recipe in recipe_list[:20]:   # practise on the first 20 recipes only
    try:
        ingre_step = each_recipe['ingredient'] + ',' + each_recipe['steps']
        url = each_recipe['url']
        author = each_recipe['author']
        category = each_recipe['category']
    except (KeyError, TypeError):
        # A field is missing or is not text: skip this recipe.
        continue

    # Translate, strip punctuation and split on whitespace.
    translated_results = translator.translate(ingre_step)
    no_punctuation_result = translated_results.text.translate(str.maketrans('', '', string.punctuation))
    ingre_step_list = no_punctuation_result.split()
    print(ingre_step_list)
    print('================== Find intersection ===========================')

    # Match the word bank against both the lower-cased tokens and their
    # stems, then merge the two sets of hits.
    lowered = [word.lower() for word in ingre_step_list]
    original_matched_vec = set(lowered) & word_bank_keys
    stem_matched_vec = {stemmer.stem(word) for word in lowered} & word_bank_keys
    union_set = original_matched_vec | stem_matched_vec

    # Keep only recipes with more than 3 matched elements.
    if len(union_set) > 3:
        # 1 for every matched word-bank key, 0 otherwise.
        food_vector = {key: int(key in union_set) for key in word_bank_dict}
        print(food_vector)

        vector_dict = {
            'url': url,
            'author': author,
            'vector': food_vector,
            'category': category,
        }

        db = client.tibame
        collection = db.recipe_vector
        insert_result = collection.insert_one(vector_dict)
        print(insert_result)

    print("********************** Next one ********************************")
    # Pause 15 or 16 seconds between recipes (randrange excludes 17);
    # presumably this throttles the translation requests -- TODO confirm.
    time.sleep(random.randrange(15, 17))

## !!!!!!! 用正規表示法段文字 !!!!!!!!!!!!!!!!!!!!!!!

In [None]:
# Inspect the ingredient field of the recipe at index 5.
recipe_list[5]['ingredient']

## 設定結巴字典及斷字權重

In [None]:
# Count how many times each word appears in total_match.
seg_counter = Counter(total_match)

In [None]:
# Convert the Counter into a plain dict (word -> count) and display it.
counter_dict = dict(seg_counter)
counter_dict

In [None]:
# Order the words by their count, highest first; ties keep dict order.
ingredient_freq_list = sorted(counter_dict, key=counter_dict.get, reverse=True)
ingredient_freq_list

## 算總辭頻

In [None]:
# Load the stop-word list, one entry per line (trailing newline removed).
with open('./jieba_data/stopword.txt', 'r', encoding='utf-8') as file:
    stop_word = [each.strip('\n') for each in file]

# Load the custom dictionary and drop the weight / part-of-speech suffix.
with open('./jieba_data/mydict.txt', 'r', encoding='utf-8') as file:
    word_bank = [
        element.replace(' 10 n', '').replace(' 20 n', '').strip('\n')
        for element in file
    ]

# Register the custom dictionary with jieba and tune the segmentation.
jieba.load_userdict('./jieba_data/mydict.txt')
jieba.suggest_freq(('嫩', '豆腐'), True)
for word in word_bank:
    jieba.suggest_freq(word, True)

# Remove these words from the dictionary.
for unwanted in ('雞胸肉', '雞肉', '蛋黃', '全蛋', '雞蛋', '雞胸肉片'):
    jieba.del_word(unwanted)

# Set membership is O(1) per token; stop_word itself is left as a list.
stop_word_set = set(stop_word)

# Collect every non-stop-word token from the ingredient text of the first
# 1000 recipes.
total_list = []
ingredient = ''
for recipe in recipe_list[:1000]:
    ingredient = recipe.get('ingredient')
    if not isinstance(ingredient, str):
        # Missing or non-text ingredient field: nothing to segment.  This
        # explicit check replaces a bare `except: pass` around the loop body.
        continue
    # extend() appends in place; rebuilding the list with `+` on every
    # iteration copies it each time.
    total_list.extend(
        item
        for item in jieba.cut(ingredient, cut_all=False)
        if item not in stop_word_set
    )

In [None]:
# Display the collected tokens.
total_list

In [None]:
# Count how many times each token appears in total_list.
seg_counter = Counter(total_list)

In [None]:
# Convert the Counter into a plain dict (token -> count) and display it.
counter_dict = dict(seg_counter)
counter_dict

In [None]:
# Order the tokens by their count, highest first; ties keep dict order.
ingredient_freq_list = sorted(counter_dict, key=counter_dict.get, reverse=True)
ingredient_freq_list

In [None]:
# Create the googletrans Translator instance.
translator = googletrans.Translator()

# Create an NLTK PorterStemmer instance.
stemmer = nltk.PorterStemmer()

In [None]:
# Take the ingredient field of the first recipe as a sample and display it.
ingredient = recipe_list[0]['ingredient']
ingredient

In [None]:
# Translate the sample ingredient text and display the translated string.
ingredient_en = translator.translate(ingredient)
ingredient_en.text

In [None]:
# Split the translated text on commas and strip surrounding whitespace from
# each piece (no stemming is applied here).
ingredient_list = [item.strip() for item in ingredient_en.text.split(',')]
ingredient_list

In [None]:
# The word-bank keys as a list; iterating a dict yields its keys.
pattern = list(word_bank_dict)
pattern

In [None]:
# Ingredient pieces that are also word-bank keys.
matches = set(ingredient_list) & set(pattern)

In [None]:
# Display the matched ingredient pieces.
matches

### Translate CN to EN 中翻英

### Strings segmentation 斷字

### Matching to form recipe vector 配對成特徵向量

### Save processed recipe vector back to mongoDB 轉換好的資料存回資料庫（另開新的collection）