# main.py
import base64
import json
import os
import re
import string
import urllib.request
from string import digits
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from ModelRecipe import ModelRecipe
from SQLquery import SQLWrapper
# When True, the crawl stops after the first recipe of the first listing page.
debug = False
# Directory used to build per-recipe JSON file paths; created at startup if missing.
folderRecipes = "./giallo_zafferano/Recipes"
def toDict(e):
    """Serialize ingredient/quantity pairs into a JSON object string.

    Each entry's first item becomes a key and its second item the value.
    In both, apostrophes are doubled and spaces are replaced by underscores.
    Every sanitized ingredient name is printed as it is processed.

    Args:
        e: Iterable of indexable entries holding two strings
            (ingredient name, quantity).

    Returns:
        The JSON text of the resulting mapping, with non-ASCII characters
        kept as-is. Later duplicates of a name overwrite earlier ones.
    """

    def _sanitize(text):
        # Double apostrophes and swap spaces for underscores.
        return text.replace("'", "''").replace(" ", "_")

    shoppingList = {}
    for entry in e:
        name = _sanitize(entry[0])
        print(name)
        shoppingList[name] = _sanitize(entry[1])
    return json.dumps(shoppingList, ensure_ascii=False)
def saveRecipe(linkRecipeToDownload, oldSoup):
    """Scrape one recipe page and submit it to the remote recipe database.

    Args:
        linkRecipeToDownload: URL of the recipe page to download.
        oldSoup: Parsed listing page that linked to the recipe; the rating
            is looked up in it through ``findRating``.

    Returns:
        None. Returns early, without contacting the database, when the JSON
        file path computed from the recipe title already exists.
    """
    soup = downloadPage(linkRecipeToDownload)
    title = findTitle(soup)
    rating = findRating(oldSoup, title)
    try:
        ratingOutOfRange = float(rating) > 5
    except ValueError:
        # The scraped text was not a number: use the same default rating
        # that findRating returns when nothing is found.
        rating = "3"
    else:
        if ratingOutOfRange:
            rating = "3.3"

    # NOTE: nothing in this function writes filePath (the createFileJson call
    # is disabled), so this only skips recipes whose file already exists.
    filePath = calculateFilePath(title)
    if os.path.exists(filePath):
        return

    ingredients = findIngredients(soup)
    description = findDescription(soup)
    category = findCategory(soup)
    imageBase64, imageURL = findImage(soup)

    modelRecipe = ModelRecipe()
    modelRecipe.title = title
    modelRecipe.ingredients = ingredients
    modelRecipe.description = description
    modelRecipe.category = category
    modelRecipe.imageBase64 = imageBase64
    modelRecipe.linkToRecipe = linkRecipeToDownload
    modelRecipe.imageURL = imageURL
    modelRecipe.rating = rating

    dbConnection = SQLWrapper()
    dbConnection.createConnection()
    try:
        dbConnection.setQueryType("insert")
        for k, v in modelRecipe.toDictionary().items():
            if v is None:
                # Missing values are sent as empty strings. The previous
                # check compared a type object to the string "NoneType"
                # and therefore never matched.
                v = ""
            elif k == "ingredients":
                v = toDict(v)
            elif k == "description":
                # Escape apostrophes in the description for the PHP backend.
                v = v.replace("'", "''")
            print(f"add {k}")
            dbConnection.addValue(k, v)
        dbConnection.sendRequest("POST", "/ricette.php/insert/ricette_rating")
    finally:
        # Close the connection even when building or sending the request fails.
        dbConnection.closeConnection()
def findRating(oldSoup: BeautifulSoup, title):
    """Look up the rating shown for a recipe on its listing page.

    Scans every ``ul.gz-card-data.top`` list for ``li.gz-single-data-recipe``
    items whose link carries ``title`` as its ``title`` attribute, and reads
    the text of the second such item.

    Args:
        oldSoup: Parsed listing page.
        title: Recipe title to match against the links' ``title`` attribute.

    Returns:
        The rating text with decimal commas turned into dots, or ``"3"``
        when no usable rating is found.
    """
    for ul_tag in oldSoup.find_all("ul", class_="gz-card-data top"):
        matching_li_tags = []
        for li in ul_tag.find_all("li", class_="gz-single-data-recipe"):
            anchor = li.find("a")
            if anchor is not None and anchor.get("title") == title:
                matching_li_tags.append(li)
        if len(matching_li_tags) < 2:
            # The rating is read from the second matching item; with fewer
            # matches, indexing used to raise IndexError.
            continue
        rating_text = matching_li_tags[1].find("a").text.strip()
        if rating_text:
            return rating_text.replace(",", ".")
    return "3"
def findTitle(soup):
    """Return the recipe title found on a recipe page.

    Args:
        soup: Parsed recipe page.

    Returns:
        The text of the last element whose class is
        ``gz-title-recipe gz-mBottom2x``, or ``""`` when there is none.
    """
    titleNodes = soup.find_all(attrs={"class": "gz-title-recipe gz-mBottom2x"})
    if not titleNodes:
        return ""
    return titleNodes[-1].text
def findIngredients(soup):
    """Extract the ingredient list from a recipe page.

    Args:
        soup: Parsed recipe page.

    Returns:
        A list of ``[name, quantity]`` pairs, one per ``gz-ingredient``
        element that contains a link. ``name`` is the lower-cased link text.
        ``quantity`` is the first child of the element's ``<span>`` with runs
        of whitespace collapsed to single spaces, or ``""`` when there is no
        span content.
    """
    allIngredients = []
    for tag in soup.find_all(attrs={"class": "gz-ingredient"}):
        anchor = tag.a
        if anchor is None:
            # No link means there is no ingredient name to record.
            continue
        nameIngredient = anchor.string
        if nameIngredient is None:
            # .string is None when the link has zero or several children;
            # fall back to the concatenated text instead of failing.
            nameIngredient = anchor.get_text()

        span = tag.span
        contents = span.contents[0] if span is not None and span.contents else ""
        if not isinstance(contents, str):
            # The first child is a nested tag rather than plain text.
            contents = contents.get_text()
        quantityProduct = re.sub(r"\s+", " ", contents).strip()
        allIngredients.append([nameIngredient.lower(), quantityProduct])
    return allIngredients
def findDescription(soup):
    """Build the preparation text of a recipe from its step blocks.

    For every ``gz-content-recipe-step`` element whose first ``<p>`` exists,
    the paragraph text is taken with all ASCII digits removed. The pieces are
    concatenated in document order without a separator.

    Args:
        soup: Parsed recipe page.

    Returns:
        The concatenated description, or ``""`` when no step has a paragraph.
    """
    digitStripper = str.maketrans("", "", digits)
    pieces = []
    for step in soup.find_all(attrs={"class": "gz-content-recipe-step"}):
        paragraph = step.p
        if not hasattr(paragraph, "text"):
            # Steps without a <p> (paragraph is None) contribute nothing.
            continue
        pieces.append(paragraph.text.translate(digitStripper))
    return "".join(pieces)
def findCategory(soup):
    """Return the recipe category read from the page breadcrumb.

    Only the first ``gz-breadcrumb`` element is inspected; the category is
    the string of the link inside its first ``<li>``.

    Args:
        soup: Parsed recipe page.

    Returns:
        The category string, or ``""`` when the breadcrumb, its list item,
        its link, or the link's single string is missing.
    """
    breadcrumb = soup.find(attrs={"class": "gz-breadcrumb"})
    if breadcrumb is None:
        # Previously this case fell off the end and returned None.
        return ""
    try:
        category = breadcrumb.li.a.string
    except AttributeError:
        return ""
    return category if category is not None else ""
def findImage(soup):
    """Locate the recipe's featured image and download it as base64 text.

    The image is searched in ``picture.gz-featured-image`` first and then in
    ``div.gz-featured-image-video.gz-type-photo``. Its URL is taken from the
    ``data-src`` attribute, falling back to ``src``.

    Args:
        soup: Parsed recipe page.

    Returns:
        A tuple ``(imageBase64, imageURL)``. Both are ``""`` when no image
        element or no image URL is found on the page.
    """
    pictures = soup.find("picture", attrs={"class": "gz-featured-image"})
    if pictures is None:
        pictures = soup.find(
            "div", attrs={"class": "gz-featured-image-video gz-type-photo"}
        )
    imageSource = pictures.find("img") if pictures is not None else None
    if imageSource is None:
        # Neither container (or no <img> inside it): nothing to download.
        return "", ""

    imageURL = imageSource.get("data-src")
    if imageURL is None:
        imageURL = imageSource.get("src")
    if not imageURL:
        return "", ""

    # b64encode returns ASCII bytes; decode them instead of slicing the
    # "b'...'" repr of the bytes object.
    imageToBase64 = base64.b64encode(requests.get(imageURL).content).decode("ascii")
    return imageToBase64, imageURL
def calculateFilePath(title):
    """Return the JSON file path associated with a recipe title.

    Args:
        title: Recipe title.

    Returns:
        ``folderRecipes`` joined with the lower-cased title, spaces replaced
        by underscores, and a ``.json`` suffix.
    """
    slug = "_".join(title.split(" ")).lower()
    return f"{folderRecipes}/{slug}.json"
def createFileJson(data, path):
    """Write ``data`` to ``path`` as UTF-8 encoded JSON.

    Args:
        data: JSON-serializable object.
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, mode="w", encoding="utf-8") as handle:
        serialized = json.dumps(data, ensure_ascii=False)
        handle.write(serialized)
def downloadPage(linkToDownload):
    """Fetch a URL and parse the response body as HTML.

    Args:
        linkToDownload: URL to request with HTTP GET.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.
    """
    pageHtml = requests.get(linkToDownload).text
    return BeautifulSoup(pageHtml, "html.parser")
def downloadAllRecipesFromGialloZafferano():
    """Walk every recipe listing page and save each recipe linked from it.

    The number of pages comes from ``countTotalPages``. For every element of
    class ``gz-title`` on a listing page, the linked recipe is passed to
    ``saveRecipe`` together with the listing page itself. When the module
    flag ``debug`` is set, the crawl stops after the first recipe.
    """
    lastPage = countTotalPages()
    progress = tqdm(range(1, lastPage + 1), desc="pages…", ascii=False, ncols=75)
    for pageNumber in progress:
        # Alternative listing URL kept from the original source:
        # "https://www.giallozafferano.com/latest-recipes/page" + str(pageNumber)
        listingUrl = f"https://www.giallozafferano.it/ricette-cat/page{pageNumber}"
        listingSoup = downloadPage(listingUrl)
        for titleTag in listingSoup.find_all(attrs={"class": "gz-title"}):
            recipeLink = titleTag.a.get("href")
            saveRecipe(recipeLink, listingSoup)
            if debug:
                break
        if debug:
            break
def countTotalPages():
    """Read the total number of listing pages from the category index.

    Returns:
        The integer text of the last element of class
        ``disabled total-pages`` on the index page, or 0 when the page has
        no such element.
    """
    # Alternative index URL kept from the original source:
    # "https://www.giallozafferano.com/latest-recipes/"
    indexSoup = downloadPage("https://www.giallozafferano.it/ricette-cat")
    pageCount = 0
    for marker in indexSoup.find_all(attrs={"class": "disabled total-pages"}):
        pageCount = int(marker.text)
    return pageCount
if __name__ == "__main__":
    # Create the recipes directory if it is missing, then start the crawl.
    recipesFolderExists = os.path.exists(folderRecipes)
    if not recipesFolderExists:
        os.makedirs(folderRecipes)
    downloadAllRecipesFromGialloZafferano()