## Part 2 of creating auto-populate feature

## Use the recipe URLs to get the actual recipes

In [4]:
# Get the recipe scraper here: https://github.com/hhursev/recipe-scrapers
!pip install recipe-scrapers



In [5]:
from recipe_scrapers import scrape_me

## Open CSV and read to list

In [6]:
import csv

In [20]:
with open('allrecipes_urls.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    url_list = []
    
    for row in reader:
        url_list.append(row)

In [21]:
# Check the length; should be 70964
len(url_list)

70964

In [22]:
# Look at first 20
url_list[:20]

[['https://www.allrecipes.com/recipe/16956/death-by-chocolate-mousse/'],
 ['https://www.allrecipes.com/recipe/25875/shrimp-and-crabmeat-loaf/'],
 ['https://www.allrecipes.com/recipe/239335/senate-bean-soup-from-idahoan/'],
 ['https://www.allrecipes.com/recipe/9689/nutmeg-refrigerator-cookies/'],
 ['https://www.allrecipes.com/recipe/246207/french-toast-sandwich/'],
 ['https://www.allrecipes.com/recipe/117539/slow-cooker-chicken-curry-with-quinoa/'],
 ['https://www.allrecipes.com/recipe/25652/cozy-cottage-beef-stew-soup/'],
 ['https://www.allrecipes.com/recipe/13825/peanut-pie/'],
 ['https://www.allrecipes.com/recipe/16238/emilys-famous-sloppy-joes/'],
 ['https://www.allrecipes.com/recipe/268872/vegan-gluten-free-chocolate-chip-cookies-without-coconut-oil/'],
 ['https://www.allrecipes.com/recipe/238317/honey-dressing-with-poppy-seeds/'],
 ['https://www.allrecipes.com/recipe/231537/grilled-halibut/'],
 ['https://www.allrecipes.com/recipe/217377/spaghetti-bolognese/'],
 ['https://www.allre

In [23]:
# I don't want to overwhelm the allrecipes.com servers so I'm going to include a pause between scrapes.
import time

## Check to make sure everything is working

In [24]:
scraper = scrape_me(url_list[0][0])

In [25]:
scraper.title()

'Death by Chocolate Mousse'

In [26]:
scraper.ingredients()

['21 chocolate sandwich cookies, crushed',
 '¼ cup butter, softened',
 '1 cup heavy cream',
 '1 (12 ounce) package semisweet chocolate chips',
 '1 teaspoon vanilla extract',
 '1 pinch salt',
 '2 cups heavy cream',
 '¼ cup white sugar',
 '1 cup heavy cream, chilled',
 '¼ cup white sugar']

In [27]:
scraper.instructions()

'Preheat oven to 350 degrees F (175 degrees C). Generously grease a 9 inch springform pan with 2 3/4 inch sides.\nIn a medium bowl, mix together crushed cookies and softened butter or margarine. Press mixture evenly into greased pan. Bake in preheated oven for 5 minutes, then allow to cool.\nCombine 1 cup cream, chocolate, vanilla extract, and salt, in the top of a double boiler. Heat until chocolate is fully melted and mixture is smooth. Alternatively, if you have a food processor, you can blend mixture by placing chocolate, vanilla extract, and salt, in processor bowl. Bring 1 cup cream to a boil on stovetop, then slowly pour cream into processor with blade running. Continue to process until mixture is smooth.\nPour chocolate mixture into a bowl and cool to room temperature, stirring occasionally.\nIn a large bowl, beat 2 cups chilled cream with 1/4 cup sugar. Beat until stiff peaks form. Fold whipped cream into chocolate mixture. Pour mixture into cooled crust.\nChill pie at least 6

## Function to get title, ingredients, and instructions

In [28]:
def get_info(url):
    scraper = scrape_me(url)
    
    title = scraper.title()
    ingredients = scraper.ingredients()
    instructions = scraper.instructions()
    
    print(title)
    print(ingredients)
    print(instructions)


## Try first 10 entries

In [35]:
for i in range(10):
    get_info(url_list[i][0])
    print('-'*20)
    time.sleep(0.2) # Take a tiny break

Death by Chocolate Mousse
['21 chocolate sandwich cookies, crushed', '¼ cup butter, softened', '1 cup heavy cream', '1 (12 ounce) package semisweet chocolate chips', '1 teaspoon vanilla extract', '1 pinch salt', '2 cups heavy cream', '¼ cup white sugar', '1 cup heavy cream, chilled', '¼ cup white sugar']
Preheat oven to 350 degrees F (175 degrees C). Generously grease a 9 inch springform pan with 2 3/4 inch sides.
In a medium bowl, mix together crushed cookies and softened butter or margarine. Press mixture evenly into greased pan. Bake in preheated oven for 5 minutes, then allow to cool.
Combine 1 cup cream, chocolate, vanilla extract, and salt, in the top of a double boiler. Heat until chocolate is fully melted and mixture is smooth. Alternatively, if you have a food processor, you can blend mixture by placing chocolate, vanilla extract, and salt, in processor bowl. Bring 1 cup cream to a boil on stovetop, then slowly pour cream into processor with blade running. Continue to process 

### It works. Time to implement it.

In [36]:
# Update function to return info rather than print

def get_info(url):
    scraper = scrape_me(url)
    
    title = scraper.title()
    ingredients = scraper.ingredients()
    instructions = scraper.instructions()
    
#     print(title)
#     print(ingredients)
#     print(instructions)

    return title, ingredients, instructions


In [51]:
csv_file = open('allrecipes_recipes_1.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'ingredients', 'instructions'])

31

### Splitting the work up 10,000 recipes at a time.

In [53]:
for i in range(10000):
    title, ingredients, instructions = get_info(url_list[i][0])
    
    csv_writer.writerow([title, ingredients, instructions])
    
    if i % 500 == 0:
        print(f'{i+1} done.')
    
    time.sleep(0.1) # Take a tiny break

csv_file.close()

1 done.
501 done.
1001 done.
1501 done.
2001 done.
2501 done.
3001 done.
3501 done.
4001 done.
4501 done.
5001 done.
5501 done.
6001 done.
6501 done.
7001 done.
7501 done.
8001 done.
8501 done.
9001 done.
9501 done.


<function TextIOWrapper.close()>

In [54]:
# Backup close statment if something happens in previous cell.
csv_file.close()

### I want to see how long this is taking

In [55]:
from datetime import datetime

In [58]:
print('The current time is:', datetime.now())

The current time is: 2020-04-23 18:20:41.466579


In [61]:
csv_file = open('allrecipes_recipes_2a.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'ingredients', 'instructions'])

for i in range(13957, 20000):
    title, ingredients, instructions = get_info(url_list[i][0])
    
    csv_writer.writerow([title, ingredients, instructions])
    
    if i % 500 == 0:
        print(f'{i} done at:', datetime.now())
    
    time.sleep(0.1) # Take a tiny break

csv_file.close()

14000 done at: 2020-04-23 19:40:06.650208
14500 done at: 2020-04-23 19:47:38.255335
15000 done at: 2020-04-23 19:54:58.884894
15500 done at: 2020-04-23 20:02:15.884788
16000 done at: 2020-04-23 20:09:57.573445
16500 done at: 2020-04-23 20:17:13.112242
17000 done at: 2020-04-23 20:24:27.107197
17500 done at: 2020-04-23 20:31:25.955741
18000 done at: 2020-04-23 20:38:40.326230
18500 done at: 2020-04-23 20:45:46.614984
19000 done at: 2020-04-23 20:53:04.670986
19500 done at: 2020-04-23 21:00:10.102762


In [60]:
# Backup close statment if something happens in previous cell.
csv_file.close()

In [62]:
csv_file = open('allrecipes_recipes_3.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'ingredients', 'instructions'])

for i in range(20000, 30000):
    title, ingredients, instructions = get_info(url_list[i][0])
    
    csv_writer.writerow([title, ingredients, instructions])
    
    if i % 500 == 0:
        print(f'{i} done at:', datetime.now())
    
    time.sleep(0.1) # Take a tiny break

csv_file.close()

20000 done at: 2020-04-23 21:08:31.194771
20500 done at: 2020-04-23 21:16:07.917796
21000 done at: 2020-04-23 21:23:05.750977
21500 done at: 2020-04-23 21:30:05.109051
22000 done at: 2020-04-23 21:36:52.971443
22500 done at: 2020-04-23 21:43:59.525567
23000 done at: 2020-04-23 21:51:05.404327
23500 done at: 2020-04-23 21:58:07.218790
24000 done at: 2020-04-23 22:05:23.516736
24500 done at: 2020-04-23 22:12:24.603355
25000 done at: 2020-04-23 22:19:14.543619
25500 done at: 2020-04-23 22:26:13.590862
26000 done at: 2020-04-23 22:33:10.761805
26500 done at: 2020-04-23 22:40:12.131431
27000 done at: 2020-04-23 22:47:05.233184
27500 done at: 2020-04-23 22:53:52.446171
28000 done at: 2020-04-23 23:00:43.344946
28500 done at: 2020-04-23 23:07:41.919898
29000 done at: 2020-04-23 23:14:38.225377
29500 done at: 2020-04-23 23:21:49.513844


In [63]:
# Backup close statment if something happens in previous cell.
csv_file.close()

In [64]:
csv_file = open('allrecipes_recipes_4.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'ingredients', 'instructions'])

for i in range(30000, 40000):
    title, ingredients, instructions = get_info(url_list[i][0])
    
    csv_writer.writerow([title, ingredients, instructions])
    
    if i % 500 == 0:
        print(f'{i} done at:', datetime.now())
    
    time.sleep(0.1) # Take a tiny break

csv_file.close()

30000 done at: 2020-04-23 23:28:58.425402
30500 done at: 2020-04-23 23:35:38.086704
31000 done at: 2020-04-23 23:42:12.418953
31500 done at: 2020-04-23 23:48:53.879957
32000 done at: 2020-04-23 23:55:42.656717
32500 done at: 2020-04-24 00:02:24.197599
33000 done at: 2020-04-24 00:09:17.368610
33500 done at: 2020-04-24 00:16:10.086490
34000 done at: 2020-04-24 00:23:01.737042
34500 done at: 2020-04-24 00:29:48.314311
35000 done at: 2020-04-24 00:36:43.150086
35500 done at: 2020-04-24 00:43:28.708380
36000 done at: 2020-04-24 00:50:10.683211
36500 done at: 2020-04-24 00:56:54.262243
37000 done at: 2020-04-24 01:03:41.591317
37500 done at: 2020-04-24 01:10:17.108335
38000 done at: 2020-04-24 01:17:03.722392
38500 done at: 2020-04-24 01:23:45.475041
39000 done at: 2020-04-24 01:30:33.813793
39500 done at: 2020-04-24 01:37:21.194631


In [65]:
# Backup close statment if something happens in previous cell.
csv_file.close()

In [68]:
csv_file = open('allrecipes_recipes_5a.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'ingredients', 'instructions'])

for i in range(46357, 50000):
    title, ingredients, instructions = get_info(url_list[i][0])
    
    csv_writer.writerow([title, ingredients, instructions])
    
    if i % 500 == 0:
        print(f'{i} done at:', datetime.now())
    
    time.sleep(0.1) # Take a tiny break

csv_file.close()

46500 done at: 2020-04-24 08:03:05.475811
47000 done at: 2020-04-24 08:10:10.130631
47500 done at: 2020-04-24 08:17:13.394043
48000 done at: 2020-04-24 08:24:23.587256
48500 done at: 2020-04-24 08:31:44.924150
49000 done at: 2020-04-24 08:39:04.689912
49500 done at: 2020-04-24 08:46:27.006172


### Something happened when running the above cell so I reran it and changed the file name and starting location. Combined later.

In [69]:
# Backup close statment if something happens in previous cell.
csv_file.close()

In [70]:
csv_file = open('allrecipes_recipes_6.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'ingredients', 'instructions'])

for i in range(50000, 60000):
    title, ingredients, instructions = get_info(url_list[i][0])
    
    csv_writer.writerow([title, ingredients, instructions])
    
    if i % 500 == 0:
        print(f'{i} done at:', datetime.now())
    
    time.sleep(0.1) # Take a tiny break

csv_file.close()

50000 done at: 2020-04-24 08:53:51.703639
50500 done at: 2020-04-24 09:01:08.169589
51000 done at: 2020-04-24 09:08:19.162594
51500 done at: 2020-04-24 09:15:48.210429
52000 done at: 2020-04-24 09:22:57.332895
52500 done at: 2020-04-24 09:30:06.723783
53000 done at: 2020-04-24 09:37:11.190010
53500 done at: 2020-04-24 09:44:09.398000
54000 done at: 2020-04-24 09:51:31.022097
54500 done at: 2020-04-24 09:58:49.681251
55000 done at: 2020-04-24 10:06:07.512683
55500 done at: 2020-04-24 10:13:27.784050
56000 done at: 2020-04-24 10:20:54.670953
56500 done at: 2020-04-24 10:28:19.026381
57000 done at: 2020-04-24 10:35:52.681449
57500 done at: 2020-04-24 10:43:29.656016
58000 done at: 2020-04-24 10:51:34.803455
58500 done at: 2020-04-24 10:58:59.464871
59000 done at: 2020-04-24 11:06:12.065694
59500 done at: 2020-04-24 11:13:24.201726


In [None]:
# Backup close statment if something happens in previous cell.
csv_file.close()

In [71]:
csv_file = open('allrecipes_recipes_7.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'ingredients', 'instructions'])

for i in range(60000, len(url_list)):
    title, ingredients, instructions = get_info(url_list[i][0])
    
    csv_writer.writerow([title, ingredients, instructions])
    
    if i % 500 == 0:
        print(f'{i} done at:', datetime.now())
    
    #time.sleep(0.1) # Take a tiny break

csv_file.close()

60000 done at: 2020-04-24 11:21:04.920696
60500 done at: 2020-04-24 11:27:27.070674
61000 done at: 2020-04-24 11:33:54.392175
61500 done at: 2020-04-24 11:40:20.000756
62000 done at: 2020-04-24 11:46:56.370316
62500 done at: 2020-04-24 11:53:50.759867
63000 done at: 2020-04-24 12:00:44.448650
63500 done at: 2020-04-24 12:07:27.312946
64000 done at: 2020-04-24 12:13:46.497947
64500 done at: 2020-04-24 12:20:07.632031
65000 done at: 2020-04-24 12:26:40.287530
65500 done at: 2020-04-24 12:33:01.543220
66000 done at: 2020-04-24 12:39:44.929558
66500 done at: 2020-04-24 12:46:20.473143
67000 done at: 2020-04-24 12:53:19.723680
67500 done at: 2020-04-24 12:59:50.107045
68000 done at: 2020-04-24 13:06:08.306254
68500 done at: 2020-04-24 13:12:43.760768
69000 done at: 2020-04-24 13:19:18.094028
69500 done at: 2020-04-24 13:25:55.498210
70000 done at: 2020-04-24 13:32:28.974888
70500 done at: 2020-04-24 13:38:52.626140


In [None]:
# Backup close statment if something happens in previous cell.
csv_file.close()

## Below I was trying to programmatically combine all of the disparate CSVs that had been made. I think I just ended up combining them manually since there weren't a whole lot and I was getting frustrated. (I'm writing this note a few weeks later so I don't really recall.)

In [47]:
# for i in range(8):
#     csv_file = open(f'allrecipes_recipes_{i+1}', 'w', encoding='utf-8')
#     csv_writer = csv.writer(csv_file)
#     csv_writer.writerow(['name', 'ingredients', 'instructions'])
    
#     for j in range(i*10000, (i+1)*10000):
#         scraper = scrape_me(url_list[j][0])
        
#         csv_writer.writerow([scraper.title(), scraper.ingredients(), scraper.instructions()])
        
#         if j % 250 == 0:
#             print(f'Recipe {j} gathered.')
        
#         time.sleep(0.1.)
    
#     csv_file.close()
    
#     print('='*20)
#     print(f'{i*10000} - {(i+1)*10000} done.')

In [72]:
csv_file = open('allrecipes_recipes_combined.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'ingredients', 'instructions'])

csv_file.close()

In [77]:
for i in range(1,2):
    with open(f'allrecipes_recipes_{i}.csv', 'r') as csv_file_read:
        csvreader = csv.reader(csv_file_read)
        
        next(csvreader)
        
        for i in range(sum(1 for row in csvreader)):
            print(i)
        
#         for i, row in enumerate(csvreader):
#             print(i, row)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 3305: character maps to <undefined>

In [78]:
with open('allrecipes_recipes_1.csv', encoding='utf-8') as in_file:
    with open('test_file.csv', 'w', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)

        for row in csv.reader(in_file):
            if row:
                writer.writerow(row)