In [1]:
# import dependencies
import os
import io
import bs4
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# from selenium import webdriver
# from lxml import html
import csv
from datetime import datetime
from pytz import timezone
import pandas as pd

In [2]:
# Nutrition page for the UCSC Colleges Nine & Ten dining hall,
# split by query parameter for readability (the value is unchanged).
url = (
    'https://nutrition.sa.ucsc.edu/nutframe.asp'
    '?sName=UC+Santa+Cruz+Dining'
    '&locationNum=40'
    '&locationName=Colleges+Nine+%26+Ten+Dining+Hall'
    '&naFlag=1'
)

In [3]:
# Scrape the menu data from the webpage.
# The menu cannot be scraped from the top-level HTML directly: that page is a
# frameset, so each frame's document has to be requested separately.
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests can wait forever

frame_soup = None
with requests.Session() as session:
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # fail here rather than parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')

    for frame in soup.select("frameset frame"):
        frame_src = frame.get("src")
        if not frame_src:
            # A frame without a src has no document to fetch.
            continue
        frame_url = urljoin(url, frame_src)
        response = session.get(frame_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Each iteration overwrites frame_soup, so the document of the last
        # fetched frame is the one the following cells work with.
        frame_soup = BeautifulSoup(response.content, 'html.parser')
        # print(frame_soup.prettify())

if frame_soup is None:
    # Surface the problem here instead of as a NameError in a later cell.
    raise RuntimeError('No frame documents found at {}'.format(url))

In [4]:
# Collect the <div> tags holding meal names and food names,
# selected by their CSS class.
meal_name = frame_soup.find_all('div', class_='menusampmeals')
food_name = frame_soup.find_all('div', class_='menusamprecipes')

In [5]:
# Turn the meal-name tags into a plain list of their text
meal_type = list(map(lambda tag: tag.string, meal_name))
print(meal_type)
print('Length: ', len(meal_type))

['Breakfast', 'Lunch', 'Dinner']
Length:  3


In [6]:
# Turn the food-name tags into a plain list of their text
food_list = list(tag.string for tag in food_name)
print(food_list)
print('Length: ', len(food_list))

['Buttermilk Pancakes', 'Cage Free Scrambled Eggs', 'Eggs Benedict', 'Hard-boiled Cage Free Eggs', 'Hash Brown Patty', 'Natural BridgesTofu Scramble', 'Oatmeal Gluten-Free', 'Pizza with Potato, Sausage and Cheese', 'Sausage Links', 'Steamed Rice', 'Creamy Broccoli Cheddar Soup', 'Oven Roasted Allergen Free Chicken Thigh', 'Pineapple Chicken Curry', 'Pineapple Tofu Curry', 'Sesame Udon Noodles', 'Roasted Vegetables', 'BAR Hot Dog', 'Cheese Sauce', 'Chili con Carne', 'Condiments', 'French Fries', 'Hawaiian Coleslaw', 'Hot Dog All Beef', 'Hot Dog Vegan', 'California Pasta', "Cuban Banana BBQ'd Chicken", 'Oven Roasted Allergen Free Chicken Thigh', 'Black Bean Corn Salsa', 'Cuban Yellow Rice', 'Roasted Vegetables', 'Atomic Cheese Sauce', 'BAR Boardwalk Burgers', 'Bacon', 'Bread Bun', 'Burger Bar #1 Condiments', 'Burger Beef', 'Burger Turkey', 'Chili con Carne', 'Onion Rings', 'Sauteed Mushrooms', 'Vegan Malibu Burger']
Length:  41


In [7]:
# The scraped list can contain the same food name more than once,
# so reduce it to unique names and put them in sorted order.
food_list = list(set(food_list))
food_list.sort()
print(food_list)
print('Sorted length: ', len(food_list))

['Atomic Cheese Sauce', 'BAR Boardwalk Burgers', 'BAR Hot Dog', 'Bacon', 'Black Bean Corn Salsa', 'Bread Bun', 'Burger Bar #1 Condiments', 'Burger Beef', 'Burger Turkey', 'Buttermilk Pancakes', 'Cage Free Scrambled Eggs', 'California Pasta', 'Cheese Sauce', 'Chili con Carne', 'Condiments', 'Creamy Broccoli Cheddar Soup', "Cuban Banana BBQ'd Chicken", 'Cuban Yellow Rice', 'Eggs Benedict', 'French Fries', 'Hard-boiled Cage Free Eggs', 'Hash Brown Patty', 'Hawaiian Coleslaw', 'Hot Dog All Beef', 'Hot Dog Vegan', 'Natural BridgesTofu Scramble', 'Oatmeal Gluten-Free', 'Onion Rings', 'Oven Roasted Allergen Free Chicken Thigh', 'Pineapple Chicken Curry', 'Pineapple Tofu Curry', 'Pizza with Potato, Sausage and Cheese', 'Roasted Vegetables', 'Sausage Links', 'Sauteed Mushrooms', 'Sesame Udon Noodles', 'Steamed Rice', 'Vegan Malibu Burger']
Sorted length:  38


In [8]:
# preference list, labeled by myself
# pref_1 = [1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0 ]
# pref_2 = [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0]
# pref_3 = [0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1]
# pref_4 = [0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0]
# pref_5 = [1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0]

# Ask for one integer label per food name. A non-integer answer is asked for
# again instead of raising ValueError, which would abort the cell and throw
# away every label typed so far.
pref = []
for i in food_list:
    print(i)
    while True:
        answer = input()
        try:
            preference = int(answer)
        except ValueError:
            print('Invalid input, please enter an integer (e.g. 0 or 1):')
            continue
        break
    pref.append(preference)
print(pref)
print('Preference length: ', len(pref))

Atomic Cheese Sauce
0
BAR Boardwalk Burgers
1
BAR Hot Dog
1
Bacon
1
Black Bean Corn Salsa
0
Bread Bun
0
Burger Bar #1 Condiments
1
Burger Beef
1
Burger Turkey
0
Buttermilk Pancakes
1
Cage Free Scrambled Eggs
1
California Pasta
1
Cheese Sauce
0
Chili con Carne
0
Condiments
0
Creamy Broccoli Cheddar Soup
0
Cuban Banana BBQ'd Chicken
0
Cuban Yellow Rice
0
Eggs Benedict
0
French Fries
1
Hard-boiled Cage Free Eggs
0
Hash Brown Patty
1
Hawaiian Coleslaw
0
Hot Dog All Beef
1
Hot Dog Vegan
0
Natural BridgesTofu Scramble
0
Oatmeal Gluten-Free
0
Onion Rings
1
Oven Roasted Allergen Free Chicken Thigh
0
Pineapple Chicken Curry
0
Pineapple Tofu Curry
0
Pizza with Potato, Sausage and Cheese
1
Roasted Vegetables
0
Sausage Links
1
Sauteed Mushrooms
0
Sesame Udon Noodles
0
Steamed Rice
1
Vegan Malibu Burger
0
[0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0]
Preference length:  38


In [9]:
# Resolve the current calendar date (YYYY-MM-DD) in the US/Pacific timezone
us_pacific = timezone('US/Pacific')
time = datetime.now(tz=us_pacific)
us_time = time.date().isoformat()
print('Current US/Pacific time: ', us_time)

Current US/Pacific time:  2018-09-16


In [10]:
# Build the location of the dated CSV file (nothing is written in this cell).
# The data directory can be overridden with the MENU_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original hard-coded directory is used.
path_2 = os.environ.get('MENU_DATA_DIR', '/Users/7w0r4ng3s/Desktop/menu_scraping/data/')
path = os.path.join(path_2, '{}.csv'.format(us_time))
print('Current path: ', path)

Current path:  /Users/7w0r4ng3s/Desktop/menu_scraping/data/2018-09-16.csv


In [12]:
def write_data(foods=None, prefs=None, out_path=None):
    """Write food names and their labels to a CSV with columns index, food, pref.

    The frame is built in memory and written once. Writing a raw CSV and
    re-reading it with pandas would turn names such as "NA" into missing
    values before the final write.

    Args:
        foods: Iterable of food names. Defaults to the notebook-level
            ``food_list``.
        prefs: Iterable of labels, one per food. Defaults to the
            notebook-level ``pref``.
        out_path: Destination CSV file. Defaults to the notebook-level
            ``path``.

    Raises:
        ValueError: If ``foods`` and ``prefs`` have different lengths
            (pairing them anyway would silently drop the surplus entries).
    """
    foods = list(food_list if foods is None else foods)
    prefs = list(pref if prefs is None else prefs)
    out_path = path if out_path is None else out_path

    if len(foods) != len(prefs):
        raise ValueError(
            'foods and prefs must have the same length, got {} and {}'.format(
                len(foods), len(prefs)))

    # Create the target directory on first use; a bare filename has no dirname.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame({'food': foods, 'pref': prefs})
    df.index.name = 'index'
    df.to_csv(out_path)
    # Both progress messages are kept so the cell output is unchanged.
    print('write_csv: COMPLETED')
    print('add_column_name: COMPLETED')
    
# Run the CSV export defined in this cell.
write_data()

write_csv: COMPLETED
add_column_name: COMPLETED


In [13]:
# def merge_data():
#     # TODO: Figure out a way to get rid of the unnamed: 0 column
#     # TODO: Modify merge_data() so that new data can be appended to data.csv
#     files = [f for f in os.listdir('.') if os.path.isfile(f)]

#     merged = []

#     for f in files:
#         filename, ext = os.path.splitext(f)
#         if ext == '.csv':
#             read = pd.read_csv(f)
#             merged.append(read)

#     result = pd.concat(merged)
#     result.to_csv('data.csv')
    
# merge_data()

In [14]:
def append_data(new_path=None, master_path='Data.csv'):
    """Append the rows of one CSV to the cumulative CSV, sorted by food name.

    Both files are expected to have an ``index`` column plus a ``food``
    column. The combined rows are sorted by ``food``, renumbered from 0 and
    written back to ``master_path``.

    Note: calling this twice with the same ``new_path`` adds its rows twice.

    Args:
        new_path: CSV whose rows are added. Defaults to the notebook-level
            ``path``.
        master_path: Cumulative CSV that is read (if present) and rewritten.
            Defaults to ``'Data.csv'``.
    """
    if new_path is None:
        new_path = path

    frames = []
    # On the very first run there is no cumulative file yet; start from the
    # new rows alone instead of failing with FileNotFoundError.
    if os.path.isfile(master_path):
        frames.append(pd.read_csv(master_path, index_col='index'))
    frames.append(pd.read_csv(new_path, index_col='index'))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # The old per-file indexes are discarded; a stable sort keeps existing
    # rows ahead of new rows that share the same food name.
    combined = (
        pd.concat(frames, ignore_index=True)
        .sort_values('food', kind='mergesort')
        .reset_index(drop=True)
    )
    combined.index.name = 'index'
    combined.to_csv(master_path)
    
# Run the append step defined in this cell.
append_data()