In [1]:
# import dependencies
import os
import io
import bs4
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# from selenium import webdriver
# from lxml import html
import csv
from datetime import datetime, date, timedelta
from pytz import timezone
import pandas as pd

In [2]:
# Menu page of the UCSC Colleges Nine & Ten dining hall
url = (
    'https://nutrition.sa.ucsc.edu/nutframe.asp'
    '?sName=UC+Santa+Cruz+Dining'
    '&locationNum=40'
    '&locationName=Colleges+Nine+%26+Ten+Dining+Hall'
    '&naFlag=1'
)

In [3]:
# Scrape the menu data from the web page.
# The menu cannot be read from the top-level HTML directly: the page is a
# frameset, so every frame document has to be requested separately.
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests can wait forever

frame_soup = None
with requests.Session() as session:
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # fail here instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Only frames that actually carry a src attribute can be fetched
    for frame in soup.select("frameset frame[src]"):
        frame_url = urljoin(url, frame["src"])
        response = session.get(frame_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Each iteration overwrites frame_soup, so the cells below work on
        # the document of the last frame in the frameset.
        frame_soup = BeautifulSoup(response.content, 'html.parser')

if frame_soup is None:
    # Without this check the next cell would fail with an unrelated NameError
    raise RuntimeError('No <frame> with a src attribute found at {}'.format(url))

In [4]:
# Collect the <div> elements holding the meal headings and the food names
meal_name = frame_soup.find_all('div', class_='menusampmeals')
food_name = frame_soup.find_all('div', class_='menusamprecipes')

In [5]:
# Convert the meal-type <div> elements into a list of plain strings.
# get_text() is used instead of .string: .string is None as soon as a tag has
# more than one child, and it returns NavigableString objects that keep a
# reference to the whole parse tree.
meal_type = [item.get_text(' ', strip=True) for item in meal_name]
print(meal_type)
print('Length: ', len(meal_type))

['Breakfast', 'Lunch', 'Dinner']
Length:  3


In [6]:
# Convert the food-name <div> elements into a list of plain strings.
# get_text() is used instead of .string: .string is None as soon as a tag has
# more than one child (a None entry would break the sort in the next cell),
# and it returns NavigableString objects that keep a reference to the whole
# parse tree.
food_list = [item.get_text(' ', strip=True) for item in food_name]
print(food_list)
print('Length: ', len(food_list))

['Cage Free Scrambled Eggs', 'Ham Steaks', 'Hard-boiled Cage Free Eggs', 'Natural BridgesTofu Scramble', 'Oatmeal Gluten-Free', "Potatoes O'Brien", 'Steamed Rice', 'Texas French Toast', 'Tomato Bisque Soup', 'Oven Roasted Allergen Free Chicken Thigh', 'Teriyaki Chicken Shoyu', 'Veggie Fried Rice', 'Cheese Pizza', 'Cilantro Flatbread with Red Onions and Mushrooms', 'Pepperoni Pizza', 'Steamed Rice', 'Steamed Seasonal Vegetables', 'Atomic Cheese Sauce', 'BAR Boardwalk Burgers', 'Bacon', 'Bread Bun', 'Burger Bar #1 Condiments', 'Burger Beef', 'Chili con Carne', 'Onion Rings', 'Sauteed Mushrooms', 'Vegan Malibu Burger', 'Fried Chicken', 'Oven Roasted Allergen Free Chicken Thigh', 'Steamed Seasonal Vegetables', 'Red Skin Mashed Potatoes', 'Bar Pasta', 'Cheese Ravioli', 'Cheesy Garlic Bread Sticks', 'Condiments', 'Creamy Alfredo Sauce', 'Marinara Sauce', 'Penne']
Length:  38


In [7]:
# The same food can be served at several meals: drop the duplicates by going
# through a set, then put the remaining names in sorted order
food_list = sorted(set(food_list))
print(food_list)
print('Sorted length: ', len(food_list))

['Atomic Cheese Sauce', 'BAR Boardwalk Burgers', 'Bacon', 'Bar Pasta', 'Bread Bun', 'Burger Bar #1 Condiments', 'Burger Beef', 'Cage Free Scrambled Eggs', 'Cheese Pizza', 'Cheese Ravioli', 'Cheesy Garlic Bread Sticks', 'Chili con Carne', 'Cilantro Flatbread with Red Onions and Mushrooms', 'Condiments', 'Creamy Alfredo Sauce', 'Fried Chicken', 'Ham Steaks', 'Hard-boiled Cage Free Eggs', 'Marinara Sauce', 'Natural BridgesTofu Scramble', 'Oatmeal Gluten-Free', 'Onion Rings', 'Oven Roasted Allergen Free Chicken Thigh', 'Penne', 'Pepperoni Pizza', "Potatoes O'Brien", 'Red Skin Mashed Potatoes', 'Sauteed Mushrooms', 'Steamed Rice', 'Steamed Seasonal Vegetables', 'Teriyaki Chicken Shoyu', 'Texas French Toast', 'Tomato Bisque Soup', 'Vegan Malibu Burger', 'Veggie Fried Rice']
Sorted length:  35


In [8]:
# Preference labels, assigned by hand.
# pref_4 holds 35 entries, one per name in the sorted, de-duplicated food_list
# (35 names); write_data() pairs the two lists position by position, so the
# order and the length of this list must match food_list.
# presumably 1 = liked and 0 = not liked -- TODO confirm the encoding
# pref_1 .. pref_3 are commented out; presumably labels for earlier menus.
# pref_1 = [1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0 ]
# pref_2 = [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0]
# pref_3 = [0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1]
pref_4 = [0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0]

In [9]:
# Current date in the US/Pacific time zone, formatted as YYYY-MM-DD
us_pacific = timezone('US/Pacific')
time = datetime.now(tz=us_pacific)
us_time = time.date().isoformat()
print('Current US/Pacific time: ', us_time)

Current US/Pacific time:  2018-09-10


In [10]:
# Output locations: the data directory and, inside it, the csv file named
# after the current date
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere
path_2 = '/Users/7w0r4ng3s/Desktop/menu_scraping/data/'
path = path_2 + '{}.csv'.format(us_time)
print('Current path: ', path)

Current path:  /Users/7w0r4ng3s/Desktop/menu_scraping/data/2018-09-10.csv


In [38]:
def write_data(foods=None, prefs=None, out_path=None):
    """Write food names and their preference labels to a csv file.

    The file gets a header row and three columns: ``index`` (row number),
    ``food`` and ``pref``.

    Parameters
    ----------
    foods : sequence of str, optional
        Food names. Defaults to the notebook-level ``food_list``.
    prefs : sequence, optional
        One label per food, in the same order. Defaults to the
        notebook-level ``pref_4``.
    out_path : str, optional
        Destination file. Defaults to the notebook-level ``path``.

    Raises
    ------
    ValueError
        If ``foods`` and ``prefs`` differ in length. Pairing them with
        ``zip`` would silently drop the unmatched tail.
    """
    foods = list(food_list if foods is None else foods)
    prefs = list(pref_4 if prefs is None else prefs)
    out_path = path if out_path is None else out_path

    if len(foods) != len(prefs):
        raise ValueError(
            'foods and prefs must have the same length, got {} and {}'.format(
                len(foods), len(prefs)))

    # Build the table in memory and write it once. Writing a raw csv and
    # re-reading it with read_csv would turn names such as "NA" into NaN.
    df = pd.DataFrame({'food': foods, 'pref': prefs}, columns=['food', 'pref'])
    df.index.name = 'index'
    df.to_csv(out_path)
    print('write_csv: COMPLETED')
    print('add_column_name: COMPLETED')

In [39]:
# Write the current food list and its preference labels to the dated csv file
write_data()

write_csv: COMPLETED
add_column_name: COMPLETED


In [68]:
def merge_data(data_dir='.', output_name='data.csv'):
    """Merge every csv file found in ``data_dir`` into one csv file.

    The merged file is written to ``data_dir`` under ``output_name``. A file
    with that name (compared case-insensitively) is never used as an input,
    so running the function again does not merge its own previous output
    back in and duplicate rows. Input files are read in sorted name order.
    The row index is not written, so the output has no unnamed index column.

    Parameters
    ----------
    data_dir : str, optional
        Directory to scan. Defaults to the current working directory.
    output_name : str, optional
        File name of the merged csv. Defaults to ``'data.csv'``.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no csv file to merge.
    """
    # TODO: Modify merge_data() so that new data can be appended to data.csv
    output_key = output_name.lower()
    csv_names = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, name))
        and os.path.splitext(name)[1] == '.csv'
        and name.lower() != output_key
    )
    if not csv_names:
        raise ValueError('No csv files to merge in {!r}'.format(data_dir))

    frames = [pd.read_csv(os.path.join(data_dir, name)) for name in csv_names]
    result = pd.concat(frames, ignore_index=True)
    # index=False keeps the freshly generated row numbers out of the file;
    # the per-file 'index' column read from the inputs is preserved.
    result.to_csv(os.path.join(data_dir, output_name), index=False)

In [69]:
# Combine the csv files of the current working directory into data.csv
merge_data()

In [76]:
def append_data(master_path='data.csv', new_path=None):
    """Combine the merged data file with one more csv file.

    Both files are read with their ``index`` column as the row index. The
    rows are stacked, sorted by the ``food`` column and renumbered from 0.
    Nothing is written to disk; the combined table is returned so the caller
    can inspect or save it.

    Parameters
    ----------
    master_path : str, optional
        The merged csv file. Defaults to ``'data.csv'``, the name
        ``merge_data`` writes.
    new_path : str, optional
        The csv file whose rows are added. Defaults to the notebook-level
        ``path``.

    Returns
    -------
    pandas.DataFrame
        The combined rows, sorted by ``food``, with a fresh 0..n-1 index.
    """
    new_path = path if new_path is None else new_path

    merged = pd.read_csv(master_path, index_col='index')
    latest = pd.read_csv(new_path, index_col='index')

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    combined = pd.concat([merged, latest], ignore_index=True)
    # A stable sort keeps rows for the same food in their original order.
    return combined.sort_values('food', kind='mergesort').reset_index(drop=True)

In [77]:
# Combine the previously merged data with the current dated csv file
append_data()