In [1]:
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

## Importing csv files to dataframes

In [2]:
beers_file = "Resources/beers.csv"
beers_df = pd.read_csv(beers_file).fillna(0).rename(columns={"Unnamed: 0":"index", "name":"beer_name"})
beers_df

Unnamed: 0,index,abv,ibu,id,beer_name,style,brewery_id,ounces
0,0,0.050,0.0,1436,Pub Beer,American Pale Lager,408,12.0
1,1,0.066,0.0,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,2,0.071,0.0,2264,Rise of the Phoenix,American IPA,177,12.0
3,3,0.090,0.0,2263,Sinister,American Double / Imperial IPA,177,12.0
4,4,0.075,0.0,2262,Sex and Candy,American IPA,177,12.0
...,...,...,...,...,...,...,...,...
2405,2405,0.067,45.0,928,Belgorado,Belgian IPA,424,12.0
2406,2406,0.052,0.0,807,Rail Yard Ale,American Amber / Red Ale,424,12.0
2407,2407,0.055,0.0,620,B3K Black Lager,Schwarzbier,424,12.0
2408,2408,0.055,40.0,145,Silverback Pale Ale,American Pale Ale (APA),424,12.0


In [3]:
breweries_file = "Resources/breweries.csv"
breweries_df = pd.read_csv(breweries_file).rename(columns={"Unnamed: 0":"brewery_id", "name":"brewery_name"})
breweries_df

Unnamed: 0,brewery_id,brewery_name,city,state
0,0,NorthGate Brewing,Minneapolis,MN
1,1,Against the Grain Brewery,Louisville,KY
2,2,Jack's Abby Craft Lagers,Framingham,MA
3,3,Mike Hess Brewing Company,San Diego,CA
4,4,Fort Point Beer Company,San Francisco,CA
...,...,...,...,...
553,553,Covington Brewhouse,Covington,LA
554,554,Dave's Brewfarm,Wilson,WI
555,555,Ukiah Brewing Company,Ukiah,CA
556,556,Butternuts Beer and Ale,Garrattsville,NY


## Windows Path

In [None]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

## Mac Path

In [4]:
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

## Scraping Data from Brewers Association Website

In [5]:
url = 'https://www.brewersassociation.org/statistics-and-data/state-craft-beer-stats/'
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [6]:
#                           NOT WORKING 
#State cards with data to scrape
states_cards = soup.find('section', class_='site-content wide')
states = soup.find_all('div', class_='stat-container')


#Set up lists to append into
state_list = []
million_dollars_list = []
breweries_per_capita_list = []
barrels_produced_list = []
gallons_per_adult_list = []


In [7]:
for state in states:
    state_name = state.find('h1').text.strip()
    state_list.append(state_name)
    #million dollars is the first one
    million_dollars = state.find('span', class_='total').find('span', class_='count').text.strip()
    million_dollars_list.append(million_dollars)
    breweries_per_capita = state.find('span', class_="bpc").find('span', class_= 'count').text.strip()
    breweries_per_capita_list.append(breweries_per_capita)
    #barrels produced is the second one
    barrels_produced = state.find(id ='production').find('span', class_='total').find('span', class_='count').text.strip()
    barrels_produced_list.append(barrels_produced)
    gallons_per_adult = state.find(id = 'production').find('span', class_='per-capita').find('span', class_='count').text.strip()
    gallons_per_adult_list.append(gallons_per_adult)

In [8]:
#working
state_list

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'District of Columbia',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

In [9]:
million_dollars_list

['758',
 '326',
 '1,147',
 '838',
 '9,014',
 '3,285',
 '753',
 '388',
 '213',
 '3,625',
 '1,841',
 '291',
 '430',
 '3,251',
 '1,655',
 '1,007',
 '519',
 '795',
 '852',
 '656',
 '889',
 '2,076',
 '2,566',
 '2,118',
 '328',
 '1,268',
 '496',
 '521',
 '520',
 '452',
 '1,612',
 '354',
 '4,126',
 '2,555',
 '282',
 '3,223',
 '646',
 '2,136',
 '6,335',
 '220',
 '796',
 '231',
 '1,309',
 '5,077',
 '441',
 '362',
 '1,709',
 '1,947',
 '286',
 '2,352',
 '193']

In [10]:
#working
breweries_per_capita_list

['1.1',
 '7.8',
 '2.2',
 '1.8',
 '2.9',
 '9.2',
 '3.2',
 '3.7',
 '2.4',
 '1.7',
 '1.1',
 '1.8',
 '5',
 '2.4',
 '3.7',
 '4',
 '2.2',
 '1.8',
 '1.1',
 '11.3',
 '2.1',
 '2.9',
 '4.7',
 '4.3',
 '0.7',
 '2.4',
 '11.4',
 '3.6',
 '1.9',
 '7.6',
 '1.6',
 '5.5',
 '2.6',
 '3.7',
 '2.9',
 '3.3',
 '1.3',
 '8.8',
 '3.6',
 '3.2',
 '2.1',
 '4.4',
 '1.9',
 '1.4',
 '1.7',
 '13.5',
 '3.7',
 '6.9',
 '1.9',
 '4.3',
 '6.8']

In [11]:
#not working
barrels_produced_list

['71,894',
 '210,063',
 '173,427',
 '45,720',
 '3,421,295',
 '1,522,834',
 '213,676',
 '298,706',
 '33,857',
 '1,373,558',
 '449,485',
 '79,309',
 '90,498',
 '400,473',
 '259,005',
 '120,755',
 '47,490',
 '122,415',
 '227,096',
 '357,438',
 '294,801',
 '629,463',
 '899,792',
 '644,077',
 '31,422',
 '367,871',
 '197,167',
 '54,640',
 '71,869',
 '110,509',
 '144,283',
 '135,557',
 '1,270,157',
 '1,254,024',
 '16,378',
 '1,398,358',
 '66,133',
 '1,032,369',
 '3,719,475',
 '35,398',
 '100,242',
 '18,196',
 '184,203',
 '1,144,563',
 '193,055',
 '335,199',
 '405,465',
 '566,949',
 '18,951',
 '1,007,123',
 '58,966']

In [12]:
#grabbing breweries per capita
gallons_per_adult_list

['0.6',
 '12.1',
 '1',
 '0.6',
 '3.6',
 '11',
 '2.4',
 '12.5',
 '1.9',
 '2.6',
 '1.8',
 '2.3',
 '2.2',
 '1.3',
 '1.6',
 '1.6',
 '0.7',
 '1.1',
 '2',
 '10.5',
 '2',
 '3.7',
 '3.7',
 '4.8',
 '0.4',
 '2.5',
 '7.6',
 '1.2',
 '1',
 '3.2',
 '0.7',
 '2.7',
 '2.6',
 '5',
 '0.9',
 '4.9',
 '0.7',
 '9.9',
 '11.7',
 '1.3',
 '0.8',
 '0.9',
 '1.1',
 '1.7',
 '2.8',
 '21.3',
 '2',
 '3.1',
 '0.4',
 '7.1',
 '4.3']

In [13]:
brewers_association_data = pd.DataFrame(
    {'state': state_list,
     'millions_sales': million_dollars_list,
     'breweries_per_capita': breweries_per_capita_list,
     'barrels_produced': barrels_produced_list,
     'gallons_per_adult': gallons_per_adult_list
    })


In [14]:
brewers_association_data

Unnamed: 0,state,millions_sales,breweries_per_capita,barrels_produced,gallons_per_adult
0,Alabama,758,1.1,71894,0.6
1,Alaska,326,7.8,210063,12.1
2,Arizona,1147,2.2,173427,1.0
3,Arkansas,838,1.8,45720,0.6
4,California,9014,2.9,3421295,3.6
5,Colorado,3285,9.2,1522834,11.0
6,Connecticut,753,3.2,213676,2.4
7,Delaware,388,3.7,298706,12.5
8,District of Columbia,213,2.4,33857,1.9
9,Florida,3625,1.7,1373558,2.6


## Transforming Data  (example code for what we will do later)

In [None]:
 # Create a filtered dataframe from specific columns
#premise_cols = ["License Serial Number", "Premises Name", "County ID Code"]
#premise_transformed= premise_df[premise_cols].copy()

# Rename the column headers
#premise_transformed = premise_transformed.rename(columns={"License Serial Number": "id",
                                                          "Premises Name": "premise_name",
                                                          "County ID Code": "county_id"})

# Clean the data by dropping duplicates and setting the index
#premise_transformed.drop_duplicates("id", inplace=True)
#premise_transformed.set_index("id", inplace=True)

#premise_transformed.head()

In [None]:
#county_cols = ["ID", "County Name (Licensee)", "County ID Code", "License Count"]
#county_transformed = county_df[county_cols].copy()

# Rename the column headers
#county_transformed = county_transformed.rename(columns={"ID": "id",
                                                         "County Name (Licensee)": "county_name",
                                                         "License Count": "license_count",
                                                         "County ID Code": "county_id"})

# Set index
#county_transformed.set_index("id", inplace=True)

#county_transformed.head()

## Connecting to postgres

In [15]:
engine = create_engine('sqlite:///Resources/brew.sqlite')
Base.metadata.create_all(engine)
session = Session(engine)

In [29]:
# Confirm tables
engine.execute('DROP TABLE IF EXISTS beers')
engine.execute('DROP TABLE IF EXISTS breweries')
beers_df.to_sql('beers', con=engine, if_exists='append', index=True)
breweries_df.to_sql('breweries', con=engine, if_exists='append', index=True)

In [53]:
combined_df = pd.DataFrame(engine.execute
                            ('SELECT r.state, COUNT(r.brewery_id), COUNT(e.beer_name), COUNT(e.style), COUNT(r.city), '\
                             'AVG(e.abv), AVG(e.ibu) FROM beers e INNER JOIN breweries r ON r.brewery_id=e.brewery_id '\
                             'GROUP BY r.state ORDER BY r.state;'))

In [54]:
# combined_df = pd.DataFrame(engine.execute('SELECT * FROM brews'))
combined_df = combined_df.rename(columns={0:"brewery_id", 1:"city", 2:"state", 3:"beer_name", 4:"abv", 5:"ibu", 6:"id", 7:"style", 8:"ounces"})
combined_df

Unnamed: 0,brewery_id,city,state,beer_name,abv,ibu,id
0,AK,25,25,25,25,0.05564,27.8
1,AL,10,10,10,10,0.062,46.1
2,AR,5,5,5,5,0.052,7.8
3,AZ,47,47,47,47,0.056383,17.978723
4,CA,183,183,183,183,0.060749,34.142077
5,CO,265,265,265,265,0.059785,26.132075
6,CT,27,27,27,27,0.061074,9.074074
7,DC,8,8,8,8,0.065625,27.625
8,DE,2,2,2,2,0.0275,26.0
9,FL,58,58,58,58,0.057879,29.827586


<sqlalchemy.engine.result.ResultProxy at 0x11a7fd2b0>