In [1]:
%matplotlib inline
from matplotlib import style
style.use('fivethirtyeight')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime

In [2]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect

In [3]:
# create an engine
engine = create_engine("sqlite:///db/wine.sqlite")

In [4]:
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [5]:
# View all of the classes/tables that automap found
Base.classes.keys()

['map_wine_table', 'wine_table']

In [7]:
# Save references to the table
Wine = Base.classes.wine_table

In [8]:
# Create our session (link) from Python to the DB
session = Session(engine)

In [10]:
# Create the inspector and connect it to the engine
inspector = inspect(engine)

# inspecting the columns
columns = inspector.get_columns('wine_table')   
for column in columns:
    print(column["name"], column["type"])

id INTEGER
country VARCHAR(255)
description VARCHAR(1000)
points FLOAT
price FLOAT
province VARCHAR(255)
region_1 VARCHAR(255)
region_2 VARCHAR(255)
variety VARCHAR(255)
winery VARCHAR(255)


In [12]:
# inspecting the rows Measurement Table 
# engine.execute('select * from wine_table LIMIT 5').fetchall()  

In [13]:
# Design a query to show how many wineries are available in this dataset?
session.query(func.count(Wine.winery)).all()

[(137230)]

# Wineries that got the most reviews - will need that for ML part. 

In [17]:
# TODO: save this query for ML Part 
# TODO: visualize number of reviews ONLY per country on the scatter plot 
# Then point out the wineries that got the most reviews 

# What are the most active winaries? Listing the stations and the 
# counts in descending order.
session.query(Wine.winery, Wine.country, func.count(Wine.winery)).\
    group_by(Wine.winery).\
    order_by(func.count(Wine.winery).desc()).all()

[('Williams Selyem', 'US', 371),
 ('Testarossa', 'US', 274),
 ('DFJ Vinhos', 'Portugal', 249),
 ('Chateau Ste. Michelle', 'US', 225),
 ('Columbia Crest', 'US', 216),
 ('Kendall-Jackson', 'US', 216),
 ('Concha y Toro', 'Chile', 214),
 ('Trapiche', 'Argentina', 201),
 ('Bouchard Père & Fils', 'France', 192),
 ('De Loach', 'US', 189),
 ('Joseph Drouhin', 'France', 185),
 ('Kenwood', 'US', 183),
 ('Cameron Hughes', 'US', 172),
 ("D'Arenberg", 'Australia', 153),
 ('Dry Creek Vineyard', 'US', 153),
 ('Louis Latour', 'France', 153),
 ('Morgan', 'US', 153),
 ('Concannon', 'US', 151),
 ('Robert Mondavi', 'US', 151),
 ('Martin Ray', 'US', 149),
 ('Errazuriz', 'Chile', 148),
 ('Wines & Winemakers', 'Portugal', 148),
 ("L'Ecole No. 41", 'US', 144),
 ('Iron Horse', 'US', 142),
 ('Montes', 'Chile', 142),
 ('Renwood', 'US', 141),
 ('Santa Rita', 'Chile', 141),
 ('Waterbrook', 'US', 141),
 ('Calera', 'US', 140),
 ('Hogue', 'US', 139),
 ('Yalumba', 'Australia', 139),
 ('Gary Farrell', 'US', 138),
 ('Sa

In [18]:
# Reviews by country 
session.query(Wine.country, func.count(Wine.description)).\
    group_by(Wine.country).\
    order_by(func.count(Wine.country).desc()).all()

[('US', 62139),
 ('Italy', 18784),
 ('France', 14785),
 ('Spain', 8160),
 ('Chile', 5766),
 ('Argentina', 5587),
 ('Australia', 4894),
 ('Portugal', 4176),
 ('New Zealand', 3070),
 ('Austria', 2483),
 ('Germany', 2347),
 ('South Africa', 2237),
 ('Greece', 872),
 ('Israel', 610),
 ('Hungary', 230),
 ('Canada', 194),
 ('Romania', 139),
 ('Uruguay', 85),
 ('Croatia', 83),
 ('Slovenia', 81),
 ('Bulgaria', 77),
 ('Moldova', 71),
 ('Mexico', 63),
 ('Turkey', 50),
 ('Georgia', 43),
 ('Lebanon', 37),
 ('Cyprus', 31),
 ('Brazil', 25),
 ('Macedonia', 16),
 ('Serbia', 14),
 ('Morocco', 12),
 ('Luxembourg', 9),
 ('England', 8),
 ('India', 8),
 ('Lithuania', 8),
 ('Czech Republic', 6),
 ('Ukraine', 5),
 ('Bosnia and Herzegovina', 4),
 ('South Korea', 4),
 ('Switzerland', 4),
 ('China', 3),
 ('Slovakia', 3),
 ('Albania', 2),
 ('Japan', 2),
 ('Montenegro', 2),
 ('US-France', 1)]

In [27]:
# Query the wineraies with avg point and avg price, along with country 
# and province 
for_df = session.query(Wine.winery, Wine.country, Wine.province, func.avg(Wine.points), func.avg(Wine.price)).\
        group_by(Wine.winery).\
        order_by(func.avg(Wine.points).desc()).all()
print(for_df)

[('Sloan', 'US', 'California', 100.0, 245.0), ('Mascarello Giuseppe e Figlio', 'Italy', 'Piedmont', 99.0, 175.0), ('Clos de Tart', 'France', 'Burgundy', 98.0, 319.0), ('Au Sommet', 'US', 'California', 97.0, 250.0), ('Cardinale', 'US', 'California', 97.0, 215.0), ('Domaine Bruno Clair', 'France', 'Burgundy', 97.0, 281.0), ('Gandona', 'US', 'California', 97.0, 190.0), ('Ovid', 'US', 'California', 97.0, 195.0), ('Vieux Château Certan', 'France', 'Bordeaux', 97.0, 130.0), ('Château Climens', 'France', 'Bordeaux', 96.8, 174.0), ('Harlan Estate', 'US', 'California', 96.625, 298.75), ('Araujo', 'US', 'California', 96.0, 185.0), ('Bryant Family', 'US', 'California', 96.0, 335.0), ('Château Bélair-Monange', 'France', 'Bordeaux', 96.0, 160.0), ('Château Haut-Brion', 'France', 'Bordeaux', 96.0, 569.0909090909091), ('Domaine Jean Grivot', 'France', 'Burgundy', 96.0, 281.0), ('Screaming Eagle', 'US', 'California', 96.0, 500.0), ('Semper', 'US', 'California', 96.0, 85.0), ("Tenuta dell'Ornellaia", '