In [8]:
# Dependencies
import pandas as pd
import os
import csv

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float

# Import the declarative base factory from which the mapped classes will be constructed
from sqlalchemy.ext.declarative import declarative_base

# Create the declarative Base class that the ORM table classes in this notebook inherit from
Base = declarative_base()

In [None]:
# Create the SQLAlchemy engine for the SQLite database file db/wine.sqlite and open a
# connection on it. echo=False turns off SQLAlchemy's logging of the SQL statements it runs.
engine = create_engine("sqlite:///db/wine.sqlite", echo=False)
conn = engine.connect()

In [9]:
# Reset step: drop the table left over from a previous run so the data is not loaded twice.
# IF EXISTS keeps this cell from raising on a fresh database (or on a second run of the cell),
# where wine_table does not exist yet.
engine.execute('DROP TABLE IF EXISTS wine_table')

# Clear the metadata object, too, so the table class can be declared again in the same kernel
Base.metadata.clear()

# Inspect the database to confirm the table is gone (wine_table must not be listed)
inspector = inspect(engine)
inspector.get_table_names()

[]


# Exploratory Analysis of the Data

In [10]:
# Read in the CSV file; the column dtypes are inferred by pandas
raw_df = pd.read_csv('db/wine_library.csv')

# Preview the first two rows
raw_df.head(2)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez


In [11]:
# Summary statistics for the numeric columns
# NOTE: points range from 80 (min) to 100 (max), so the ratings have to be put into that range
# when the ML label is built later on
raw_df.describe()

Unnamed: 0.1,Unnamed: 0,points,price
count,150930.0,150930.0,137235.0
mean,75464.5,87.888418,33.131482
std,43569.882402,3.222392,36.322536
min,0.0,80.0,4.0
25%,37732.25,86.0,16.0
50%,75464.5,88.0,24.0
75%,113196.75,90.0,40.0
max,150929.0,100.0,2300.0


In [12]:
# Shape of the raw data as (rows, columns)
raw_df.shape

(150930, 11)

In [13]:
# Count the missing (NaN) values in every column
raw_df.isna().sum()

Unnamed: 0         0
country            5
description        0
designation    45735
points             0
price          13695
province           5
region_1       25060
region_2       89977
variety            0
winery             0
dtype: int64

In [14]:
# Remove the columns that are not needed: 'designation' and 'Unnamed: 0'
# (the latter looks like a saved row index). region_2 is kept.
dropped_df = raw_df.drop(columns=['designation', 'Unnamed: 0'])
dropped_df.head(2)

Unnamed: 0,country,description,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez


In [15]:
# Keep only the rows that have both a price and a country.
# NOTE: a missing region_1 is taken to mean the row was entered incorrectly, but rows are
# not filtered on region_1 here.
# NOTE: NaN values in region_2 are kept as well; dropping them would leave only 9 countries,
# which loses too much data.
new_df = dropped_df.dropna(axis='index', how='any', subset=['price', 'country'])

In [16]:
# Shape of the cleaned dataset as (rows, columns)
new_df.shape

(137230, 9)

In [17]:
# Preview the first two rows of the cleaned dataset
new_df.head(2)

Unnamed: 0,country,description,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez


# Load the Data into sqlite database

In [18]:
# Declarative class that defines the wine_table schema
class Wine(Base):
    """ORM mapping for the ``wine_table`` table.

    Apart from ``id``, the columns carry the same names as the columns of the
    cleaned DataFrame (country, description, points, price, province, region_1,
    region_2, variety, winery).
    """
    __tablename__ = 'wine_table'
    # Integer primary key; this column has no counterpart in the DataFrame
    id = Column(Integer, primary_key=True)
    country = Column(String(255))
    # Review text; declared with a larger length than the other string columns
    description = Column(String(1000))
    # Rating score, declared as Float
    points = Column(Float)
    price = Column(Float)
    province = Column(String(255))
    region_1 = Column(String(255))
    region_2 = Column(String(255))
    variety = Column(String(255))
    winery = Column(String(255))

In [19]:
# Create the tables registered on Base's metadata (here: wine_table) using the open connection
Base.metadata.create_all(conn)

In [20]:
# Transfer the cleaned data to the sqlite table.
# if_exists='append' inserts into the existing wine_table instead of replacing it;
# index=False keeps the DataFrame index from being written as an extra column.
new_df.to_sql(name='wine_table', con=conn, if_exists='append', index=False)

In [21]:
# Inspect the database to make sure wine_table now exists
inspector = inspect(engine)
inspector.get_table_names()

['wine_table']

In [22]:
# Spot-check the loaded data: fetch 20 rows from wine_table
engine.execute('SELECT * FROM wine_table LIMIT 20').fetchall()

[(1, 'US', 'This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel  ... (57 characters truncated) ... ubtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.', 96.0, 235.0, 'California', 'Napa Valley', 'Napa', 'Cabernet Sauvignon', 'Heitz'),
 (2, 'Spain', 'Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla. This is full, layered, intense an ... (20 characters truncated) ... alate, with rich flavors of chocolaty black fruits and baking spices. A toasty, everlasting finish is heady but ideally balanced. Drink through 2023.', 96.0, 110.0, 'Northern Spain', 'Toro', None, 'Tinta de Toro', 'Bodega Carmen Rodríguez'),
 (3, 'US', 'Mac Watson honors the memory of a wine once made by his mother in this tremendously delicious, balanced and complex b