In [1]:
import pandas as pd
from sqlalchemy import create_engine


In [207]:
# DATASOURCE 1 (kaggle.com): load every review row from winemag.csv.

wine_file = "Resources/winemag.csv"
df_wine = pd.read_csv(wine_file)

# Series.count() only tallies non-null entries, so the figure printed here is
# the number of rows that carry a winery value.
winery_total = df_wine['winery'].count()
print("Total records ", winery_total)
df_wine.head()

Total records  129971


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [233]:
# Restrict the raw frame to the columns the project uses, then give them
# database-friendly names.
wine_cols = ['Unnamed: 0', 'country', 'points', 'price', 'province',
             'region_1', 'taster_name', 'title', 'variety', 'winery']

# 'price' and 'taster_name' are not listed because they keep their names.
wine_col_names = {
    'Unnamed: 0': 'wine_id',
    'country': 'country_name',
    'points': 'rating',
    'province': 'province_name',
    'region_1': 'region_name',
    'title': 'wine_name',
    'variety': 'wine_type',
    'winery': 'winery_name',
}

# Selecting with a list and renaming both return new frames, so df_wine is
# left untouched.
df_wine_base = df_wine[wine_cols].rename(columns=wine_col_names)

df_wine_base.head()

Unnamed: 0,wine_id,country_name,rating,price,province_name,region_name,taster_name,wine_name,wine_type,winery_name
0,0,Italy,87,,Sicily & Sardinia,Etna,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,87,15.0,Douro,,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [234]:
# Keep only fully populated rows: a row is discarded as soon as any one of its
# columns is empty.
df_wine_base = df_wine_base.dropna(axis='index', how='any')
remaining_total = df_wine_base['wine_id'].count()
print("Total records ", remaining_total)
df_wine_base.head()

Total records  77267


Unnamed: 0,wine_id,country_name,rating,price,province_name,region_name,taster_name,wine_name,wine_type,winery_name
2,2,US,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,5,Spain,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,6,Italy,87,16.0,Sicily & Sardinia,Vittoria,Kerin O’Keefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo


In [187]:
# DATASOURCE 2 (wikipedia): country reference data read from countries.csv.

country_file = "Resources/countries.csv"

# The positional index produced by read_csv is promoted to a column and becomes
# country_id. Rows with any missing field are removed afterwards, so the
# frame's index labels are not guaranteed to stay contiguous.
# NOTE(review): pandas' default NA parsing treats the literal text "NA" as a
# missing value - if the file holds a country code "NA", that row is dropped
# here. TODO confirm against the CSV.
df_countries = (
    pd.read_csv(country_file)
    .reset_index()
    .rename(columns={'index': 'country_id',
                     'country': 'country_abbr',
                     'name': 'country_name'})
    .dropna(how='any')
)
print("Total records: ", df_countries['country_id'].count())

df_countries.head()

Total records:  243


Unnamed: 0,country_id,country_abbr,latitude,longitude,country_name
0,0,AD,42.546245,1.601554,Andorra
1,1,AE,23.424076,53.847818,United Arab Emirates
2,2,AF,33.93911,67.709953,Afghanistan
3,3,AG,17.060816,-61.796428,Antigua and Barbuda
4,4,AI,18.220554,-63.068615,Anguilla


In [188]:
# DATASOURCE 3 (Wikipedia): fetch the average-yearly-temperature-by-country
# table straight from the web page.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_average_yearly_temperature'
tables = pd.read_html(url)

# The first table parsed from the page is the one used. Its columns are
# renamed and the country name becomes the index, so that df_countries can
# look temperatures up by name.
df_temps = (
    tables[0]
    .rename(columns={'Country': 'country_name',
                     'Average yearly temperature (1961–1990, degrees Celsius)': 'avg_temp'})
    .set_index('country_name')
)

print("Total records: ", df_temps['avg_temp'].count())
df_temps.head()

Total records:  191


Unnamed: 0_level_0,avg_temp
country_name,Unnamed: 1_level_1
Burkina Faso,28.25
Mali,28.25
Kiribati,28.2
Djibouti,28.0
Tuvalu,28.0


In [189]:
# Add each country's average temperature to df_countries when its name also
# appears in df_temps; countries without a usable value keep the ""
# placeholder in the 'temperature' column.
#
# The values are aligned on country_name rather than written through
# df_countries.iloc[<index label>, 5]. df_countries had rows dropped after its
# index was created, so index labels and row positions can differ; a positional
# write keyed by label can therefore hit the wrong row or fall off the end of
# the frame (an error the former bare `except: pass` silently hid).

# Temperature lookup keyed by country name (the index of df_temps).
temp_by_country = df_temps['avg_temp']
# A name listed more than once would make the lookup ambiguous (Series.map
# requires a unique index), so only its first entry is kept.
temp_by_country = temp_by_country[~temp_by_country.index.duplicated(keep='first')]

# Names that are absent from the lookup come back as NaN.
matched_temp = df_countries['country_name'].astype(str).map(temp_by_country)

# Cast to object first so the "" placeholder can sit next to numeric values
# without forcing a dtype upcast during the replacement.
df_countries['temperature'] = matched_temp.astype(object).where(matched_temp.notna(), "")

df_countries.head()

Unnamed: 0,country_id,country_abbr,latitude,longitude,country_name,temperature
0,0,AD,42.546245,1.601554,Andorra,7.6
1,1,AE,23.424076,53.847818,United Arab Emirates,27.0
2,2,AF,33.93911,67.709953,Afghanistan,12.6
3,3,AG,17.060816,-61.796428,Antigua and Barbuda,26.0
4,4,AI,18.220554,-63.068615,Anguilla,


In [238]:
# Split df_wine_base into separate lookup tables (df_wineries, df_provinces
# and df_regions) to normalize the database. df_countries is not derived here:
# it was built from so-called 'master data' pulled off the internet.

# Wineries

# Grouping yields one row per distinct winery name; reset_index moves the name
# out of the index and back into a regular column.
df_winery_base = df_wine_base.groupby(['winery_name']).count().reset_index()

# Only the name column is kept; its 0..n-1 index is exposed as winery_id.
df_wineries = (
    df_winery_base[['winery_name']]
    .reset_index()
    .rename(columns={'index': 'winery_id'})
)
print("Total records: ", df_wineries['winery_id'].count())
df_wineries.head()

Total records:  11762


Unnamed: 0,winery_id,winery_name
0,0,1+1=3
1,1,100 Percent Wine
2,2,1000 Stories
3,3,10Span
4,4,12 Linajes


In [239]:
# Provinces

# Grouping yields one row per distinct province name; reset_index moves the
# name out of the index and back into a regular column.
df_provinces_base = df_wine_base.groupby(['province_name']).count().reset_index()

# Only the name column is kept; its 0..n-1 index is exposed as province_id.
df_provinces = (
    df_provinces_base[['province_name']]
    .reset_index()
    .rename(columns={'index': 'province_id'})
)
print("Total records: ", df_provinces['province_id'].count())
df_provinces.head()

Total records:  64


Unnamed: 0,province_id,province_name
0,0,Alsace
1,1,Andalucia
2,2,Arizona
3,3,Australia Other
4,4,Beaujolais


In [240]:
# Regions

# Grouping yields one row per distinct region name; reset_index moves the name
# out of the index and back into a regular column.
df_regions_base = df_wine_base.groupby(['region_name']).count().reset_index()

# Only the name column is kept; its 0..n-1 index is exposed as region_id.
df_regions = (
    df_regions_base[['region_name']]
    .reset_index()
    .rename(columns={'index': 'region_id'})
)
print("Total records: ", df_regions['region_id'].count())
df_regions.head()

Total records:  1091


Unnamed: 0,region_id,region_name
0,0,Abruzzo
1,1,Adelaida District
2,2,Adelaide
3,3,Adelaide Hills
4,4,Adelaide Plains


In [241]:
# Wine types

# Grouping yields one row per distinct wine type; reset_index moves the type
# out of the index and back into a regular column.
df_wine_types_base = df_wine_base.groupby(['wine_type']).count().reset_index()

# Only the type column is kept; its 0..n-1 index is exposed as wine_type_id.
df_wine_types = (
    df_wine_types_base[['wine_type']]
    .reset_index()
    .rename(columns={'index': 'wine_type_id'})
)
print("Total records: ", df_wine_types['wine_type_id'].count())
df_wine_types.head()

Total records:  465


Unnamed: 0,wine_type_id,wine_type
0,0,Abouriou
1,1,Aglianico
2,2,Airen
3,3,Albana
4,4,Albanello
