In [1]:
import pandas as pd
import numpy as np
import json 
from sklearn.preprocessing import MinMaxScaler

## Read In Main CSV from Kaggle

In [2]:
file = "winemag-data_first150k.csv"

In [3]:
wine_df = pd.read_csv(file)

In [4]:
wine_df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [5]:
wine_df.columns

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'variety', 'winery'],
      dtype='object')

## Clean Main CSV Data

In [6]:
reduced_wine_df = wine_df.loc[:, ["country", "points", "price", "province", "region_1", "variety", "winery"]]
reduced_wine_df

Unnamed: 0,country,points,price,province,region_1,variety,winery
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz
1,Spain,96,110.0,Northern Spain,Toro,Tinta de Toro,Bodega Carmen Rodríguez
2,US,96,90.0,California,Knights Valley,Sauvignon Blanc,Macauley
3,US,96,65.0,Oregon,Willamette Valley,Pinot Noir,Ponzi
4,France,95,66.0,Provence,Bandol,Provence red blend,Domaine de la Bégude
...,...,...,...,...,...,...,...
150925,Italy,91,20.0,Southern Italy,Fiano di Avellino,White Blend,Feudi di San Gregorio
150926,France,91,27.0,Champagne,Champagne,Champagne Blend,H.Germain
150927,Italy,91,20.0,Southern Italy,Fiano di Avellino,White Blend,Terredora
150928,France,90,52.0,Champagne,Champagne,Champagne Blend,Gosset


In [7]:
reduced_wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   country   150925 non-null  object 
 1   points    150930 non-null  int64  
 2   price     137235 non-null  float64
 3   province  150925 non-null  object 
 4   region_1  125870 non-null  object 
 5   variety   150930 non-null  object 
 6   winery    150930 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 8.1+ MB


In [8]:
reduced_wine_df.count()

country     150925
points      150930
price       137235
province    150925
region_1    125870
variety     150930
winery      150930
dtype: int64

In [9]:
reduced_wine_df2 = reduced_wine_df.dropna(how="any")
reduced_wine_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114393 entries, 0 to 150929
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   country   114393 non-null  object 
 1   points    114393 non-null  int64  
 2   price     114393 non-null  float64
 3   province  114393 non-null  object 
 4   region_1  114393 non-null  object 
 5   variety   114393 non-null  object 
 6   winery    114393 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 7.0+ MB


In [10]:
reduced_wine_df2.count()

country     114393
points      114393
price       114393
province    114393
region_1    114393
variety     114393
winery      114393
dtype: int64

In [11]:
reduced_wine_df2

Unnamed: 0,country,points,price,province,region_1,variety,winery
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz
1,Spain,96,110.0,Northern Spain,Toro,Tinta de Toro,Bodega Carmen Rodríguez
2,US,96,90.0,California,Knights Valley,Sauvignon Blanc,Macauley
3,US,96,65.0,Oregon,Willamette Valley,Pinot Noir,Ponzi
4,France,95,66.0,Provence,Bandol,Provence red blend,Domaine de la Bégude
...,...,...,...,...,...,...,...
150925,Italy,91,20.0,Southern Italy,Fiano di Avellino,White Blend,Feudi di San Gregorio
150926,France,91,27.0,Champagne,Champagne,Champagne Blend,H.Germain
150927,Italy,91,20.0,Southern Italy,Fiano di Avellino,White Blend,Terredora
150928,France,90,52.0,Champagne,Champagne,Champagne Blend,Gosset


## Read in CSV Red Wine CSV (From Previous Group's Project)

In [12]:
file2 = "Wine_Reds.csv"

In [13]:
red_wine_df = pd.read_csv(file2)
red_wine_df

Unnamed: 0,Variety,Counts,Red?
0,Chardonnay,13775,False
1,Pinot Noir,13625,True
2,Cabernet Sauvignon,12671,True
3,Red Blend,9377,True
4,Sauvignon Blanc,6054,False
...,...,...,...
614,Bombino Bianco,1,
615,Magliocco,1,
616,Merlot-Petite Verdot,1,
617,Moscofilero,1,


## Clean Red Wine CSV Data

In [14]:
renamed_red_wine = red_wine_df.rename(columns = {'Variety': 'variety', 'Red?': 'red' })
renamed_red_wine

Unnamed: 0,variety,Counts,red
0,Chardonnay,13775,False
1,Pinot Noir,13625,True
2,Cabernet Sauvignon,12671,True
3,Red Blend,9377,True
4,Sauvignon Blanc,6054,False
...,...,...,...
614,Bombino Bianco,1,
615,Magliocco,1,
616,Merlot-Petite Verdot,1,
617,Moscofilero,1,


## Merge Main DataSet w/Red Wine Dataset for types of wine

In [15]:
merge_df = pd.merge(reduced_wine_df2, renamed_red_wine, on="variety")
merge_df

Unnamed: 0,country,points,price,province,region_1,variety,winery,Counts,red
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz,12671,True
1,US,95,325.0,California,Diamond Mountain District,Cabernet Sauvignon,Hall,12671,True
2,US,90,60.0,California,Mount Veeder,Cabernet Sauvignon,Brandlin,12671,True
3,US,90,40.0,Washington,Red Mountain,Cabernet Sauvignon,Canvasback,12671,True
4,US,90,69.0,Washington,Red Mountain,Cabernet Sauvignon,DeLille,12671,True
...,...,...,...,...,...,...,...,...,...
114388,France,88,19.0,Alsace,Alsace,Tokay Pinot Gris,Rieflé,4,
114389,France,87,41.0,Alsace,Alsace,Tokay Pinot Gris,Rieflé,4,
114390,Italy,84,9.0,Tuscany,Toscana,Chardonnay-Pinot Grigio,Banfi,1,
114391,Spain,84,12.0,Northern Spain,Somontano,Moristel,Alquézar,1,


## Clean Merged Data

In [16]:
merge_df2 = merge_df.loc[:, ["country", "points", "price", "province", "region_1", "variety", "winery", "red"]]
merge_df2

Unnamed: 0,country,points,price,province,region_1,variety,winery,red
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz,True
1,US,95,325.0,California,Diamond Mountain District,Cabernet Sauvignon,Hall,True
2,US,90,60.0,California,Mount Veeder,Cabernet Sauvignon,Brandlin,True
3,US,90,40.0,Washington,Red Mountain,Cabernet Sauvignon,Canvasback,True
4,US,90,69.0,Washington,Red Mountain,Cabernet Sauvignon,DeLille,True
...,...,...,...,...,...,...,...,...
114388,France,88,19.0,Alsace,Alsace,Tokay Pinot Gris,Rieflé,
114389,France,87,41.0,Alsace,Alsace,Tokay Pinot Gris,Rieflé,
114390,Italy,84,9.0,Tuscany,Toscana,Chardonnay-Pinot Grigio,Banfi,
114391,Spain,84,12.0,Northern Spain,Somontano,Moristel,Alquézar,


In [17]:
reduced_wine_df3 = merge_df2.dropna(how="any")
reduced_wine_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113882 entries, 0 to 114339
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   country   113882 non-null  object 
 1   points    113882 non-null  int64  
 2   price     113882 non-null  float64
 3   province  113882 non-null  object 
 4   region_1  113882 non-null  object 
 5   variety   113882 non-null  object 
 6   winery    113882 non-null  object 
 7   red       113882 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 7.8+ MB


In [18]:
reduced_wine_df3

Unnamed: 0,country,points,price,province,region_1,variety,winery,red
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz,True
1,US,95,325.0,California,Diamond Mountain District,Cabernet Sauvignon,Hall,True
2,US,90,60.0,California,Mount Veeder,Cabernet Sauvignon,Brandlin,True
3,US,90,40.0,Washington,Red Mountain,Cabernet Sauvignon,Canvasback,True
4,US,90,69.0,Washington,Red Mountain,Cabernet Sauvignon,DeLille,True
...,...,...,...,...,...,...,...,...
114310,Italy,87,24.0,Northeastern Italy,Alto Adige,Silvaner,Abbazia di Novacella,False
114311,Italy,87,24.0,Northeastern Italy,Alto Adige,Silvaner,Abbazia di Novacella,False
114312,Italy,87,24.0,Northeastern Italy,Alto Adige,Silvaner,Abbazia di Novacella,False
114313,France,88,14.0,Alsace,Alsace,Silvaner,Domaines Schlumberger,False


In [19]:
reduced_wine_df3 = reduced_wine_df3.replace(
    {1:"red wine", 0:"white wine"})
reduced_wine_df3

Unnamed: 0,country,points,price,province,region_1,variety,winery,red
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz,red wine
1,US,95,325.0,California,Diamond Mountain District,Cabernet Sauvignon,Hall,red wine
2,US,90,60.0,California,Mount Veeder,Cabernet Sauvignon,Brandlin,red wine
3,US,90,40.0,Washington,Red Mountain,Cabernet Sauvignon,Canvasback,red wine
4,US,90,69.0,Washington,Red Mountain,Cabernet Sauvignon,DeLille,red wine
...,...,...,...,...,...,...,...,...
114310,Italy,87,24.0,Northeastern Italy,Alto Adige,Silvaner,Abbazia di Novacella,white wine
114311,Italy,87,24.0,Northeastern Italy,Alto Adige,Silvaner,Abbazia di Novacella,white wine
114312,Italy,87,24.0,Northeastern Italy,Alto Adige,Silvaner,Abbazia di Novacella,white wine
114313,France,88,14.0,Alsace,Alsace,Silvaner,Domaines Schlumberger,white wine


In [20]:
renamed_merge_df1 = reduced_wine_df3.rename(columns = {'red': 'type'})
renamed_merge_df1

Unnamed: 0,country,points,price,province,region_1,variety,winery,type
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz,red wine
1,US,95,325.0,California,Diamond Mountain District,Cabernet Sauvignon,Hall,red wine
2,US,90,60.0,California,Mount Veeder,Cabernet Sauvignon,Brandlin,red wine
3,US,90,40.0,Washington,Red Mountain,Cabernet Sauvignon,Canvasback,red wine
4,US,90,69.0,Washington,Red Mountain,Cabernet Sauvignon,DeLille,red wine
...,...,...,...,...,...,...,...,...
114310,Italy,87,24.0,Northeastern Italy,Alto Adige,Silvaner,Abbazia di Novacella,white wine
114311,Italy,87,24.0,Northeastern Italy,Alto Adige,Silvaner,Abbazia di Novacella,white wine
114312,Italy,87,24.0,Northeastern Italy,Alto Adige,Silvaner,Abbazia di Novacella,white wine
114313,France,88,14.0,Alsace,Alsace,Silvaner,Domaines Schlumberger,white wine


In [21]:
renamed_merge_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113882 entries, 0 to 114339
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   country   113882 non-null  object 
 1   points    113882 non-null  int64  
 2   price     113882 non-null  float64
 3   province  113882 non-null  object 
 4   region_1  113882 non-null  object 
 5   variety   113882 non-null  object 
 6   winery    113882 non-null  object 
 7   type      113882 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 7.8+ MB


## Read in Province CSV for Red Wine (From Previous Group's Project) to get Lat/Longs for Province

In [22]:
file3 = "Red_Wine_Provinces.csv"

In [23]:
province_redwine = pd.read_csv(file3)
province_redwine

Unnamed: 0.1,Unnamed: 0,province,red_wine_counts,latitude,longtitude
0,0,California,32247,36.778261,-119.417932
1,1,Washington,7303,47.751074,-120.740139
2,2,Tuscany,5542,43.771051,11.248621
3,3,Mendoza Province,3982,-32.889625,-68.852687
4,4,Northern Spain,3877,42.816977,-1.641765
...,...,...,...,...,...
338,338,Central Otago-Marlborough,1,-44.956632,169.398845
339,339,Arcadia,1,34.139729,-118.035345
340,340,Cyprus,1,35.126413,33.429859
341,341,Dolenjska,1,45.755858,15.059233


## Merge with Main Data Set with Red Province  

In [24]:
merge_df3 = pd.merge(renamed_merge_df1, province_redwine, on="province")
merge_df3

Unnamed: 0.1,country,points,price,province,region_1,variety,winery,type,Unnamed: 0,red_wine_counts,latitude,longtitude
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz,red wine,0,32247,36.778261,-119.417932
1,US,95,325.0,California,Diamond Mountain District,Cabernet Sauvignon,Hall,red wine,0,32247,36.778261,-119.417932
2,US,90,60.0,California,Mount Veeder,Cabernet Sauvignon,Brandlin,red wine,0,32247,36.778261,-119.417932
3,US,91,85.0,California,Napa Valley,Cabernet Sauvignon,Michael Mondavi Family Estate,red wine,0,32247,36.778261,-119.417932
4,US,91,60.0,California,Rutherford,Cabernet Sauvignon,Provenance Vineyards,red wine,0,32247,36.778261,-119.417932
...,...,...,...,...,...,...,...,...,...,...,...,...
113866,Spain,85,15.0,Spanish Islands,Vi de la Terra Illes Balears,White Blend,Ànima Negra,white wine,112,24,39.358776,2.735633
113867,Spain,85,15.0,Spanish Islands,Vi de la Terra Illes Balears,White Blend,Ànima Negra,white wine,112,24,39.358776,2.735633
113868,Spain,82,20.0,Spanish Islands,Lanzarote,Rosado,El Grifo,red wine,112,24,39.358776,2.735633
113869,Spain,85,20.0,Spanish Islands,Lanzarote,Malvasia,El Grifo,white wine,112,24,39.358776,2.735633


## Clean Main Dataframe

In [25]:
Final_Wine_Data = merge_df3.loc[:, ["country", "points", "price", "province", "region_1", "variety", "winery", "type", "latitude", "longtitude"]]
Final_Wine_Data

Unnamed: 0,country,points,price,province,region_1,variety,winery,type,latitude,longtitude
0,US,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz,red wine,36.778261,-119.417932
1,US,95,325.0,California,Diamond Mountain District,Cabernet Sauvignon,Hall,red wine,36.778261,-119.417932
2,US,90,60.0,California,Mount Veeder,Cabernet Sauvignon,Brandlin,red wine,36.778261,-119.417932
3,US,91,85.0,California,Napa Valley,Cabernet Sauvignon,Michael Mondavi Family Estate,red wine,36.778261,-119.417932
4,US,91,60.0,California,Rutherford,Cabernet Sauvignon,Provenance Vineyards,red wine,36.778261,-119.417932
...,...,...,...,...,...,...,...,...,...,...
113866,Spain,85,15.0,Spanish Islands,Vi de la Terra Illes Balears,White Blend,Ànima Negra,white wine,39.358776,2.735633
113867,Spain,85,15.0,Spanish Islands,Vi de la Terra Illes Balears,White Blend,Ànima Negra,white wine,39.358776,2.735633
113868,Spain,82,20.0,Spanish Islands,Lanzarote,Rosado,El Grifo,red wine,39.358776,2.735633
113869,Spain,85,20.0,Spanish Islands,Lanzarote,Malvasia,El Grifo,white wine,39.358776,2.735633


In [26]:
Final_Wine_Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113871 entries, 0 to 113870
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   country     113871 non-null  object 
 1   points      113871 non-null  int64  
 2   price       113871 non-null  float64
 3   province    113871 non-null  object 
 4   region_1    113871 non-null  object 
 5   variety     113871 non-null  object 
 6   winery      113871 non-null  object 
 7   type        113871 non-null  object 
 8   latitude    113871 non-null  float64
 9   longtitude  113871 non-null  float64
dtypes: float64(3), int64(1), object(6)
memory usage: 9.6+ MB


## Export Final Wine Data to CSV/Json

In [None]:
# Final_Wine_Data.to_csv('Final_Wine_Data.csv', index=True)
# Final_Wine_Data.to_json('Final_Wine_Data.csv', index=True)

## Subset Data for All 3 Plotty Charts & Maps 

In [27]:
Top_100_Red = Final_Wine_Data.loc[Final_Wine_Data.type == 'red wine'].sort_values(by=['points'], ascending=False).head(100)
Top_100_White = Final_Wine_Data.loc[Final_Wine_Data.type == 'white wine'].sort_values(by=['points'], ascending=False).head(100)

In [28]:
Top_100_Red

Unnamed: 0,country,points,price,province,region_1,variety,winery,type,latitude,longtitude
73169,US,100,65.0,Oregon,Walla Walla Valley (OR),Syrah,Cayuse,red wine,43.804133,-120.554201
63293,Italy,100,460.0,Tuscany,Toscana,Merlot,Tenuta dell'Ornellaia,red wine,43.771051,11.248621
4206,US,100,215.0,California,Stags Leap District,Cabernet Sauvignon,Shafer,red wine,36.778261,-119.417932
73199,US,100,65.0,Oregon,Walla Walla Valley (OR),Syrah,Cayuse,red wine,43.804133,-120.554201
16010,US,100,100.0,California,Russian River Valley,Pinot Noir,Williams Selyem,red wine,36.778261,-119.417932
...,...,...,...,...,...,...,...,...,...,...
62352,Italy,98,175.0,Tuscany,Bolgheri Superiore,Red Blend,Tenuta dell'Ornellaia,red wine,43.771051,11.248621
68701,Australia,98,850.0,South Australia,South Australia,Shiraz,Penfolds,red wine,-30.000232,136.209155
62696,Italy,98,175.0,Tuscany,Bolgheri Superiore,Red Blend,Tenuta dell'Ornellaia,red wine,43.771051,11.248621
42567,US,98,110.0,California,Rutherford,Cabernet Blend,Rubicon Estate,red wine,36.778261,-119.417932


In [29]:
Top_Wine = pd.concat([Top_100_Red, Top_100_White], axis= 0, ignore_index=True)
Top_Wine.value_counts('province')

province
California            45
Tuscany               40
Champagne             29
Burgundy              23
Victoria              15
Bordeaux              13
Oregon                10
Washington             7
Rhône Valley           4
Piedmont               4
Alsace                 4
South Australia        3
Northeastern Italy     2
Northern Spain         1
dtype: int64

## Export Subset to Json

In [None]:
# Top_Wine.to_json('Top_Wine.json', index=True)
# Top_100_Red.to_json('Top_100_Red.json', index=True)
# Top_100_White.to_json('Top_100_White.json', index=True)

## Subset Data for Scattered Bubble Chart 

In [30]:
Final_Wine_Data.province.value_counts()

California              44294
Washington               9664
Tuscany                  5950
Northern Spain           4774
Mendoza Province         4690
Oregon                   4560
Burgundy                 3340
Veneto                   3088
South Australia          2948
Piedmont                 2882
Bordeaux                 2656
New York                 2411
Sicily & Sardinia        2006
Northeastern Italy       1833
Loire Valley             1382
Alsace                   1353
Catalonia                1314
Southwest France         1286
Central Italy            1236
Rhône Valley             1221
Southern Italy           1119
Champagne                1089
Languedoc-Roussillon     1012
Other                     878
Provence                  817
Central Spain             726
Victoria                  602
Australia Other           550
Galicia                   539
Virginia                  506
Western Australia         486
Levante                   443
Lombardy                  439
Beaujolais

## Clean Data

In [31]:
wine_bubble = Final_Wine_Data.loc[:, ["points", "province", "type"]]
wine_bubble

Unnamed: 0,points,province,type
0,96,California,red wine
1,95,California,red wine
2,90,California,red wine
3,91,California,red wine
4,91,California,red wine
...,...,...,...
113866,85,Spanish Islands,white wine
113867,85,Spanish Islands,white wine
113868,82,Spanish Islands,red wine
113869,85,Spanish Islands,white wine


In [32]:
red_wine_bubble = wine_bubble.loc[wine_bubble.type == 'red wine'].sort_values(by=['province'], ascending=False)
red_wine_bubble2 = red_wine_bubble.groupby('province').mean('points').round(1)

merge_df4 = pd.merge(red_wine_bubble, red_wine_bubble2, on="province")
merge_df5 = merge_df4.rename(columns = {'points_y': 'avg_points'})
Final_red_bubble = merge_df5.loc[:, ["province", "type", "avg_points"]]
Final_red_bubble

Unnamed: 0,province,type,avg_points
0,Western Australia,red wine,88.1
1,Western Australia,red wine,88.1
2,Western Australia,red wine,88.1
3,Western Australia,red wine,88.1
4,Western Australia,red wine,88.1
...,...,...,...
81287,Alsace,red wine,88.4
81288,Alsace,red wine,88.4
81289,Alsace,red wine,88.4
81290,Alsace,red wine,88.4


In [33]:
Final_red_bubble2 = Final_red_bubble.loc[Final_red_bubble.duplicated(keep='last'), :]
Final_red_bubble3 = Final_red_bubble2.drop_duplicates()
Final_red_bubble3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 80896
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   province    55 non-null     object 
 1   type        55 non-null     object 
 2   avg_points  55 non-null     float64
dtypes: float64(1), object(2)
memory usage: 1.7+ KB


In [34]:
white_wine_bubble = wine_bubble.loc[wine_bubble.type == 'white wine'].sort_values(by=['province'], ascending=False)
white_wine_bubble2 = white_wine_bubble.groupby('province').mean('points').round(1)

merge_df4 = pd.merge(white_wine_bubble, white_wine_bubble2, on="province")
merge_df5 = merge_df4.rename(columns = {'points_y': 'avg_points'})
Final_white_bubble = merge_df5.loc[:, ["province", "type", "avg_points"]]
Final_white_bubble

Unnamed: 0,province,type,avg_points
0,Western Australia,white wine,87.1
1,Western Australia,white wine,87.1
2,Western Australia,white wine,87.1
3,Western Australia,white wine,87.1
4,Western Australia,white wine,87.1
...,...,...,...
32574,Alsace,white wine,88.7
32575,Alsace,white wine,88.7
32576,Alsace,white wine,88.7
32577,Alsace,white wine,88.7


In [35]:
Final_white_bubble2 = Final_white_bubble.loc[Final_red_bubble.duplicated(keep='last'), :]
Final_white_bubble3 = Final_white_bubble2.drop_duplicates()
Final_white_bubble3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57 entries, 0 to 31622
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   province    57 non-null     object 
 1   type        57 non-null     object 
 2   avg_points  57 non-null     float64
dtypes: float64(1), object(2)
memory usage: 1.8+ KB


In [36]:
Final_wine_bubble = pd.concat([Final_red_bubble3, Final_white_bubble3], axis= 0, ignore_index=True)
Final_wine_bubble

Unnamed: 0,province,type,avg_points
0,Western Australia,red wine,88.1
1,Washington,red wine,89.0
2,Virginia,red wine,85.3
3,Victoria,red wine,87.7
4,Veneto,red wine,88.7
...,...,...,...
107,Beaujolais,white wine,84.2
108,Australia Other,white wine,84.8
109,Arizona,white wine,84.4
110,Andalucia,white wine,89.1


## Export Subset to Json

In [None]:
# Final_wine_bubble.to_json('Bubble_Wine2.js', index=True)

In [37]:
Final_red_bubble4 = Final_red_bubble3.loc[:, ["avg_points", "province"]]


In [38]:
Final_white_bubble4 = Final_white_bubble3.loc[:, ["avg_points", "province"]]


## Transforming Scattered Bubble Data

In [39]:
scaler = MinMaxScaler()

In [40]:
white_final = Final_white_bubble4.copy()
white_final.columns = ["tooltip", "name"]
white_final["value"] = scaler.fit_transform(white_final["tooltip"].values.reshape(-1,1))*100

white_data = json.loads(white_final.to_json(orient="records"))
white_data_final = {
    "name":"white wine",
    "data": white_data
}
white_data_final

{'name': 'white wine',
 'data': [{'tooltip': 87.1,
   'name': 'Western Australia',
   'value': 64.2105263158},
  {'tooltip': 88.4, 'name': 'Washington', 'value': 77.8947368421},
  {'tooltip': 84.7, 'name': 'Virginia', 'value': 38.9473684211},
  {'tooltip': 88.6, 'name': 'Victoria', 'value': 80.0},
  {'tooltip': 86.6, 'name': 'Veneto', 'value': 58.9473684211},
  {'tooltip': 87.7, 'name': 'Tuscany', 'value': 70.5263157895},
  {'tooltip': 84.7, 'name': 'Texas', 'value': 38.9473684211},
  {'tooltip': 88.0, 'name': 'Tasmania', 'value': 73.6842105263},
  {'tooltip': 85.0, 'name': 'Spanish Islands', 'value': 42.1052631579},
  {'tooltip': 84.9, 'name': 'Spain Other', 'value': 41.0526315789},
  {'tooltip': 87.4, 'name': 'Southwest France', 'value': 67.3684210526},
  {'tooltip': 87.3, 'name': 'Southern Italy', 'value': 66.3157894737},
  {'tooltip': 87.2, 'name': 'South Australia', 'value': 65.2631578947},
  {'tooltip': 87.6, 'name': 'Sicily & Sardinia', 'value': 69.4736842105},
  {'tooltip': 88.

In [41]:
red_final = Final_red_bubble4.copy()
red_final.columns = ["tooltip", "name"]
red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

red_data = json.loads(red_final.to_json(orient="records"))
red_data_final = {
    "name":"red wine",
    "data": red_data
}
red_data_final

{'name': 'red wine',
 'data': [{'tooltip': 88.1,
   'name': 'Western Australia',
   'value': 79.4871794872},
  {'tooltip': 89.0, 'name': 'Washington', 'value': 91.0256410256},
  {'tooltip': 85.3, 'name': 'Virginia', 'value': 43.5897435897},
  {'tooltip': 87.7, 'name': 'Victoria', 'value': 74.358974359},
  {'tooltip': 88.7, 'name': 'Veneto', 'value': 87.1794871795},
  {'tooltip': 89.5, 'name': 'Tuscany', 'value': 97.4358974359},
  {'tooltip': 85.8, 'name': 'Texas', 'value': 50.0},
  {'tooltip': 87.6, 'name': 'Tasmania', 'value': 73.0769230769},
  {'tooltip': 88.7, 'name': 'Spanish Islands', 'value': 87.1794871795},
  {'tooltip': 84.8, 'name': 'Spain Other', 'value': 37.1794871795},
  {'tooltip': 88.6, 'name': 'Southwest France', 'value': 85.8974358974},
  {'tooltip': 87.7, 'name': 'Southern Italy', 'value': 74.358974359},
  {'tooltip': 89.0, 'name': 'South Australia', 'value': 91.0256410256},
  {'tooltip': 88.0, 'name': 'Sicily & Sardinia', 'value': 78.2051282051},
  {'tooltip': 89.0, '

In [None]:
#Final_red_bubble4.to_json('Final_red_bubble4.js', index=False)
#Final_white_bubble4.to_json('Final_white_bubble4.js', index=False)

In [None]:
#final = [white_data_final, red_data_final]
#final

In [None]:
#white_wine_dict = dict(zip(Final_white_bubble4.province, Final_white_bubble4.avg_points))
#white_wine_dict

In [None]:
#red_wine_dict = dict(zip(Final_red_bubble4.province, Final_red_bubble4.avg_points))
#red_wine_dict

In [42]:
sunburst_Wine_Data = Final_Wine_Data.loc[:, ["type", "variety", "province", "winery", ]]
sunburst_Wine_Data.describe()

Unnamed: 0,type,variety,province,winery
count,113871,113871,113871,113871
unique,2,266,58,11808
top,red wine,Pinot Noir,California,Williams Selyem
freq,81292,12193,44294,371


In [None]:
sun_final = sunburst_Wine_Data.copy()
sun_final.columns = ["type", "variety", "province", "winery"]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_final = json.loads(sun_final.to_json(orient="records"))
sun_data_final = {
    "type":"Wine Type",
    "variety":"Wine Variety",
    "province": "Province",
    "winery":"Winery",
    "data": sun_final
}
sun_data_final

In [40]:
sunburtst_type = sunburst_Wine_Data["type"].value_counts().to_frame().reset_index(drop=False)
sunburtst_type = sunburtst_type.rename(columns = {'index': 'wine_type', 'type':'type_count'})
sunburtst_type

Unnamed: 0,wine_type,type_count
0,red wine,81292
1,white wine,32579


In [41]:
sunburtst_type2 = sunburst_Wine_Data["variety"].value_counts().to_frame().reset_index(drop=False)
sunburtst_type2

Unnamed: 0,index,variety
0,Pinot Noir,12193
1,Chardonnay,12103
2,Cabernet Sauvignon,10955
3,Red Blend,8181
4,Syrah,5075
...,...,...
261,Saperavi,3
262,Touriga Franca,2
263,Madeira Blend,1
264,Muskat Ottonel,1


In [42]:
sun_type_final = sunburtst_type.copy()
sun_type_final.columns = ["count", "name"]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_final = json.loads(sun_type_final.to_json())
sun_data_final = {
    "count":"type_count",
    "name":"wine_type",
    "childern": sun_type_final
}
sun_data_final

{'count': 'type_count',
 'name': 'wine_type',
 'childern':         count   name
 0    red wine  81292
 1  white wine  32579}

In [170]:
sunburst_red = sunburst_Wine_Data.loc[sunburst_Wine_Data.type == 'red wine']
sunburst_white = sunburst_Wine_Data.loc[sunburst_Wine_Data.type == 'white wine']
sunburst_red.describe()

Unnamed: 0,type,variety,province,winery
count,81292,81292,81292,81292
unique,1,150,58,10283
top,red wine,Pinot Noir,California,Williams Selyem
freq,81292,12193,32388,302


In [185]:
group_data = sunburst_red.groupby(['type','variety','province']).size()
group_data

type      variety    province       
red wine  Aglianico  California          11
                     Southern Italy     248
          Albariño   California          77
                     Catalonia            2
                     Galicia            438
                                       ... 
          Zinfandel  South Australia      5
                     Texas                1
                     Washington          73
          Zweigelt   New York             1
                     Washington           2
Length: 873, dtype: int64

In [187]:
group_data.to_csv('group_data_red.csv', index=True)


In [None]:
group_data.to_json('group_data.js', index=True)

In [44]:
sunburst_red_variety = sunburst_red.value_counts(['variety', 'province', 'winery']).to_frame().reset_index(drop=False)
sunburst_red_variety = sunburst_red_variety.rename(columns = {0: 'count'})
sunburst_red_variety

sunburst_red_variety = sunburst_red_variety.loc[:, ["winery", "count" ]]
sunburst_red_variety

Unnamed: 0,winery,count
0,Williams Selyem,244
1,Testarossa,148
2,Bouchard Père & Fils,117
3,Calera,98
4,Siduri,96
...,...,...
20794,Sarah's Vineyard,1
20795,Talaria,1
20796,Standing Sun,1
20797,Summerland,1


## Format for count of winery's  - Red Wine

In [105]:
sunburst_red_winery = sunburst_red.value_counts(['variety', 'province', 'winery']).to_frame().reset_index(drop=False)
sunburst_red_winery = sunburst_red_winery.rename(columns = {0: 'count'})
sunburst_red_winery

sunburst_red_winery = sunburst_red_winery.loc[:, ["winery", "count" ]]
sunburst_red_winery

Unnamed: 0,winery,count
0,Williams Selyem,244
1,Testarossa,148
2,Bouchard Père & Fils,117
3,Calera,98
4,Siduri,96
...,...,...
20794,Sarah's Vineyard,1
20795,Talaria,1
20796,Standing Sun,1
20797,Summerland,1


In [160]:
sunburst_red_province = sunburst_red.value_counts(['province']).to_frame().reset_index(drop=False)
sunburst_red_province = sunburst_red_province.rename(columns = {0: 'count'})
sunburst_red_province

sunburst_red_province = sunburst_red_province.loc[:, ["province", "count" ]]
sunburst_red_province

Unnamed: 0,province,count
0,California,32388
1,Washington,7360
2,Tuscany,5583
3,Mendoza Province,4009
4,Northern Spain,3982
5,Oregon,3268
6,Piedmont,2562
7,South Australia,2222
8,Bordeaux,2162
9,Veneto,1668


In [107]:
sun_red2_province = sunburst_red_province.copy()
sun_red2_province.columns = ["name", "value" ]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_red2_province = json.loads(sun_red2_province.to_json(orient="records"))
province_red2_final = {
#     "name":"Wines",
#     "children":[{"count":21007, "name":"Red Wine", 
#     "children":[{"count":150, "name":"variety",
    "count":873,
    "name":"Province",
    "children": sun_red2_province
}
# ]}]}
province_red2_final

{'count': 873,
 'name': 'Province',
 'children': [{'name': 'California', 'value': 32388},
  {'name': 'Washington', 'value': 7360},
  {'name': 'Tuscany', 'value': 5583},
  {'name': 'Mendoza Province', 'value': 4009},
  {'name': 'Northern Spain', 'value': 3982},
  {'name': 'Oregon', 'value': 3268},
  {'name': 'Piedmont', 'value': 2562},
  {'name': 'South Australia', 'value': 2222},
  {'name': 'Bordeaux', 'value': 2162},
  {'name': 'Veneto', 'value': 1668},
  {'name': 'Sicily & Sardinia', 'value': 1384},
  {'name': 'Burgundy', 'value': 1361},
  {'name': 'Catalonia', 'value': 1152},
  {'name': 'New York', 'value': 1120},
  {'name': 'Rhône Valley', 'value': 924},
  {'name': 'Southwest France', 'value': 912},
  {'name': 'Central Italy', 'value': 900},
  {'name': 'Southern Italy', 'value': 842},
  {'name': 'Provence', 'value': 790},
  {'name': 'Languedoc-Roussillon', 'value': 766},
  {'name': 'Other', 'value': 755},
  {'name': 'Central Spain', 'value': 622},
  {'name': 'Galicia', 'value': 466

In [108]:
sunburst_red_variety = sunburst_red.value_counts(['variety']).to_frame().reset_index(drop=False)
sunburst_red_variety = sunburst_red_variety.rename(columns = {0: 'count'})
sunburst_red_variety

sunburst_red_variety = sunburst_red_variety.loc[:, ["variety", "count" ]]
sunburst_red_variety

Unnamed: 0,variety,count
0,Pinot Noir,12193
1,Cabernet Sauvignon,10955
2,Red Blend,8181
3,Syrah,5075
4,Merlot,4291
...,...,...
145,Cabernet Sauvignon-Carmenère,4
146,Saperavi,3
147,Zweigelt,3
148,Touriga Franca,2


In [109]:
sun_red2_Variety = sunburst_red_variety.copy()
sun_red2_Variety.columns = ["name", "value" ]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_red2_Variety = json.loads(sun_red2_Variety.to_json(orient="records"))
sun_data_red2_Variety = {
    "name":"Wines",
    "children":[{"count":21007, "name":"Red Wine", 
#     "children":[{"count":150, "name":"variety",
#     "children":[{"count":873, "name":"Province",
    "count":150,
    "value":"Variety",
    "children": sun_red2_Variety
}]}
# ]}]}
sun_data_red2_Variety

{'name': 'Wines',
 'children': [{'count': 150,
   'name': 'Red Wine',
   'value': 'Variety',
   'children': [{'name': 'Pinot Noir', 'value': 12193},
    {'name': 'Cabernet Sauvignon', 'value': 10955},
    {'name': 'Red Blend', 'value': 8181},
    {'name': 'Syrah', 'value': 5075},
    {'name': 'Merlot', 'value': 4291},
    {'name': 'Bordeaux-style Red Blend', 'value': 4213},
    {'name': 'Zinfandel', 'value': 3790},
    {'name': 'Malbec', 'value': 2960},
    {'name': 'Sangiovese', 'value': 2874},
    {'name': 'Tempranillo', 'value': 2515},
    {'name': 'Rosé', 'value': 2148},
    {'name': 'Sparkling Blend', 'value': 1745},
    {'name': 'Shiraz', 'value': 1592},
    {'name': 'Nebbiolo', 'value': 1525},
    {'name': 'Rhône-style Red Blend', 'value': 1434},
    {'name': 'Corvina, Rondinella, Molinara', 'value': 1292},
    {'name': 'Cabernet Franc', 'value': 1257},
    {'name': 'Barbera', 'value': 959},
    {'name': 'Sangiovese Grosso', 'value': 938},
    {'name': 'Petite Sirah', 'value': 8

In [110]:
sun_red2_Winery = sunburst_red_winery.copy()
sun_red2_Winery.columns = ["name", "value" ]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_red2_Winery = json.loads(sun_red2_Winery.to_json(orient="records"))
sun_data_red2_Winery = {
#     "name":"Wines",
#     "children":[{"count":21007, "name":"Red Wine", 
#     "children":[{"count":150, "name":"variety",
#     "children":[{"count":873, "name":"Province",
    "count":20799,
    "value":"Winery",
    "children": sun_red2_Winery
}
# ]}]}]}
sun_data_red2_Winery

{'count': 20799,
 'value': 'Winery',
 'children': [{'name': 'Williams Selyem', 'value': 244},
  {'name': 'Testarossa', 'value': 148},
  {'name': 'Bouchard Père & Fils', 'value': 117},
  {'name': 'Calera', 'value': 98},
  {'name': 'Siduri', 'value': 96},
  {'name': 'Merry Edwards', 'value': 90},
  {'name': 'Ken Wright', 'value': 89},
  {'name': 'Renwood', 'value': 87},
  {'name': 'Freixenet', 'value': 86},
  {'name': 'Peachy Canyon', 'value': 83},
  {'name': 'Erath', 'value': 77},
  {'name': 'Trapiche', 'value': 77},
  {'name': 'Joseph Drouhin', 'value': 76},
  {'name': 'MacPhail', 'value': 75},
  {'name': 'Willamette Valley Vineyards', 'value': 75},
  {'name': 'Louis Latour', 'value': 73},
  {'name': 'Arista', 'value': 73},
  {'name': 'Louis Jadot', 'value': 73},
  {'name': 'Gary Farrell', 'value': 70},
  {'name': 'Roessler', 'value': 69},
  {'name': 'Rex Hill', 'value': 65},
  {'name': 'Schramsberg', 'value': 65},
  {'name': 'Raymond', 'value': 64},
  {'name': 'Segura Viudas', 'value'

In [117]:
sun_red2_final = sunburst_red_variety.copy()
sun_red2_final.columns = ["name", "value" ]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_red2_final = json.loads(sun_red2_final.to_json(orient="records"))
sun_data_red3_final = {
    "name":"Wines",
    "children":[{"count":21007, "name":"Red Wine", 
    "children":[{"count":150, "name":"variety",
    "children": sun_data_red2_Variety,
    "children":[{"count":873, "name":"Province",
    "children": province_red2_final,
    "children":[{"count":20799,"name":"Winery",
    "children": sun_red2_final
                 
}]}]}]}]}
sun_data_red3_final

{'name': 'Wines',
 'children': [{'count': 21007,
   'name': 'Red Wine',
   'children': [{'count': 150,
     'name': 'variety',
     'children': [{'count': 873,
       'name': 'Province',
       'children': [{'count': 20799,
         'name': 'Winery',
         'children': [{'name': 'Pinot Noir', 'value': 12193},
          {'name': 'Cabernet Sauvignon', 'value': 10955},
          {'name': 'Red Blend', 'value': 8181},
          {'name': 'Syrah', 'value': 5075},
          {'name': 'Merlot', 'value': 4291},
          {'name': 'Bordeaux-style Red Blend', 'value': 4213},
          {'name': 'Zinfandel', 'value': 3790},
          {'name': 'Malbec', 'value': 2960},
          {'name': 'Sangiovese', 'value': 2874},
          {'name': 'Tempranillo', 'value': 2515},
          {'name': 'Rosé', 'value': 2148},
          {'name': 'Sparkling Blend', 'value': 1745},
          {'name': 'Shiraz', 'value': 1592},
          {'name': 'Nebbiolo', 'value': 1525},
          {'name': 'Rhône-style Red Blend', 'val

In [112]:
with open('sun_data_red3_final.json', 'w') as fp:
    json.dump(sun_data_red2_final, fp)

# Format for count of winery's  - White Wine

In [161]:
sunburst_white_variety = sunburst_white.value_counts(['variety', 'province', 'winery']).to_frame().reset_index(drop=False)
sunburst_white_variety = sunburst_white_variety.rename(columns = {0: 'count'})

sunburst_white_variety = sunburst_white_variety.loc[:, ["winery", "count" ]]
sunburst_white_variety

Unnamed: 0,winery,count
0,Testarossa,112
1,Joseph Drouhin,103
2,Olivier Leflaive,90
3,Louis Latour,80
4,Bouchard Père & Fils,75
...,...,...
8953,Shoofly,1
8954,1070 Green,1
8955,39°,1
8956,75 Wine Co.,1


In [162]:
sun_red_final = sunburst_red_variety.copy()
sun_red_final.columns = ["name", "value" ]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_red_final = json.loads(sun_red_final.to_json(orient="records"))
sun_data_red_final = {
    "name":"Wines",
    "children":[{"count":30138, "name":"Red Wine", 
    "children":[{"count":150, "name":"variety",
    "children":[{"count":58, "name":"Province",
    "count":20799,
    "name":"Winery",
    "children": sun_red_final
}]}]}]}
sun_data_red_final

{'name': 'Wines',
 'children': [{'count': 30138,
   'name': 'Red Wine',
   'children': [{'count': 150,
     'name': 'variety',
     'children': [{'count': 20799,
       'name': 'Winery',
       'children': [{'name': 'Pinot Noir', 'value': 12193},
        {'name': 'Cabernet Sauvignon', 'value': 10955},
        {'name': 'Red Blend', 'value': 8181},
        {'name': 'Syrah', 'value': 5075},
        {'name': 'Merlot', 'value': 4291},
        {'name': 'Bordeaux-style Red Blend', 'value': 4213},
        {'name': 'Zinfandel', 'value': 3790},
        {'name': 'Malbec', 'value': 2960},
        {'name': 'Sangiovese', 'value': 2874},
        {'name': 'Tempranillo', 'value': 2515},
        {'name': 'Rosé', 'value': 2148},
        {'name': 'Sparkling Blend', 'value': 1745},
        {'name': 'Shiraz', 'value': 1592},
        {'name': 'Nebbiolo', 'value': 1525},
        {'name': 'Rhône-style Red Blend', 'value': 1434},
        {'name': 'Corvina, Rondinella, Molinara', 'value': 1292},
        {'name':

In [None]:
sun_white_final = sunburst_white_variety.copy()
sun_white_final.columns = ["name", "value"]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_white_final = json.loads(sun_white_final.to_json(orient="records"))
sun_data_white_final = {
    "children":[{"count":9131, "name":"White Wine", 
    "children":[{"count":116, "name":"variety",
    "children":[{"count":57, "name":"Province",
    "count":8958,
    "name":"Winery",
    "children": sun_white_final
}]}]}]}
sun_data_white_final

In [None]:
with open('data_white.json', 'w') as fp:
    json.dump(sun_data_white_final, fp)

In [None]:
with open('data_red.json', 'w') as fp:
    json.dump(sun_data_red_final, fp)

In [None]:
sunburtst_variety_red = sunburst_red["variety"].value_counts().to_frame()
sunburtst_variety_white = sunburst_white["variety"].value_counts().to_frame()
sunburtst_variety_red.info()

In [None]:
sunburtst_variety_white.info()

In [None]:
sunburtst_province_red = sunburst_red["province"].value_counts().to_frame()
sunburtst_province_white = sunburst_white["province"].value_counts().to_frame()
sunburtst_province_red.info()

In [None]:
sunburtst_province_white.info()

In [None]:
sunburtst_winery_red = sunburst_red["winery"].value_counts().to_frame()
sunburtst_winery_white = sunburst_white["winery"].value_counts().to_frame()
sunburtst_winery_white

In [None]:
sunburst_white_province = sunburst_white.value_counts([ 'province']).to_frame().reset_index(drop=False)
sunburst_white_province = sunburst_white_variety.rename(columns = {0: 'count'})

# sunburst_white_variety = sunburst_white_variety.loc[:, ["province", "count" ]]
sunburst_white_province

In [None]:
sunburst_red_province = sunburst_red.value_counts([ 'province']).to_frame().reset_index(drop=False)
sunburst_red_province = sunburst_red_variety.rename(columns = {0: 'count'})

# sunburst_red_variety = sunburst_red_variety.loc[:, ["province", "count" ]]
sunburst_red_province

In [None]:
sun_red_province = sunburst_red_province.copy()
sun_red_province.columns = ["name", "value" ]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_red_province = json.loads(sun_red_province.to_json(orient="records"))
sun_data_red_province_final = {
    "name":"Wines",
    "children":[{"count":208, "name":"Red Wine", 
    "children":[{"count":150, "name":"variety",
    "count":58,
    "value":"Province",
    "children": sun_red_province
}]}]}
sun_data_red_province_final

In [None]:
sun_white_province_final = sunburst_white_variety.copy()
sun_white_province_final.columns = ["name", "value"]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sun_white_province_final = json.loads(sun_white_province_final.to_json(orient="records"))
sun_data_white_province_final = {
    "children":[{"count":173, "name":"White Wine", 
    "children":[{"count":116, "name":"variety",
    "count":57,
    "value":"Winery",
    "children": sun_white_final
}]}]}
sun_data_white_province_final

In [125]:
Top_100_Red
Top_100_Red_sb = Top_100_Red.loc[:, ["type", "variety", "province", "winery", ]]
Top_100_Red_sb.describe()


Unnamed: 0,type,variety,province,winery
count,100,100,100,100
unique,1,15,9,40
top,red wine,Syrah,Tuscany,Le Macchiole
freq,100,19,37,13


In [128]:
sb_variety_100 = Top_100_Red_sb.value_counts(['variety']).to_frame().reset_index(drop=False)
sb_variety_100 = sunburst_red_variety.rename(columns = {0: 'count'})


sb_variety_100 = sb_variety_100.loc[:, ["variety", "count" ]]
sb_variety_100

Unnamed: 0,variety,count
0,Pinot Noir,12193
1,Cabernet Sauvignon,10955
2,Red Blend,8181
3,Syrah,5075
4,Merlot,4291
...,...,...
145,Cabernet Sauvignon-Carmenère,4
146,Saperavi,3
147,Zweigelt,3
148,Touriga Franca,2


In [129]:
sb_red_province = sb_variety_100.copy()
sb_red_province.columns = ["name", "value" ]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sb_red_province = json.loads(sb_red_province.to_json(orient="records"))
sb_red_province_final = {
    "name":"Wines",
    "children":[{"count":208, "name":"Red Wine", 
    "children":[{"count":150, "name":"variety",
    "count":58,
    "value":"Province",
    "children": sb_red_province
}]}]}
sb_red_province_final

{'name': 'Wines',
 'children': [{'count': 208,
   'name': 'Red Wine',
   'children': [{'count': 58,
     'name': 'variety',
     'value': 'Province',
     'children': [{'name': 'Pinot Noir', 'value': 12193},
      {'name': 'Cabernet Sauvignon', 'value': 10955},
      {'name': 'Red Blend', 'value': 8181},
      {'name': 'Syrah', 'value': 5075},
      {'name': 'Merlot', 'value': 4291},
      {'name': 'Bordeaux-style Red Blend', 'value': 4213},
      {'name': 'Zinfandel', 'value': 3790},
      {'name': 'Malbec', 'value': 2960},
      {'name': 'Sangiovese', 'value': 2874},
      {'name': 'Tempranillo', 'value': 2515},
      {'name': 'Rosé', 'value': 2148},
      {'name': 'Sparkling Blend', 'value': 1745},
      {'name': 'Shiraz', 'value': 1592},
      {'name': 'Nebbiolo', 'value': 1525},
      {'name': 'Rhône-style Red Blend', 'value': 1434},
      {'name': 'Corvina, Rondinella, Molinara', 'value': 1292},
      {'name': 'Cabernet Franc', 'value': 1257},
      {'name': 'Barbera', 'value': 9

In [51]:
Top_100_Red
Top_100_Red_sb = Top_100_Red.loc[:, ["type", "variety", "province", "winery", ]]
Top_100_Red_sb.describe()

Unnamed: 0,type,variety,province,winery
count,100,100,100,100
unique,1,15,9,40
top,red wine,Syrah,Tuscany,Le Macchiole
freq,100,19,37,13


In [53]:
group_data_red = Top_100_Red_sb.groupby(['type','variety','province']).size()
group_data_red

type      variety                   province       
red wine  Bordeaux-style Red Blend  Bordeaux            1
                                    California          4
                                    Washington          2
          Cabernet Blend            California          6
          Cabernet Franc            Tuscany             4
          Cabernet Sauvignon        California          9
                                    Oregon              3
                                    Washington          1
          Merlot                    Tuscany            14
          Nebbiolo                  Piedmont            4
          Pinot Noir                California         11
          Prugnolo Gentile          Tuscany             5
          Red Blend                 California          1
                                    Tuscany             8
          Sangiovese                Tuscany             2
          Sangiovese Grosso         Tuscany             1
          Shiraz    

In [46]:
Top_100_White
Top_100_White_sb = Top_100_White.loc[:, ["type", "variety", "province", "winery", ]]
Top_100_White_sb.describe()

Unnamed: 0,type,variety,province,winery
count,100,100,100,100
unique,1,9,8,36
top,white wine,Chardonnay,Champagne,Domaine Leflaive
freq,100,39,29,10


In [50]:
group_data = Top_100_White_sb.groupby(['type','variety','province']).size()
group_data

type        variety                     province          
white wine  Bordeaux-style White Blend  Bordeaux              12
            Champagne Blend             California             1
                                        Champagne             24
            Chardonnay                  Burgundy              23
                                        California            11
                                        Champagne              5
            Muscadel                    Victoria               3
            Muscat                      Victoria               7
            Picolit                     Northeastern Italy     2
            Pinot Gris                  Alsace                 4
            Tokay                       Victoria               5
            White Blend                 Tuscany                3
dtype: int64

In [154]:
group_data.to_csv('group_data.csv', index=True)

In [155]:
group_data.to_json('group_data.js', index=True)

In [140]:
Top_100_White_sb.groupby('type')[['variety','province','winery']].apply(lambda g: g.values.tolist()).to_dict()

{'red wine': [['Syrah', 'Oregon', 'Cayuse'],
  ['Merlot', 'Tuscany', "Tenuta dell'Ornellaia"],
  ['Cabernet Sauvignon', 'California', 'Shafer'],
  ['Syrah', 'Oregon', 'Cayuse'],
  ['Pinot Noir', 'California', 'Williams Selyem'],
  ['Prugnolo Gentile', 'Tuscany', 'Avignonesi'],
  ['Cabernet Sauvignon', 'California', 'Cardinale'],
  ['Pinot Noir', 'California', 'Williams Selyem'],
  ['Cabernet Blend', 'California', 'Sloan'],
  ['Merlot', 'Tuscany', "Tenuta dell'Ornellaia"],
  ['Red Blend', 'Tuscany', 'Giovanni Chiappini'],
  ['Syrah', 'Oregon', 'Cayuse'],
  ['Red Blend', 'Tuscany', 'Giovanni Chiappini'],
  ['Cabernet Blend', 'California', 'Sloan'],
  ['Merlot', 'Tuscany', "Tenuta dell'Ornellaia"],
  ['Cabernet Sauvignon', 'California', 'Cardinale'],
  ['Prugnolo Gentile', 'Tuscany', 'Avignonesi'],
  ['Syrah', 'Washington', 'Charles Smith'],
  ['Nebbiolo', 'Piedmont', 'Mascarello Giuseppe e Figlio'],
  ['Red Blend', 'California', 'Colgin'],
  ['Merlot', 'Tuscany', 'Le Macchiole'],
  ['Pru

In [141]:
sb_variety_100_f = Top_100_White_sb.value_counts(['variety']).to_frame().reset_index(drop=False)
sb_variety_100_f = sb_variety_100_f.rename(columns = {0: 'count'})


sb_variety_100_f = sb_variety_100_f.loc[:, ["variety", "count" ]]
sb_variety_100_f

Unnamed: 0,variety,count
0,Syrah,19
1,Merlot,14
2,Cabernet Sauvignon,13
3,Pinot Noir,11
4,Red Blend,9
5,Bordeaux-style Red Blend,7
6,Cabernet Blend,6
7,Prugnolo Gentile,5
8,Nebbiolo,4
9,Cabernet Franc,4


In [142]:
sb_red_variety_f = sb_variety_100_f.copy()
sb_red_variety_f.columns = ["name", "value" ]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sb_red_variety_f = json.loads(sb_red_variety_f.to_json(orient="records"))
sb_red_variety_fi = {
    "name":"Wines",
    "children":[{"count":208, "name":"Red Wine", 
#     "children":[{"count":150, "name":"variety",
    "count":15,
    "value":"Variety",
    "children": sb_red_variety_f
}]}
# ]}
sb_red_variety_fi

{'name': 'Wines',
 'children': [{'count': 15,
   'name': 'Red Wine',
   'value': 'Variety',
   'children': [{'name': 'Syrah', 'value': 19},
    {'name': 'Merlot', 'value': 14},
    {'name': 'Cabernet Sauvignon', 'value': 13},
    {'name': 'Pinot Noir', 'value': 11},
    {'name': 'Red Blend', 'value': 9},
    {'name': 'Bordeaux-style Red Blend', 'value': 7},
    {'name': 'Cabernet Blend', 'value': 6},
    {'name': 'Prugnolo Gentile', 'value': 5},
    {'name': 'Nebbiolo', 'value': 4},
    {'name': 'Cabernet Franc', 'value': 4},
    {'name': 'Sparkling Blend', 'value': 2},
    {'name': 'Shiraz', 'value': 2},
    {'name': 'Sangiovese', 'value': 2},
    {'name': 'Tinto Fino', 'value': 1},
    {'name': 'Sangiovese Grosso', 'value': 1}]}]}

In [147]:
sb_province_100_f = Top_100_White_sb.value_counts(['variety','province']).to_frame().reset_index(drop=False)
sb_province_100_f = sb_province_100_f.rename(columns = {0: 'count'})


# sb_province_100_f = sb_province_100_f.loc[:, ["province", "count" ]]
sb_province_100_f

Unnamed: 0,variety,province,count
0,Merlot,Tuscany,14
1,Pinot Noir,California,11
2,Cabernet Sauvignon,California,9
3,Red Blend,Tuscany,8
4,Syrah,Oregon,7
5,Cabernet Blend,California,6
6,Prugnolo Gentile,Tuscany,5
7,Syrah,Washington,4
8,Cabernet Franc,Tuscany,4
9,Bordeaux-style Red Blend,California,4


In [145]:
sb_red_province_f = sb_province_100_f.copy()
sb_red_province_f.columns = ["name", "value" ]
# red_final["value"] = scaler.fit_transform(red_final["tooltip"].values.reshape(-1,1))*100

sb_red_province_f = json.loads(sb_red_province_f.to_json(orient="records"))
sb_red_province_fi = {
#     "name":"Wines",
#     "children":[{"count":208, "name":"Red Wine", 
#     "children":[{"count":150, "name":"variety",
    "count":15,
    "value":"Province",
    "children": sb_red_province_f
}
# ]}]}
sb_red_province_fi

{'count': 15,
 'value': 'Province',
 'children': [{'name': 'Tuscany', 'value': 37},
  {'name': 'California', 'value': 33},
  {'name': 'Oregon', 'value': 10},
  {'name': 'Washington', 'value': 7},
  {'name': 'Rhône Valley', 'value': 4},
  {'name': 'Piedmont', 'value': 4},
  {'name': 'South Australia', 'value': 3},
  {'name': 'Northern Spain', 'value': 1},
  {'name': 'Bordeaux', 'value': 1}]}

In [132]:
Top_100_White_sb.to_json('Top_100_White_sb.js', index=False)


ValueError: 'index=False' is only valid when 'orient' is 'split' or 'table'