# Scrape data, build dataset

http://www.sakayanyc.com/shop_all.php?pg=all    
https://www.sakeno.com/ranking/
https://www.saketime.jp/ranking/    

In [1]:
import requests
import pandas as pd
import re
import numpy as np
import os

import math
import statistics

from bs4 import BeautifulSoup

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
matplotlib.rcParams['pdf.fonttype'] = 42

%matplotlib inline

In [2]:
url = f'http://www.sakayanyc.com/shop_all.php?pg=all'
response = requests.get(url)
doc = BeautifulSoup(response.text)

In [3]:
sakes = doc.find_all('td', style='width: 140px; vertical-align: top; padding-bottom: 20px')
rows = []

# the last 'td' on the website is not an actual product, so skip it
for sake in sakes[:-1]:
    row = {}
    row['name'] = sake.find(class_='shop_browse_title').text.strip()
    info = sake.find(class_='shop_browse_info').text.strip()
    row['sake_type'] = info.split('$')[0]
    row['cost'] = info.split('$')[1]
    row['slug'] = sake.a['href']
    rows.append(row)

In [4]:
df = pd.DataFrame(rows)

In [5]:
df.head(3)

Unnamed: 0,cost,name,sake_type,slug
0,38.99/720 ml,Aka Kirishima,(Imo Shochu - Miyazaki),shop_all.php?prod_id=242
1,10.99/180 ml,"Akishika ""Bambi""",(Junmai - Osaka),shop_all.php?prod_id=206
2,15.99/300 ml,"Akitabare Koshiki Junzukuri ""Northern Skies""",(Junmai - Akita),shop_all.php?prod_id=163


## Clean up the data

* get rid of () in sake_type
* sake_type -- break it up into the actual sake type and the prefecture where it's from

In [6]:
df.head(1)

Unnamed: 0,cost,name,sake_type,slug
0,38.99/720 ml,Aka Kirishima,(Imo Shochu - Miyazaki),shop_all.php?prod_id=242


In [7]:
df['sake_type'] = df.sake_type.str.replace('(', '')
df['sake_type'] = df.sake_type.str.replace(')', '')
df.sake_type.head(3)

0    Imo Shochu - Miyazaki
1           Junmai - Osaka
2           Junmai - Akita
Name: sake_type, dtype: object

In [8]:
df['sake_type_cleaned'] = df.sake_type.str.split(' - ').str.get(0)
df['prefecture'] = df.sake_type.str.split(' - ').str.get(1)

In [9]:
df['sake_type_cleaned'] = df['sake_type_cleaned'].str.strip()
df['sake_type_cleaned'].value_counts()

Junmai Ginjo                              47
Junmai                                    28
Junmai Daiginjo                           25
Tokubetsu Junmai                          17
Daiginjo                                   9
Mugi Shochu                                8
Imo Shochu                                 8
Ginjo                                      7
Junmai Kimoto                              5
Junmai Yamahai                             5
Junmai Yamahai                             4
Honjozo                                    4
Junmai Kimoto                              3
Junmai Daiginjo Muroka Genshu              2
Kome Shochu                                2
Junmai Daiginjo Yamahai                    2
Awamori                                    2
Junmai Daiginjo Kimoto                     2
Tokubetsu Junmai Kimoto                    2
Ginjo Nama Genshu                          2
Junmai Ginjo Nigori                        1
Nama Nama Genshu                           1
Ginjo Nigo

In [10]:
df['cost_cleaned'] = df.cost.str.split('/').str.get(0)

In [11]:
df['cost_cleaned'] = df.cost_cleaned.astype(float)

In [12]:
df['vol'] = df.cost.str.split('/').str.get(1).str.replace(' ml', '')
df['vol'] = df.vol.astype(float)

In [13]:
df.dtypes

cost                  object
name                  object
sake_type             object
slug                  object
sake_type_cleaned     object
prefecture            object
cost_cleaned         float64
vol                  float64
dtype: object

In [14]:
df.head(2)

Unnamed: 0,cost,name,sake_type,slug,sake_type_cleaned,prefecture,cost_cleaned,vol
0,38.99/720 ml,Aka Kirishima,Imo Shochu - Miyazaki,shop_all.php?prod_id=242,Imo Shochu,Miyazaki,38.99,720.0
1,10.99/180 ml,"Akishika ""Bambi""",Junmai - Osaka,shop_all.php?prod_id=206,Junmai,Osaka,10.99,180.0


# Get rid of umeshu, ume, yuzu, and awamori. I just want sake.

In [15]:
df = df.sort_values('sake_type_cleaned').reset_index()

In [16]:
df.tail(5)

Unnamed: 0,index,cost,name,sake_type,slug,sake_type_cleaned,prefecture,cost_cleaned,vol
210,155,30.99/720 ml,Souden,Tokubetsu Junmai Yamahai - Fukuoka,shop_all.php?prod_id=296,Tokubetsu Junmai Yamahai,Fukuoka,30.99,720.0
211,103,48.99/720 ml,"Mana 1751 ""True Vision""",Tokubetsu Junmai Yamahai Muroka Genshu - Fukui,shop_all.php?prod_id=300,Tokubetsu Junmai Yamahai Muroka Genshu,Fukui,48.99,720.0
212,196,30.99/750 ml,Ume no Yado Aragoshi Ume,Ume - Nara,shop_all.php?prod_id=169,Ume,Nara,30.99,750.0
213,110,54.99/750 ml,Mito no Kairakuen,Umeshu - Ibaraki,shop_all.php?prod_id=165,Umeshu,Ibaraki,54.99,750.0
214,195,30.99/750 ml,Ume no Yado,Yuzu - Nara,shop_all.php?prod_id=170,Yuzu,Nara,30.99,750.0


In [17]:
df.head(5)

Unnamed: 0,index,cost,name,sake_type,slug,sake_type_cleaned,prefecture,cost_cleaned,vol
0,214,53.99/750 ml,Zuisen Hiryu,Awamori - Okinawa,shop_all.php?prod_id=237,Awamori,Okinawa,53.99,750.0
1,213,34.99/750 ml,"Zuisen ""Hakuryu"" Awamori",Awamori - Okinawa,shop_all.php?prod_id=95,Awamori,Okinawa,34.99,750.0
2,106,118.99/720 ml,Masumi Yumedono,Daiginjo - Nagano,shop_all.php?prod_id=194,Daiginjo,Nagano,118.99,720.0
3,116,70.99/720 ml,Nanbu Bijin,Daiginjo - Iwate,shop_all.php?prod_id=195,Daiginjo,Iwate,70.99,720.0
4,160,88.99/720 ml,Suirakuten,Daiginjo - Akita,shop_all.php?prod_id=151,Daiginjo,Akita,88.99,720.0


In [18]:
df_sake = df[3:-3]

In [19]:
df_sake.sake_type_cleaned.value_counts()

Junmai Ginjo                              47
Junmai                                    28
Junmai Daiginjo                           25
Tokubetsu Junmai                          17
Daiginjo                                   8
Mugi Shochu                                8
Imo Shochu                                 8
Ginjo                                      7
Junmai Yamahai                             5
Junmai Kimoto                              5
Junmai Yamahai                             4
Honjozo                                    4
Junmai Kimoto                              3
Tokubetsu Junmai Kimoto                    2
Junmai Daiginjo Yamahai                    2
Ginjo Nama Genshu                          2
Junmai Daiginjo Muroka Genshu              2
Kome Shochu                                2
Junmai Daiginjo Kimoto                     2
Junmai Kimoto Nama                         1
Tokubetsu Junmai Nigori                    1
Tokubetsu Junmai Yamahai                   1
Nigori Tok

## Clean up sake_type to make the categories consistent. Also, get rid of shochu & bottles that aren't 720ml

In [20]:
df_sake = df_sake[df_sake.sake_type_cleaned.str.contains('Shochu') == False]

In [21]:
df_sake = df_sake[df_sake.cost.str.contains('720')].reset_index()

In [22]:
df_sake

Unnamed: 0,level_0,index,cost,name,sake_type,slug,sake_type_cleaned,prefecture,cost_cleaned,vol
0,3,116,70.99/720 ml,Nanbu Bijin,Daiginjo - Iwate,shop_all.php?prod_id=195,Daiginjo,Iwate,70.99,720.0
1,4,160,88.99/720 ml,Suirakuten,Daiginjo - Akita,shop_all.php?prod_id=151,Daiginjo,Akita,88.99,720.0
2,6,78,89.99/720 ml,Ken,Daiginjo - Fukushima,shop_all.php?prod_id=128,Daiginjo,Fukushima,89.99,720.0
3,7,181,55.99/720 ml,"Tedorigawa ""Ikina Onna""",Daiginjo - Ishikawa,shop_all.php?prod_id=78,Daiginjo,Ishikawa,55.99,720.0
4,8,68,72.99/720 ml,Kakurei,Daiginjo - Niigata,shop_all.php?prod_id=38,Daiginjo,Niigata,72.99,720.0
5,9,179,80.99/720 ml,"Tatsuriki ""Kome no Sasayaki"" Daiginjo",Daiginjo - Hyogo,shop_all.php?prod_id=272,Daiginjo,Hyogo,80.99,720.0
6,10,91,107.99/720 ml,Kokuryu Ryu,Daiginjo - Fukui,shop_all.php?prod_id=130,Daiginjo,Fukui,107.99,720.0
7,11,211,24.99/720 ml,Yuri Masamune,Futsu-shu - Akita,shop_all.php?prod_id=93,Futsu-shu,Akita,24.99,720.0
8,12,137,44.99/720 ml,Sawahime,Ginjo - Tochigi,shop_all.php?prod_id=297,Ginjo,Tochigi,44.99,720.0
9,13,27,40.99/720 ml,"Dewazakura ""Izumi Judan"" Tenth Degree",Ginjo - Yamagata,shop_all.php?prod_id=99,Ginjo,Yamagata,40.99,720.0


# Scrape more details for each sake


* rice
* rice polish
* alcohol
* SMV
* acidity
* amino_acid

## Scrape details for one sake

In [23]:
slug = df_sake.slug[0]
url = f'http://www.sakayanyc.com/{slug}'
response = requests.get(url)
doc = BeautifulSoup(response.text)

infos = doc.find_all(alt='@')
details = []

for info in infos[:7]:
    info = info.next_element.strip()
    details.append(info)

rice = details[0].split(':')[1].strip()
polish = details[1].split(':')[1].strip()
alcohol = details[2].split(':')[1].strip()
yeast = details[3].split(':')[1].strip()
SMV = details[4].split(':')[1].strip()
acidity = details[5].split(':')[1].strip()
amino_acid = details[6].split(':')[1].strip()

## Scrape info for all sake

In [24]:
rows = []
for slug in df_sake.slug:
    url = f'http://www.sakayanyc.com/{slug}'
    response = requests.get(url)
    doc = BeautifulSoup(response.text)

    infos = doc.find_all(alt='@')
    details = []

    for info in infos[:7]:
        info = info.next_element.strip()
        details.append(info)
        
    row = {}
    row['slug'] = slug
    row['rice'] = details[0].split(':')[1].strip()
    row['polish'] = details[1].split(':')[1].strip()
    row['alcohol'] = details[2].split(':')[1].strip()
    row['yeast'] = details[3].split(':')[1].strip()
    row['SMV'] = details[4].split(':')[1].strip()
    row['acidity'] = details[5].split(':')[1].strip()
    row['amino_acid'] = details[6].split(':')[1].strip()
    
    rows.append(row)

In [25]:
df_info = pd.DataFrame(rows)

In [26]:
df_info.head(1)

Unnamed: 0,SMV,acidity,alcohol,amino_acid,polish,rice,slug,yeast
0,5,1.3,16-17%,na,40%,Gin Otome,shop_all.php?prod_id=195,Iwate #2 & #1601


In [27]:
df_sake.head(1)

Unnamed: 0,level_0,index,cost,name,sake_type,slug,sake_type_cleaned,prefecture,cost_cleaned,vol
0,3,116,70.99/720 ml,Nanbu Bijin,Daiginjo - Iwate,shop_all.php?prod_id=195,Daiginjo,Iwate,70.99,720.0


# Merge dataframes

In [28]:
merged = df_sake.merge(df_info, left_on='slug', right_on='slug')
merged.head(2)

Unnamed: 0,level_0,index,cost,name,sake_type,slug,sake_type_cleaned,prefecture,cost_cleaned,vol,SMV,acidity,alcohol,amino_acid,polish,rice,yeast
0,3,116,70.99/720 ml,Nanbu Bijin,Daiginjo - Iwate,shop_all.php?prod_id=195,Daiginjo,Iwate,70.99,720.0,+5,1.3,16-17%,na,40%,Gin Otome,Iwate #2 & #1601
1,4,160,88.99/720 ml,Suirakuten,Daiginjo - Akita,shop_all.php?prod_id=151,Daiginjo,Akita,88.99,720.0,+3 - +5,1.1 - 1.3,15-16%,na,38%,Yamada Nishiki,#9


In [29]:
# Get rid of extraneous columns
merged = merged.loc[:, 'cost':]

## Clean up sake_type so they fall under the main, legal categories used in Japan

Categories should be:

{'Daiginjo',
 'Futsu-shu',
 'Ginjo',
 'Honjozo',
 'Junmai',
 'Junmai Daiginjo',
 'Junmai Ginjo',
 'Tokubetsu Honjozo',
 'Tokubetsu Junmai'}
 
 * Get rid of Umeshu

In [30]:
merged.sake_type_cleaned.unique()

array(['Daiginjo', 'Futsu-shu', 'Ginjo', 'Ginjo Nama Genshu', 'Honjozo',
       'Junmai', 'Junmai Daiginjo', 'Junmai Daiginjo Muroka Genshu',
       'Junmai Daiginjo Nama Genshu', 'Junmai Daiginjo\xa0Kimoto',
       'Junmai Daiginjo\xa0Yamahai', 'Junmai Genshu', 'Junmai Ginjo',
       'Junmai Ginjo Nama', 'Junmai Ginjo Nama Genshu',
       'Junmai Ginjo Nigori', 'Junmai Ginjo Yamahai',
       'Junmai Ginjo\xa0Umeshu', 'Junmai Kimoto', 'Junmai Yamahai',
       'Junmai\xa0Kimoto', 'Junmai\xa0Kimoto\xa0Nama',
       'Junmai\xa0Yamahai', 'Kimoto\xa0Tokubetsu Junmai', 'Taru Futsu',
       'Tokubetsu Honjozo', 'Tokubetsu Junmai', 'Tokubetsu Junmai Kimoto',
       'Tokubetsu Junmai Nigori', 'Tokubetsu Junmai Yamahai',
       'Tokubetsu Junmai Yamahai Muroka Genshu'], dtype=object)

In [31]:
sake_cleaned = []
for sake in merged.sake_type_cleaned:
    if 'Yamahai' in sake:
        sake = sake.replace('Yamahai', '')
    elif sake == 'Ginjo Nama Genshu':
        sake = 'Ginjo Genshu'
    elif sake == 'Tokubetsu Junmai Kimoto':
        sake = 'Tokubetsu Junmai'
    elif sake == 'Kimoto\xa0Tokubetsu Junmai':
        sake = 'Tokubetsu Junmai'
    elif sake == 'Junmai Daiginjo\xa0Kimoto':
        sake = 'Junmai Daiginjo'
    elif sake == 'Junmai Daiginjo\xa0Yamahai':
        sake = 'Junmai Daiginjo'       
    elif sake == 'Junmai\xa0Kimoto\xa0Nama':
        sake = 'Junmai Nama'
    elif sake == 'Junmai\xa0Kimoto':
        sake = 'Junmai'
    elif sake == 'Taru Futsu':
        sake = 'Futsu-shu'
    elif 'Kimoto' in sake:
        sake = sake.replace('Kimoto', '').strip()
    elif 'Yamahai' in sake:
        sake = sake.replace('Yamahai', '').strip()
    elif 'Muroka Genshu' in sake:
        sake = sake.replace('Muroka Genshu', 'Genshu').strip()
    else:
        sake = sake.strip()
    sake_cleaned.append(sake)

In [32]:
merged['sake_type_cleaned'] = sake_cleaned
merged.sake_type_cleaned.value_counts()

Junmai Ginjo                       44
Junmai                             33
Junmai Daiginjo                    25
Tokubetsu Junmai                   19
Ginjo                               7
Daiginjo                            7
Junmai                              5
Junmai                              4
Honjozo                             3
Junmai Daiginjo Genshu              2
Junmai Daiginjo                     2
Futsu-shu                           2
Tokubetsu Junmai  Muroka Genshu     1
Junmai Daiginjo Nama Genshu         1
Junmai Genshu                       1
Tokubetsu Junmai                    1
Junmai Ginjo Nama Genshu            1
Junmai Ginjo Nigori                 1
Tokubetsu Junmai Nigori             1
Junmai Ginjo Umeshu                 1
Junmai Ginjo Nama                   1
Ginjo Genshu                        1
Tokubetsu Honjozo                   1
Junmai Nama                         1
Junmai Ginjo                        1
Name: sake_type_cleaned, dtype: int64

# Actually, I'm going to narrow down the categories more

In [33]:
sake_cleaned = []

for sake in merged.sake_type_cleaned:
    if sake == 'Junmai Daiginjo Genshu':
        sake = 'Junmai Daiginjo'
    elif 'Nama' in sake:
        sake = sake.replace('Nama', '').strip()
    elif 'Genshu' in sake:
        sake = sake.replace(' Genshu', '')
    elif 'Nigori' in sake:
        sake = sake.replace(' Nigori', '')
    else:
        sake = sake.strip()
    sake_cleaned.append(sake)

In [34]:
sake_cleaned2 = []
for sake in sake_cleaned:
    if sake == 'Tokubetsu Junmai  Muroka':
        sake = 'Tokubetsu Junmai'
    else:
        sake = sake.strip()
    sake_cleaned2.append(sake)

In [35]:
set(sake_cleaned2)

{'Daiginjo',
 'Futsu-shu',
 'Ginjo',
 'Honjozo',
 'Junmai',
 'Junmai Daiginjo',
 'Junmai Daiginjo  Genshu',
 'Junmai Ginjo',
 'Junmai Ginjo  Genshu',
 'Junmai Ginjo\xa0Umeshu',
 'Tokubetsu Honjozo',
 'Tokubetsu Junmai'}

In [36]:
merged['sake_type_cleaned'] = sake_cleaned2

## Add tags for nama, genshu, nigori

In [37]:
merged['nama'] = merged.sake_type.str.contains("Nama").astype(int)
merged['nigori'] = merged.sake_type.str.contains("Nigori").astype(int)
merged['genshu'] = merged.sake_type.str.contains("Genshu").astype(int)

## Get rid of the one umeshu

In [38]:
merged[merged.sake_type.str.contains('Umeshu')]

Unnamed: 0,cost,name,sake_type,slug,sake_type_cleaned,prefecture,cost_cleaned,vol,SMV,acidity,alcohol,amino_acid,polish,rice,yeast,nama,nigori,genshu
123,57.99/720 ml,Kakurei Plum,Junmai Ginjo Umeshu - Niigata,shop_all.php?prod_id=39,Junmai Ginjo Umeshu,Niigata,57.99,720.0,na,na,9-10%,na,55%,Miyama Nishiki,na,0,0,0


In [39]:
merged = merged[merged.sake_type.str.contains('Umeshu') == False]

# Change datatype to float for SMV, acidity, alcohol, amino_acid, polish

In [40]:
merged.dtypes

cost                  object
name                  object
sake_type             object
slug                  object
sake_type_cleaned     object
prefecture            object
cost_cleaned         float64
vol                  float64
SMV                   object
acidity               object
alcohol               object
amino_acid            object
polish                object
rice                  object
yeast                 object
nama                   int64
nigori                 int64
genshu                 int64
dtype: object

## Turn n/a and na into np.nan & get rid of row 147, which has an invalid SMV

In [41]:
merged = merged[merged.SMV != '#9']

In [42]:
# merged.SMV.value_counts(dropna=False)

In [43]:
merged[merged.SMV.str.contains(' - ')]

Unnamed: 0,cost,name,sake_type,slug,sake_type_cleaned,prefecture,cost_cleaned,vol,SMV,acidity,alcohol,amino_acid,polish,rice,yeast,nama,nigori,genshu
1,88.99/720 ml,Suirakuten,Daiginjo - Akita,shop_all.php?prod_id=151,Daiginjo,Akita,88.99,720.0,+3 - +5,1.1 - 1.3,15-16%,na,38%,Yamada Nishiki,#9,0,0,0
61,64.99/720 ml,Chokaisan,Junmai Daiginjo - Akita,shop_all.php?prod_id=153,Junmai Daiginjo,Akita,64.99,720.0,+1 - +3,1.3 - 1.5,15-16%,na,50%,Miyama Nishiki,Akita flower yeast,0,0,0
107,54.99/720 ml,"Tenryo ""Hidahomare""",Junmai Ginjo - Gifu,shop_all.php?prod_id=260,Junmai Ginjo,Gifu,54.99,720.0,+3 - +5,1.3-1.5,15-16%,1.1-1.3,50%,Hidahomare,"Hana Yeast ""Nadeshiko""",0,0,0


In [44]:
merged['SMV_cleaned'] = merged.SMV
merged['SMV_cleaned'] = merged.SMV_cleaned.replace({'+3 - +5': '4', '+1 - +3':'2', '+':''})
merged['SMV_cleaned'] = merged.SMV_cleaned.str.replace('+','')
merged['SMV_cleaned'] = merged.SMV_cleaned.replace(r'n/?a', np.nan, regex=True)
merged['SMV_cleaned'] = merged.SMV_cleaned.astype(float)

In [45]:
merged.dtypes

cost                  object
name                  object
sake_type             object
slug                  object
sake_type_cleaned     object
prefecture            object
cost_cleaned         float64
vol                  float64
SMV                   object
acidity               object
alcohol               object
amino_acid            object
polish                object
rice                  object
yeast                 object
nama                   int64
nigori                 int64
genshu                 int64
SMV_cleaned          float64
dtype: object

In [46]:
merged.SMV_cleaned.value_counts(dropna=False)

 3.0     35
 4.0     26
 2.0     21
 5.0     20
 1.0     14
 0.0      6
 3.5      5
 NaN      5
 6.0      4
 12.0     4
-2.0      3
-1.0      3
 1.5      2
 10.0     2
 15.0     2
 2.5      1
 9.5      1
-3.0      1
 5.5      1
 4.5      1
 7.0      1
 6.5      1
-4.0      1
-10.0     1
 0.5      1
 8.0      1
-7.0      1
Name: SMV_cleaned, dtype: int64

## Clean acidity

In [47]:
merged.acidity.value_counts(dropna=False)

1.4          26
1.5          24
1.6          17
1.3          16
1.2          15
1.8          13
1.7          12
n/a           9
1.9           9
na            8
1.0           3
2.1           2
2.6           2
2.2           2
1.4-1.7       1
1.1           1
1.3-1.5       1
1.1 - 1.3     1
1.35          1
1.3 - 1.5     1
Name: acidity, dtype: int64

In [48]:
merged['acidity_cleaned'] = merged.acidity.replace({'1.4-1.7': '1.55', 
                                                    '1.3 - 1.5':'1.4', 
                                                    '1.1 - 1.3':'1.2',
                                                    '1.3-1.5':'1.4'})

merged['acidity_cleaned'] = merged.acidity_cleaned.replace(r'n/?a', np.nan, regex=True)
merged['acidity_cleaned'] = merged.acidity_cleaned.astype(float)

## Clean alcohol

In [49]:
merged['alcohol_cleaned'] = merged.alcohol.str[:-1]

In [50]:
for idx, alc in enumerate(merged.alcohol_cleaned.str.split('-')):
    try:
        if len(alc) == 2:
            alc = (int(alc[0]) + int(alc[1])) / 2
            merged.alcohol_cleaned.iloc[idx] = alc
        else:
            alc = int(alc)
            merged.alcohol_cleaned.iloc[idx] = alc
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [51]:
merged['alcohol_cleaned'] = merged.alcohol_cleaned.astype(float)

In [52]:
merged.dtypes

cost                  object
name                  object
sake_type             object
slug                  object
sake_type_cleaned     object
prefecture            object
cost_cleaned         float64
vol                  float64
SMV                   object
acidity               object
alcohol               object
amino_acid            object
polish                object
rice                  object
yeast                 object
nama                   int64
nigori                 int64
genshu                 int64
SMV_cleaned          float64
acidity_cleaned      float64
alcohol_cleaned      float64
dtype: object

## Clean amino_acid

In [53]:
merged.amino_acid.value_counts()

na         86
n/a        21
           10
1.0         7
1.2         7
1.3         6
1.1         4
0.9         4
1.6         3
1           3
1.4         3
0.8         2
1.8         2
2.0         2
1.2-1.4     1
1.1-1.3     1
1.7         1
2           1
Name: amino_acid, dtype: int64

In [54]:
merged['amino_acid_cleaned'] = merged.amino_acid

In [55]:
merged['amino_acid_cleaned'] = merged.amino_acid_cleaned.replace({'1.1-1.3': '1.2', 
                                                    '1.2-1.4':'1.3', '':np.nan})

In [56]:
merged['amino_acid_cleaned'] = merged.amino_acid_cleaned.replace(r'n/?a', np.nan, regex=True)

In [57]:
merged.amino_acid_cleaned.value_counts()

1.2    8
1.0    7
1.3    7
1.1    4
0.9    4
1.4    3
1      3
1.6    3
1.8    2
0.8    2
2.0    2
1.7    1
2      1
Name: amino_acid_cleaned, dtype: int64

In [58]:
merged['amino_acid_cleaned'] = merged.amino_acid_cleaned.astype(float)

## Clean polish

In [59]:
merged.polish.value_counts()

50%                          40
60%                          31
55%                          30
65%                          16
40%                           9
70%                           6
45%                           5
58%                           4
35%                           3
48%                           3
57%                           2
38%                           2
75%                           2
49%                           1
33%                           1
66%                           1
Super flat rice polishing     1
90%                           1
na                            1
64%                           1
18%                           1
68%                           1
23%                           1
                              1
Name: polish, dtype: int64

In [60]:
merged['polish_cleaned'] = merged.polish.replace({'%': '',
                                                  'Super flat rice polishing':np.nan,
                                                  '':np.nan,
                                                 'na':np.nan})

In [61]:
merged['polish_cleaned'] = merged.polish_cleaned.str[:-1]

In [62]:
merged.polish_cleaned.value_counts(dropna=False)

50     40
60     31
55     30
65     16
40      9
70      6
45      5
58      4
48      3
35      3
NaN     3
75      2
38      2
57      2
64      1
49      1
68      1
33      1
18      1
23      1
66      1
90      1
Name: polish_cleaned, dtype: int64

In [63]:
merged['polish_cleaned'] = merged.polish_cleaned.astype(float)

In [64]:
merged = merged.drop(columns='cost')
merged

Unnamed: 0,name,sake_type,slug,sake_type_cleaned,prefecture,cost_cleaned,vol,SMV,acidity,alcohol,...,rice,yeast,nama,nigori,genshu,SMV_cleaned,acidity_cleaned,alcohol_cleaned,amino_acid_cleaned,polish_cleaned
0,Nanbu Bijin,Daiginjo - Iwate,shop_all.php?prod_id=195,Daiginjo,Iwate,70.99,720.0,+5,1.3,16-17%,...,Gin Otome,Iwate #2 & #1601,0,0,0,5.0,1.30,16.5,,40.0
1,Suirakuten,Daiginjo - Akita,shop_all.php?prod_id=151,Daiginjo,Akita,88.99,720.0,+3 - +5,1.1 - 1.3,15-16%,...,Yamada Nishiki,#9,0,0,0,4.0,1.20,15.5,,38.0
2,Ken,Daiginjo - Fukushima,shop_all.php?prod_id=128,Daiginjo,Fukushima,89.99,720.0,+3.5,1.2,15.9%,...,Yamada Nishiki,Suehiro Yeast,0,0,0,3.5,1.20,15.9,,40.0
3,"Tedorigawa ""Ikina Onna""",Daiginjo - Ishikawa,shop_all.php?prod_id=78,Daiginjo,Ishikawa,55.99,720.0,+6,1.2,16.2%,...,Yamada Nishiki,Brewer's Proprietary #9,0,0,0,6.0,1.20,16.2,,40.0
4,Kakurei,Daiginjo - Niigata,shop_all.php?prod_id=38,Daiginjo,Niigata,72.99,720.0,+5,1.5,15-16%,...,Yamada Nishiki,#14,0,0,0,5.0,1.50,15.5,0.9,48.0
5,"Tatsuriki ""Kome no Sasayaki"" Daiginjo",Daiginjo - Hyogo,shop_all.php?prod_id=272,Daiginjo,Hyogo,80.99,720.0,+3.5,,17%,...,Yamada Nishiki,#9,0,0,0,3.5,,17.0,,50.0
6,Kokuryu Ryu,Daiginjo - Fukui,shop_all.php?prod_id=130,Daiginjo,Fukui,107.99,720.0,+4,1.2,15-16%,...,Yamada Nishiki,Kokuryu Yeast,0,0,0,4.0,1.20,15.5,,40.0
7,Yuri Masamune,Futsu-shu - Akita,shop_all.php?prod_id=93,Futsu-shu,Akita,24.99,720.0,+2.5,1.3,15%,...,Hitomibore,Brewer's Proprietary,0,0,0,2.5,1.30,15.0,1.0,68.0
8,Sawahime,Ginjo - Tochigi,shop_all.php?prod_id=297,Ginjo,Tochigi,44.99,720.0,+4,1.3,16-17%,...,Hitogokochi,Prefecture Yeast,0,0,0,4.0,1.30,16.5,,50.0
9,"Dewazakura ""Izumi Judan"" Tenth Degree",Ginjo - Yamagata,shop_all.php?prod_id=99,Ginjo,Yamagata,40.99,720.0,+12,1.4,17.5%,...,Miyama Nishiki,"Yamagata, YK_0107",0,0,0,12.0,1.40,17.5,,50.0


In [65]:
merged.dtypes

name                   object
sake_type              object
slug                   object
sake_type_cleaned      object
prefecture             object
cost_cleaned          float64
vol                   float64
SMV                    object
acidity                object
alcohol                object
amino_acid             object
polish                 object
rice                   object
yeast                  object
nama                    int64
nigori                  int64
genshu                  int64
SMV_cleaned           float64
acidity_cleaned       float64
alcohol_cleaned       float64
amino_acid_cleaned    float64
polish_cleaned        float64
dtype: object

In [66]:
merged.to_csv('sake_info.csv', index=False)

# Make bins for rich/light, sweet/dry

In [67]:
df = pd.read_csv('sake_info.csv')

In [68]:
df['SMV_dry_sweet_bins'] = pd.cut(df.SMV_cleaned, bins = [-10, -5.9, -3.4, -1.4, 1.4, 3.4, 5.9, 15])

In [69]:
df['SMV_dry_sweet'] = pd.cut(df.SMV_cleaned, bins = [-10, -5.9, -3.4, -1.4, 1.4, 3.4, 5.9, 15],
      labels=['0very_sweet', '1sweet', '2slightly_sweet', '3neutral',
              '4slightly_dry', '5dry', '6very_dry'])

In [70]:
df['SMV_dry_sweet'].value_counts()

4slightly_dry      59
5dry               53
3neutral           24
6very_dry          16
2slightly_sweet     4
1sweet              1
0very_sweet         1
Name: SMV_dry_sweet, dtype: int64

In [71]:
df.acidity_cleaned.describe()

count    147.000000
mean       1.524490
std        0.269282
min        1.000000
25%        1.375000
50%        1.500000
75%        1.700000
max        2.600000
Name: acidity_cleaned, dtype: float64

In [72]:
df['acidity_richness_bins'] = pd.cut(df.acidity_cleaned, bins = [1, 1.2, 1.4, 1.6, 1.8, 2.6])

In [73]:
df['acidity_richness'] = pd.cut(df.acidity_cleaned, bins = [1, 1.2, 1.4, 1.6, 1.8, 2.6],
      labels = ['0very_light', '1light', '2neutral', '3rich', '4very_rich'])

In [77]:
df.acidity_richness.value_counts()

1light         45
2neutral       42
3rich          25
0very_light    17
4very_rich     15
Name: acidity_richness, dtype: int64

In [75]:
df.sake_type_cleaned.value_counts()

Junmai Ginjo               47
Junmai                     44
Junmai Daiginjo            29
Tokubetsu Junmai           21
Ginjo                       8
Daiginjo                    7
Honjozo                     3
Futsu-shu                   2
Junmai Ginjo  Genshu        1
Junmai Daiginjo  Genshu     1
Tokubetsu Honjozo           1
Name: sake_type_cleaned, dtype: int64

In [76]:
df.to_csv('sake_info.csv', index=False)

# Clean up rice types

In [2]:
df = pd.read_csv('sake_info.csv')

In [4]:
df['yamada_nishiki'] = df.rice.str.contains("Yamada Nishiki").astype(int)
df['gohyaku_mangoku'] = df.rice.str.contains("Gohyaku Mangoku").astype(int)

In [5]:
df.to_csv('sake_info.csv', index=False)