In [4]:
import pandas as pd
# Use the fully qualified option name. The bare 'max_rows' pattern can match
# more than one option in newer pandas releases (e.g. 'styler.render.max_rows'),
# which makes set_option raise OptionError instead of limiting the display.
pd.set_option('display.max_rows', 5)
import numpy as np
# Load the wine reviews; the first CSV column is used as the row index.
reviews = pd.read_csv('winemag-data-130k-v2.csv', index_col=0)

In [5]:
reviews    # display the DataFrame; the max_rows display option truncates the output

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36167,Italy,Here's a straightforward and fruity Nero d'Avo...,,88,18.0,Sicily & Sardinia,Sicilia,,Kerin O’Keefe,@kerinokeefe,Morgante 2012 Nero d'Avola (Sicilia),Nero d'Avola,Morgante
36168,Greece,This rounded white starts with aromas of lemon...,Dry White,88,15.0,Peloponnese,,,Susan Kostrzewa,@suskostrzewa,My Big Fat Greek Wine 2012 Dry White Moschofi,,


# Summary functions

In [8]:
reviews['points'].describe()    # numeric column: the summary statistics come back as float64

count    36169.000000
mean        88.397274
             ...     
75%         91.000000
max        100.000000
Name: points, Length: 8, dtype: float64

In [9]:
reviews['taster_name'].describe()    # text column (dtype object): count, unique, top, freq

count          28743
unique            19
top       Roger Voss
freq            7175
Name: taster_name, dtype: object

In [10]:
reviews['points'].mean()    # mean of the points column

88.39727390859576

In [11]:
reviews['taster_name'].unique()    # all distinct values in the taster_name column

array(['Kerin O’Keefe', 'Roger Voss', 'Paul Gregutt',
       'Alexander Peartree', 'Michael Schachner', 'Anna Lee C. Iijima',
       'Virginie Boone', 'Matt Kettmann', nan, 'Sean P. Sullivan',
       'Jim Gordon', 'Joe Czerwinski', 'Anne Krebiehl\xa0MW',
       'Lauren Buzzeo', 'Mike DeSimone', 'Jeff Jenssen',
       'Susan Kostrzewa', 'Carrie Dykes', 'Fiona Adams',
       'Christina Pickard'], dtype=object)

In [12]:
reviews['taster_name'].value_counts()    # number of occurrences of each distinct taster_name

Roger Voss           7175
Michael Schachner    4360
                     ... 
Fiona Adams            11
Christina Pickard       1
Name: taster_name, Length: 19, dtype: int64

# Maps

In [13]:
# Series.map() builds a new Series by passing every value through the given function.
review_points_mean = reviews['points'].mean()
# The lambda is called with a single value of the Series at a time.
reviews['points'].map(lambda score: score - review_points_mean)

0       -1.397274
1       -1.397274
           ...   
36167   -0.397274
36168   -0.397274
Name: points, Length: 36169, dtype: float64

In [16]:
# apply() is the equivalent method if we want to transform a whole DataFrame
# by calling a custom method on each row.
def remean_points(row):
    """Return a copy of ``row`` whose ``points`` value is centered on the mean.

    Meant to be passed to ``DataFrame.apply(..., axis='columns')``, which calls
    it once per row with that row as a Series.

    Parameters
    ----------
    row : pandas.Series
        One review; must contain a ``points`` entry.

    Returns
    -------
    pandas.Series
        A new Series equal to ``row`` except that ``points`` has the
        module-level ``review_points_mean`` subtracted from it.
    """
    # Work on a copy: pandas documents that functions which mutate the object
    # handed to apply() are not supported and can behave unexpectedly.
    centered = row.copy()
    centered['points'] = centered['points'] - review_points_mean
    return centered

# axis=1 is the numeric spelling of axis='columns': the function is called once per row.
reviews.apply(remean_points, axis=1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,-1.397274,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,-1.397274,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36167,Italy,Here's a straightforward and fruity Nero d'Avo...,,-0.397274,18.0,Sicily & Sardinia,Sicilia,,Kerin O’Keefe,@kerinokeefe,Morgante 2012 Nero d'Avola (Sicilia),Nero d'Avola,Morgante
36168,Greece,This rounded white starts with aromas of lemon...,Dry White,-0.397274,15.0,Peloponnese,,,Susan Kostrzewa,@suskostrzewa,My Big Fat Greek Wine 2012 Dry White Moschofi,,


In [18]:
# Neither map() nor apply() changes the original data: each returns a new,
# transformed Series or DataFrame, so the first row still shows the raw score.
reviews.head(n=1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia


In [19]:
# Subtracting a scalar from a Series is applied by pandas to every element,
# so no explicit map() call is needed.
review_points_mean = reviews['points'].mean()
reviews['points'] - review_points_mean

0       -1.397274
1       -1.397274
           ...   
36167   -0.397274
36168   -0.397274
Name: points, Length: 36169, dtype: float64

In [21]:
# Pandas also understands operators between two Series of equal length and
# applies them element by element. Here country and region_1 are joined into
# one string per row; a row where a value is missing yields NaN.
reviews['country'] + ' - ' + reviews['region_1']

0           Italy - Etna
1                    NaN
              ...       
36167    Italy - Sicilia
36168                NaN
Length: 36169, dtype: object