In [1]:
import pandas as pd
df = pd.read_csv('wine.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [21]:
df.price.map(lambda a: a - 10.0)
# can also map values to numbers. ex. map male to 1 and female to 0
# df['map genders'] = df.gender.map({'male': 1, 'female': 0})

0          NaN
1          5.0
2          4.0
3          3.0
4         55.0
          ... 
129966    18.0
129967    65.0
129968    20.0
129969    22.0
129970    11.0
Name: price, Length: 129970, dtype: float64

In [5]:
# create new colum with length of strings in description
df['length country'] = df.description.apply(len)
df.loc[:, ['country', 'length country']]


Unnamed: 0,country,length country
0,Italy,172
1,Portugal,227
2,US,186
3,US,199
4,US,249
...,...,...
129966,Germany,227
129967,US,233
129968,France,225
129969,France,216


In [13]:
# variety has 1 missing value so cannot get first word
df.isnull().sum()



Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
length country               0
dtype: int64

In [16]:
df.dropna(subset = ['variety'], inplace = True)

In [17]:
# variety now has 0 missing values
df.isnull().sum()

Unnamed: 0                   0
country                     63
description                  0
designation              37464
points                       0
price                     8996
province                    63
region_1                 21246
region_2                 79459
taster_name              26243
taster_twitter_handle    31212
title                        0
variety                      0
winery                       0
length country               0
dtype: int64

In [36]:
# get first word from variety column and sotre in first word column
df['first word'] = df.variety.str.split(" ").apply(lambda a : a[0]).head()
df[['variety', 'first word']]

Unnamed: 0,variety,first word
0,White Blend,White
1,Portuguese Red,Portuguese
2,Pinot Gris,Pinot
3,Riesling,Riesling
4,Pinot Noir,Pinot
...,...,...
129966,Riesling,
129967,Pinot Noir,
129968,Gewürztraminer,
129969,Pinot Gris,


In [41]:
# remove missing values from first character column
df.dropna(subset = ['first word'], inplace = True)


In [42]:
# can use your own apply function
def get_first_character(s, position):
    return s[position]

In [44]:
df['first char'] = df['first word'].apply(get_first_character, position = 0)
df.loc[:, ['first word', 'first char']]

Unnamed: 0,first word,first char
0,White,W
1,Portuguese,P
2,Pinot,P
3,Riesling,R
4,Pinot,P


In [25]:
df2 = df[['points', 'price']].iloc[1:13]
df2.head()

Unnamed: 0,points,price
1,87,15.0
2,87,14.0
3,87,13.0
4,87,65.0
5,87,15.0


In [48]:
# can also use apply as a dataframe method 
# choose the axis, axis 0 is default
# this get's the max values across each column
df2.apply(max)

points    87.0
price     65.0
min       65.0
max        NaN
dtype: float64

In [53]:
# this gets the max across each row
df2.apply(max, axis = 1)


1     87.0
2     87.0
3     87.0
4     87.0
5     87.0
6     87.0
7     87.0
8     87.0
9     87.0
10    87.0
11    87.0
12    87.0
dtype: float64

In [55]:
# apply map is a dataframe method that doesn't apply across an axis but rather across the entire map
# change every value in df to float
df2.applymap(float)

Unnamed: 0,points,price,min,max
1,87.0,15.0,15.0,
2,87.0,14.0,14.0,
3,87.0,13.0,13.0,
4,87.0,65.0,65.0,
5,87.0,15.0,15.0,
6,87.0,16.0,16.0,
7,87.0,24.0,24.0,
8,87.0,12.0,12.0,
9,87.0,27.0,27.0,
10,87.0,19.0,19.0,
