# String methods in pandas

In [1]:
import pandas as pd 

In [4]:
# Load the Chipotle orders dataset (a tab-separated file) into a DataFrame
# and preview its first five rows.
orders = pd.read_csv('http://bit.ly/chiporders', sep='\t')
orders.head(n=5)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [10]:
# Inspect the dtype of every column in `orders`. Per the recorded output,
# item_price is loaded as `object` (strings such as '$2.39'), not a number,
# which is why a later cell converts it to float.
orders.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

In [5]:
# Upper-case every value of the item_name column.
# This returns a new Series; `orders` itself is not modified.
orders.loc[:, 'item_name'].str.upper()

0                CHIPS AND FRESH TOMATO SALSA
1                                        IZZE
2                            NANTUCKET NECTAR
3       CHIPS AND TOMATILLO-GREEN CHILI SALSA
4                                CHICKEN BOWL
                        ...                  
4617                            STEAK BURRITO
4618                            STEAK BURRITO
4619                       CHICKEN SALAD BOWL
4620                       CHICKEN SALAD BOWL
4621                       CHICKEN SALAD BOWL
Name: item_name, Length: 4622, dtype: object

In [7]:
# Remove the '[' and ']' characters from the choice_description column.
# Both characters are regex metacharacters, and the default of `regex=` in
# Series.str.replace differs between pandas versions, so pass regex=False
# explicitly to guarantee a literal, warning-free replacement everywhere.
# Missing values (NaN) pass through unchanged. Returns a new Series; `orders`
# is not modified.
(
    orders['choice_description']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)

0                                                     NaN
1                                              Clementine
2                                                   Apple
3                                                     NaN
4       Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
                              ...                        
4617    Fresh Tomato Salsa, Rice, Black Beans, Sour Cr...
4618    Fresh Tomato Salsa, Rice, Sour Cream, Cheese, ...
4619    Fresh Tomato Salsa, Fajita Vegetables, Pinto B...
4620       Fresh Tomato Salsa, Fajita Vegetables, Lettuce
4621    Fresh Tomato Salsa, Fajita Vegetables, Pinto B...
Name: choice_description, Length: 4622, dtype: object

In [9]:
# Keep only the rows whose item_name contains the substring 'Salsa'.
# The callable is evaluated against `orders` and yields the boolean row mask.
orders.loc[lambda frame: frame['item_name'].str.contains('Salsa')]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
15,8,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
25,13,1,Chips and Fresh Tomato Salsa,,$2.39
30,15,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
...,...,...,...,...,...
4425,1764,1,Chips and Fresh Tomato Salsa,,$2.95
4466,1779,1,Chips and Tomatillo Red Chili Salsa,,$2.95
4476,1784,1,Chips and Tomatillo Green Chili Salsa,,$2.95
4493,1787,1,Chips and Tomatillo Green Chili Salsa,,$2.95


In [17]:
# Strip the '$' sign from item_price and convert the column to float.
#
# * regex=False: '$' is a regex anchor (end of string), and the default of
#   `regex=` in Series.str.replace differs between pandas versions, so request
#   a literal replacement explicitly.
# * astype(str) first: this cell overwrites its own input, so on a second run
#   item_price is already float64 and the `.str` accessor would raise
#   AttributeError. Casting to str first makes the cell safe to re-run
#   (floats round-trip through their string form; NaN stays NaN).
orders['item_price'] = (
    orders['item_price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98


order_id                int64
quantity                int64
item_name              object
choice_description     object
item_price            float64
dtype: object

# "groupby" in pandas

In [21]:
# Load the drinks-by-country dataset (comma-separated) into a DataFrame
# and preview its first five rows.
drinks = pd.read_csv(filepath_or_buffer='http://bit.ly/drinksbycountry')
drinks.head(n=5)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [26]:
# Average beer_servings per continent: group the beer_servings Series by the
# continent column and take the mean of each group.
drinks['beer_servings'].groupby(drinks['continent']).mean()

continent
Africa            61.471698
Asia              37.045455
Europe           193.777778
North America    145.434783
Oceania           89.687500
South America    175.083333
Name: beer_servings, dtype: float64

In [27]:
# Largest beer_servings value observed within each continent.
drinks.groupby('continent')['beer_servings'].agg('max')

continent
Africa           376
Asia             247
Europe           361
North America    285
Oceania          306
South America    333
Name: beer_servings, dtype: int64

In [29]:
# Per-continent summary of beer_servings: number of rows, minimum, maximum
# and mean, returned as one DataFrame with a column per statistic.
(
    drinks
    .groupby('continent')['beer_servings']
    .agg(['count', 'min', 'max', 'mean'])
)

Unnamed: 0_level_0,count,min,max,mean
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,53,0,376,61.471698
Asia,44,0,247,37.045455
Europe,45,0,361,193.777778
North America,23,1,285,145.434783
Oceania,16,0,306,89.6875
South America,12,93,333,175.083333


In [28]:
# Mean of every numeric column per continent.
# numeric_only=True is required on pandas >= 2.0, where the default no longer
# silently drops non-numeric columns: without it the string `country` column
# makes mean() raise TypeError. The result contains only the numeric columns.
drinks.groupby('continent').mean(numeric_only=True)

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,61.471698,16.339623,16.264151,3.007547
Asia,37.045455,60.840909,9.068182,2.170455
Europe,193.777778,132.555556,142.222222,8.617778
North America,145.434783,165.73913,24.521739,5.995652
Oceania,89.6875,58.4375,35.625,3.38125
South America,175.083333,114.75,62.416667,6.308333
