# String methods in pandas

In [1]:
import pandas as pd 

In [2]:
# Load the tab-separated orders file into a DataFrame and preview the first rows.
orders = pd.read_csv('http://bit.ly/chiporders', sep='\t')
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [10]:
# Show the dtype pandas inferred for each column of `orders`.
orders.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

In [5]:
# Upper-case every value in the item_name column.
# The result is a new Series; nothing is assigned back to `orders` here.
orders['item_name'].str.upper()

0                CHIPS AND FRESH TOMATO SALSA
1                                        IZZE
2                            NANTUCKET NECTAR
3       CHIPS AND TOMATILLO-GREEN CHILI SALSA
4                                CHICKEN BOWL
                        ...                  
4617                            STEAK BURRITO
4618                            STEAK BURRITO
4619                       CHICKEN SALAD BOWL
4620                       CHICKEN SALAD BOWL
4621                       CHICKEN SALAD BOWL
Name: item_name, Length: 4622, dtype: object

In [7]:
# Remove every literal '[' and ']' character from choice_description.
# regex=False makes both patterns plain strings: a bare '[' is not a valid
# regular expression, and the default value of `regex` differs between
# pandas versions, so it is spelled out explicitly.
# Missing descriptions (NaN) pass through unchanged.
orders['choice_description'].str.replace('[', '', regex=False).str.replace(']', '', regex=False)

0                                                     NaN
1                                              Clementine
2                                                   Apple
3                                                     NaN
4       Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
                              ...                        
4617    Fresh Tomato Salsa, Rice, Black Beans, Sour Cr...
4618    Fresh Tomato Salsa, Rice, Sour Cream, Cheese, ...
4619    Fresh Tomato Salsa, Fajita Vegetables, Pinto B...
4620       Fresh Tomato Salsa, Fajita Vegetables, Lettuce
4621    Fresh Tomato Salsa, Fajita Vegetables, Pinto B...
Name: choice_description, Length: 4622, dtype: object

In [9]:
# Keep only the rows whose item_name contains the substring 'Salsa'.
has_salsa = orders['item_name'].str.contains('Salsa')
orders[has_salsa]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
15,8,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
25,13,1,Chips and Fresh Tomato Salsa,,$2.39
30,15,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
...,...,...,...,...,...
4425,1764,1,Chips and Fresh Tomato Salsa,,$2.95
4466,1779,1,Chips and Tomatillo Red Chili Salsa,,$2.95
4476,1784,1,Chips and Tomatillo Green Chili Salsa,,$2.95
4493,1787,1,Chips and Tomatillo Green Chili Salsa,,$2.95


In [17]:
# Remove the '$' sign from item_price and convert the column to float.
# - regex=False treats '$' as a literal character; as a regular expression
#   it is the end-of-string anchor, and the default of `regex` differs
#   between pandas versions.
# - astype(str) keeps the cell re-runnable: once the column is already float
#   the .str accessor would otherwise raise AttributeError on a second run.
orders['item_price'] = (
    orders['item_price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
)
# An assignment displays nothing, so show the converted column explicitly.
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98


In [3]:
# First step towards extracting the first word of each "item_name":
# str.split(' ') breaks every string on single spaces and returns
# one list of words per row.
orders['item_name'].str.split(' ')

0                [Chips, and, Fresh, Tomato, Salsa]
1                                            [Izze]
2                               [Nantucket, Nectar]
3       [Chips, and, Tomatillo-Green, Chili, Salsa]
4                                   [Chicken, Bowl]
                           ...                     
4617                               [Steak, Burrito]
4618                               [Steak, Burrito]
4619                         [Chicken, Salad, Bowl]
4620                         [Chicken, Salad, Bowl]
4621                         [Chicken, Salad, Bowl]
Name: item_name, Length: 4622, dtype: object

In [5]:
# Index into each split list by position; position 0 is the first word
# of every item_name.
orders['item_name'].str.split(' ').str[0]

0           Chips
1            Izze
2       Nantucket
3           Chips
4         Chicken
          ...    
4617        Steak
4618        Steak
4619      Chicken
4620      Chicken
4621      Chicken
Name: item_name, Length: 4622, dtype: object

# "groupby" in pandas

In [21]:
# Load the drinks-by-country CSV into a DataFrame and preview the first rows.
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [26]:
# Average beer_servings for each continent:
# group the rows by continent, select one column, then aggregate it.
drinks.groupby('continent')['beer_servings'].mean()

continent
Africa            61.471698
Asia              37.045455
Europe           193.777778
North America    145.434783
Oceania           89.687500
South America    175.083333
Name: beer_servings, dtype: float64

In [27]:
# Largest beer_servings value within each continent.
drinks.groupby('continent')['beer_servings'].max()

continent
Africa           376
Asia             247
Europe           361
North America    285
Oceania          306
South America    333
Name: beer_servings, dtype: int64

In [29]:
# Compute several aggregates of beer_servings per continent in one call:
# agg() takes a list of function names and returns one column per function
# (count, min, max and mean).
drinks.groupby('continent')['beer_servings'].agg(['count','min','max','mean'])

Unnamed: 0_level_0,count,min,max,mean
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,53,0,376,61.471698
Asia,44,0,247,37.045455
Europe,45,0,361,193.777778
North America,23,1,285,145.434783
Oceania,16,0,306,89.6875
South America,12,93,333,175.083333


In [28]:
# Mean of every numeric column for each continent.
# numeric_only=True is required because `drinks` also holds the text column
# 'country': newer pandas releases no longer drop non-numeric columns
# silently and raise a TypeError when asked to average strings.
drinks.groupby('continent').mean(numeric_only=True)

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,61.471698,16.339623,16.264151,3.007547
Asia,37.045455,60.840909,9.068182,2.170455
Europe,193.777778,132.555556,142.222222,8.617778
North America,145.434783,165.73913,24.521739,5.995652
Oceania,89.6875,58.4375,35.625,3.38125
South America,175.083333,114.75,62.416667,6.308333


# Using the map(), apply() and applymap() methods on a Series or DataFrame

In [6]:
# Load the passenger training data (CSV) into a DataFrame and preview the first rows.
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**map() is a Series method**: it maps each existing value of a Series to a new value, for example through a dictionary lookup.

In [7]:
# Encode Sex as a number in a new column: 'female' becomes 0 and 'male' becomes 1.
sex_codes = {'female': 0, 'male': 1}
train['Sex_num'] = train['Sex'].map(sex_codes)

In [9]:
# Compare the original Sex values with their numeric codes for index labels 0 through 4.
train.loc[0:4,['Sex','Sex_num']]

Unnamed: 0,Sex,Sex_num
0,male,1
1,female,0
2,female,0
3,female,0
4,male,1


**apply() is both a Series and a DataFrame method**:

**apply() on a Series:** it applies a function to each element of a pandas Series

In [10]:
# Store the number of characters of each "Name" in a new column,
# computed by applying the built-in len to every element.
train['name_len'] = train['Name'].apply(len)

In [11]:
# Show each Name next to its computed length for index labels 0 through 4.
train.loc[0:4, ['Name','name_len']]

Unnamed: 0,Name,name_len
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44
4,"Allen, Mr. William Henry",24


In [12]:
# Round each fare up to the next whole number with np.ceil.
# The result is still a float column (e.g. 7.25 -> 8.0), not an integer one.
import numpy as np
train['Fare_ceil'] = train['Fare'].apply(np.ceil)

In [14]:
# Show each Fare next to its rounded-up value for index labels 0 through 4.
train.loc[0:4,['Fare','Fare_ceil']]

Unnamed: 0,Fare,Fare_ceil
0,7.25,8.0
1,71.2833,72.0
2,7.925,8.0
3,53.1,54.0
4,8.05,9.0


In [18]:
# Split each Name on commas and keep the first piece, i.e. the text that
# precedes the first comma (for example 'Braund').
train['Name'].str.split(',').apply(lambda parts: parts[0])

0         Braund
1        Cumings
2      Heikkinen
3       Futrelle
4          Allen
         ...    
886     Montvila
887       Graham
888     Johnston
889         Behr
890       Dooley
Name: Name, Length: 891, dtype: object

**apply() on a DataFrame:** it applies a function along the axis you specify (axis=0 passes each column to the function, axis=1 passes each row)

In [19]:
# Read the drinks-by-country CSV again and preview it; this rebinds `drinks`
# to a freshly loaded DataFrame.
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [22]:
# Largest value in each of the columns from beer_servings through wine_servings:
# with axis=0 the built-in max is called once per column.
drinks.loc[:,'beer_servings':'wine_servings'].apply(max, axis = 0)

beer_servings      376
spirit_servings    438
wine_servings      370
dtype: int64

In [30]:
# Largest of the servings values in each row:
# with axis=1 the built-in max is called once per row; head() shows the first five results.
drinks.loc[:,'beer_servings':'wine_servings'].apply(max, axis = 1).head()

0      0
1    132
2     25
3    312
4    217
dtype: int64

In [31]:
# Name of the column that holds the largest value in each row.
# Series.idxmax returns the index *label* of the maximum, which for a row is
# the column name. np.argmax is not used here because on current pandas it
# returns the integer *position* (0, 1, 2) instead of the column name.
drinks.loc[:, 'beer_servings':'wine_servings'].apply(pd.Series.idxmax, axis=1).head()

0      beer_servings
1    spirit_servings
2      beer_servings
3      wine_servings
4      beer_servings
dtype: object

**The applymap() method applies a function to every element of the DataFrame.**

In [28]:
# Convert every element of the selected servings columns to float, one cell at a time.
# NOTE(review): recent pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map — confirm against the installed version before renaming.
drinks.loc[:,'beer_servings':'wine_servings'].applymap(float).head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0.0,0.0,0.0
1,89.0,132.0,54.0
2,25.0,0.0,14.0
3,245.0,138.0,312.0
4,217.0,57.0,45.0
