## Pandas: Combining Datasets

In [3]:
#import and auxiliary functions
import pandas as pd
import numpy as np

def make_df(cols, ind):
    """Build a small labelled DataFrame for the examples in this notebook.

    Every cell holds the column label followed by the row label, e.g. 'A0'.

    Parameters
    ----------
    cols : iterable
        Column labels (a string is iterated character by character).
    ind : iterable
        Row labels; also used as the index of the result.

    Returns
    -------
    pandas.DataFrame
    """
    columns = {}
    for label in cols:
        columns[label] = [str(label) + str(row) for row in ind]
    return pd.DataFrame(columns, index=ind)

# example DataFrame
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


#### Concatenation using concat

In [14]:
# two stacked 2x2 blocks, joined along the last (third) axis
x = [
    [[1, 2], [3, 4]],
    [[1, 2], [3, 4]],
]
np.concatenate((x, x), axis=-1)

array([[[1, 2, 1, 2],
        [3, 4, 3, 4]],

       [[1, 2, 1, 2],
        [3, 4, 3, 4]]])

In [3]:
#pd.concat
#pd.concat(objs, axis=0, join='outer', ignore_index=False,
#          keys=None, levels=None, names=None,
#          verify_integrity=False, sort=False, copy=True)
#(the join_axes argument was removed in pandas 1.0; reindex the result instead)

In [4]:
# Show the docstring of pd.concat. help() is plain Python, so the cell also
# runs outside IPython (the `obj?` form is IPython-only syntax).
help(pd.concat)

In [5]:
# Series: concat stacks the values and keeps each input's index labels
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])
pd.concat((ser1, ser2))

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [16]:
# DataFrames: stack the rows (default) or place the frames side by side
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1, '\n')
print(df2, '\n')
print(pd.concat([df1, df2], axis='index'), '\n')
# along the columns: rows are aligned on their index labels
print(pd.concat([df1, df2], axis='columns'))

    A   B
1  A1  B1
2  A2  B2 

    A   B
3  A3  B3
4  A4  B4 

    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4 

     A    B    A    B
1   A1   B1  NaN  NaN
2   A2   B2  NaN  NaN
3  NaN  NaN   A3   B3
4  NaN  NaN   A4   B4


In [7]:
# inner join along the columns keeps only index labels present in both frames;
# df1 and df2 share none, so the result is empty
print(pd.concat([df1, df2], axis='columns', join='inner'))

Empty DataFrame
Columns: [A, B, A, B]
Index: []


In [18]:
# Duplicate indices: concat keeps the index labels as they are, so
# concatenating df1 with itself repeats every label.
print(pd.concat([df1, df1]), '\n')

# Left commented out on purpose: per the pandas docs, verify_integrity=True
# makes concat raise ValueError when the result would contain duplicate labels.
# print(pd.concat([df1, df1], verify_integrity=True))

    A   B
1  A1  B1
2  A2  B2
1  A1  B1
2  A2  B2 



In [19]:
# ignore_index=True discards the original labels and renumbers the rows from 0
print(pd.concat([df2, df1]), '\n')
print(pd.concat([df2, df1], ignore_index=True))

    A   B
3  A3  B3
4  A4  B4
1  A1  B1
2  A2  B2 

    A   B
0  A3  B3
1  A4  B4
2  A1  B1
3  A2  B2


In [10]:
# add keys indicating sources
df3 = df1  # plain assignment: df3 is another name for the same object, not a copy
df4 = pd.concat([df1, df3], keys=['df1','df3'])  # each key labels the rows of one input
print(df4)

        A   B
df1 1  A1  B1
    2  A2  B2
df3 1  A1  B1
    2  A2  B2


In [11]:
# the keys and the original row labels together form a MultiIndex
df4.index

MultiIndex([('df1', 1),
            ('df1', 2),
            ('df3', 1),
            ('df3', 2)],
           )

In [12]:
# concat over frames with different columns: union or intersection of the columns
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5, '\n')
print(df6, '\n')

# union (the default, join='outer'): missing entries are filled with NaN
print(pd.concat([df5, df6], join='outer'), '\n')

# intersection: only the columns common to both inputs survive
print(pd.concat([df5, df6], join='inner'))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2 

    B   C   D
3  B3  C3  D3
4  B4  C4  D4 

     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4 

    B   C
1  B1  C1
2  B2  C2
3  B3  C3
4  B4  C4


#### Concatenation using append (deprecated in pandas 1.4, removed in 2.0; prefer `pd.concat`)

In [13]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat produces the same row-wise result on every pandas version.
print(pd.concat([df2, df1]))

    A   B
3  A3  B3
4  A4  B4
1  A1  B1
2  A2  B2


#### Merge

In [14]:
# one-to-one join: the frames share the single key column 'employee'
df1 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
})
df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'hire_date': [2004, 2008, 2012, 2014],
})
print(df1, '\n')
print(df2, '\n')
# merge finds the common column by itself and uses it as the key
df3 = df1.merge(df2)
print(df3, '\n')

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR 

  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014 

  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014 



In [15]:
# many-to-one join: several employees map to the same 'group' row
df4 = pd.DataFrame({
    'group': ['Accounting', 'Engineering', 'HR'],
    'supervisor': ['Carly', 'Guido', 'Steve'],
})
print(df3, '\n')
print(df4, '\n')
# the result gains a 'supervisor' column, with values repeated
# wherever several employees belong to the same group
print(df3.merge(df4))

  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014 

         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve 

  employee        group  hire_date supervisor
0      Bob   Accounting       2008      Carly
1     Jake  Engineering       2012      Guido
2     Lisa  Engineering       2004      Guido
3      Sue           HR       2014      Steve


In [16]:
# many-to-many join: the key column has duplicates on both sides
df5 = pd.DataFrame({
    'group': ['Accounting', 'Accounting',
              'Engineering', 'Engineering',
              'HR', 'HR'],
    'skills': ['math', 'spreadsheets',
               'coding', 'linux',
               'spreadsheets', 'organization'],
})
print(df1, '\n')
print(df5, '\n')
# each group corresponds to two skills, hence two rows per employee
print(df1.merge(df5))
#group correspond to two skills, thus two rows per employee

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR 

         group        skills
0   Accounting          math
1   Accounting  spreadsheets
2  Engineering        coding
3  Engineering         linux
4           HR  spreadsheets
5           HR  organization 

  employee        group        skills
0      Bob   Accounting          math
1      Bob   Accounting  spreadsheets
2     Jake  Engineering        coding
3     Jake  Engineering         linux
4     Lisa  Engineering        coding
5     Lisa  Engineering         linux
6      Sue           HR  spreadsheets
7      Sue           HR  organization


In [17]:
# name the merge key explicitly with on=
print(df1, '\n')
print(df2, '\n')
print(df1.merge(df2, on='employee'))

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR 

  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014 

  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014


In [18]:
# the key column has a different name in each frame: use left_on / right_on
df3 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'salary': [70000, 80000, 120000, 90000],
})
print(df1, '\n')
print(df3, '\n')
print(df1.merge(df3, left_on='employee', right_on='name'))

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR 

   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000 

  employee        group  name  salary
0      Bob   Accounting   Bob   70000
1     Jake  Engineering  Jake   80000
2     Lisa  Engineering  Lisa  120000
3      Sue           HR   Sue   90000


In [19]:
# both key columns are kept by the merge; drop the redundant 'name' one
df1.merge(df3, left_on='employee', right_on='name').drop(columns='name')

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


In [20]:
# index merge: first make 'employee' the row index of both frames
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(df1a, '\n')
print(df2a, '\n')

                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR 

          hire_date
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014 



In [21]:
# then merge on the indices of both sides
print(df1a.merge(df2a, left_index=True, right_index=True))

                group  hire_date
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014


In [22]:
# join(): merges on the indices by default
print(df1a.join(df2a))

                group  hire_date
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014


In [23]:
# mix of index and column: left frame keyed by its index, right frame by 'name'
print(df1a.merge(df3, left_index=True, right_on='name'))

         group  name  salary
0   Accounting   Bob   70000
1  Engineering  Jake   80000
2  Engineering  Lisa  120000
3           HR   Sue   90000


#### Row-wise consideration

In [24]:
# example: two frames whose 'name' columns only partly overlap
df6 = pd.DataFrame(
    {'name': ['Peter', 'Paul', 'Mary'],
     'food': ['fish', 'beans', 'bread']},
    columns=['name', 'food'],
)
df7 = pd.DataFrame(
    {'name': ['Mary', 'Joseph'],
     'drink': ['wine', 'beer']},
    columns=['name', 'drink'],
)
print(df6, '\n')
print(df7, '\n')

    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread 

     name drink
0    Mary  wine
1  Joseph  beer 



In [25]:
# default merge keeps only the names present in both frames ...
print(df6.merge(df7))
# ... which is the same as asking for an inner join explicitly
print(df6.merge(df7, how='inner'))

   name   food drink
0  Mary  bread  wine
   name   food drink
0  Mary  bread  wine


In [26]:
# how='outer': union of the keys, missing entries become NaN
print(df6.merge(df7, how='outer'), '\n')

# how='left': keep every row of the left frame
print(df6.merge(df7, how='left'))

     name   food drink
0   Peter   fish   NaN
1    Paul  beans   NaN
2    Mary  bread  wine
3  Joseph    NaN  beer 

    name   food drink
0  Peter   fish   NaN
1   Paul  beans   NaN
2   Mary  bread  wine


#### Overlapping column names

In [27]:
# two frames that both carry a non-key column called 'rank'
df8 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'rank': [1, 2, 3, 4],
})
df9 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'rank': [3, 1, 4, 2],
})
print(df8, '\n')
print(df9, '\n')

   name  rank
0   Bob     1
1  Jake     2
2  Lisa     3
3   Sue     4 

   name  rank
0   Bob     3
1  Jake     1
2  Lisa     4
3   Sue     2 



In [28]:
# clashing column names get the default _x / _y suffixes ...
print(df8.merge(df9, on='name'), '\n')
# ... or custom ones via suffixes=
print(df8.merge(df9, on='name', suffixes=('_L', '_R')))

   name  rank_x  rank_y
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2 

   name  rank_L  rank_R
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2


### GroupBy: Conditional Aggregation

In [29]:
import pandas as pd
import numpy as np

# fixed seed so that the 'random' column is reproducible
np.random.seed(1234)
df = pd.DataFrame(
    {
        'key': list('ABCABC'),
        'data': range(6),
        'random': np.random.random(6),
    },
    columns=['key', 'data', 'random'],
)
df

Unnamed: 0,key,data,random
0,A,0,0.191519
1,B,1,0.622109
2,C,2,0.437728
3,A,3,0.785359
4,B,4,0.779976
5,C,5,0.272593


In [30]:
# groupby() on its own returns a DataFrameGroupBy object: the rows are grouped
# by the 'key' column, and only the object's repr is shown
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000194814B73A0>

In [31]:
# per-group sum of the remaining columns, then the per-group minimum of 'random' only
print(df.groupby('key').sum(),'\n')
print(df.groupby('key')['random'].min())

     data    random
key                
A       3  0.976878
B       5  1.402085
C       7  0.710320 

key
A    0.191519
B    0.622109
C    0.272593
Name: random, dtype: float64


In [32]:
# per-group minimum of each column
print(df.groupby('key').min())

     data    random
key                
A       0  0.191519
B       1  0.622109
C       2  0.272593


In [33]:
# iterating a GroupBy yields (group label, sub-DataFrame) pairs
for label, frame in df.groupby('key'):
    print((label, frame), '\n')
for label, frame in df.groupby('key'):
    print(f"{label} shape={frame.shape}")

('A',   key  data    random
0   A     0  0.191519
3   A     3  0.785359) 

('B',   key  data    random
1   B     1  0.622109
4   B     4  0.779976) 

('C',   key  data    random
2   C     2  0.437728
5   C     5  0.272593) 

A shape=(2, 3)
B shape=(2, 3)
C shape=(2, 3)


In [34]:
# describe(): summary statistics of 'random', one row per group
df.groupby('key')['random'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2.0,0.488439,0.419908,0.191519,0.339979,0.488439,0.636899,0.785359
B,2.0,0.701042,0.111629,0.622109,0.661576,0.701042,0.740509,0.779976
C,2.0,0.35516,0.116768,0.272593,0.313876,0.35516,0.396444,0.437728


#### Aggregate, filter, transform, apply

In [35]:
# aggregate() takes a string, a function, or a list of them.
# String names are used here: passing np.median or the builtin max is
# deprecated in recent pandas (FutureWarning since 2.1), while the names
# select the same aggregations and give the same column labels.
df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data,data,data,random,random,random
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,0.191519,0.488439,0.785359
B,1,2.5,4,0.622109,0.701042,0.779976
C,2,3.5,5,0.272593,0.35516,0.437728


In [36]:
# a dict maps each column name to the aggregation applied to it
df.groupby('key').agg(dict(data='min', random='max'))

Unnamed: 0_level_0,data,random
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,0.785359
B,1,0.779976
C,2,0.437728


In [37]:
#filtering
def filter_func(x):
    """Return whether the smallest 'random' value in group ``x`` is below 0.3."""
    smallest = x['random'].min()
    return smallest < 0.3
# show the data and the per-group minima that the filter criterion looks at
print(df, '\n')
print(df.groupby('key').min(), '\n')

  key  data    random
0   A     0  0.191519
1   B     1  0.622109
2   C     2  0.437728
3   A     3  0.785359
4   B     4  0.779976
5   C     5  0.272593 

     data    random
key                
A       0  0.191519
B       1  0.622109
C       2  0.272593 



In [38]:
# keep only the groups for which filter_func returns True
# (i.e. groups whose minimum 'random' value is below 0.3)
print(df.groupby('key').filter(filter_func))

  key  data    random
0   A     0  0.191519
2   C     2  0.437728
3   A     3  0.785359
5   C     5  0.272593


In [39]:
# transformation: the result has the same shape as the input
# example: center the data by subtracting the group-wise mean
df.groupby('key').transform(lambda grp: grp.sub(grp.mean()))

Unnamed: 0,data,random
0,-1.5,-0.29692
1,-1.5,-0.078934
2,-1.5,0.082568
3,1.5,0.29692
4,1.5,0.078934
5,1.5,-0.082568


In [40]:
# apply a function to the group results
def norm_by_data2(x):
    """Normalize the 'random' column of one group by the group's 'data' total.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; must contain 'data' and 'random' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``x`` except that 'random' is divided by
        ``x['data'].sum()``. The input frame is left unchanged.
    """
    # Build a new frame instead of assigning into x: pandas does not support
    # functions that mutate the object handed to groupby().apply, and in-place
    # edits could leak into the caller's data.
    return x.assign(random=x['random'] / x['data'].sum())

# show the input, then the result of applying the function group by group
print(df, '\n')
print(df.groupby('key').apply(norm_by_data2))

  key  data    random
0   A     0  0.191519
1   B     1  0.622109
2   C     2  0.437728
3   A     3  0.785359
4   B     4  0.779976
5   C     5  0.272593 

  key  data    random
0   A     0  0.063840
1   B     1  0.124422
2   C     2  0.062533
3   A     3  0.261786
4   B     4  0.155995
5   C     5  0.038942


In [41]:
# group data by a specified list: L[i] is the group label of row i
L = [2, 0, 0, 0, 1, 1]
print(df, '\n')
# Sum only the numeric columns. On pandas >= 2.0 a bare .sum() would also
# "sum" the string column 'key' by concatenating its labels.
print(df.groupby(L)[['data', 'random']].sum())

  key  data    random
0   A     0  0.191519
1   B     1  0.622109
2   C     2  0.437728
3   A     3  0.785359
4   B     4  0.779976
5   C     5  0.272593 

   data    random
0     6  1.845195
1     9  1.052568
2     0  0.191519


In [42]:
# group data by mapping: index labels are translated through the dict
df2 = df.set_index('key')
mapping = dict(A='vowel', B='consonant', C='consonant')
print(df2, '\n')
print(df2.groupby(mapping).sum())

     data    random
key                
A       0  0.191519
B       1  0.622109
C       2  0.437728
A       3  0.785359
B       4  0.779976
C       5  0.272593 

           data    random
key                      
consonant    12  2.112405
vowel         3  0.976878


In [43]:
# group data by function: str.lower is called on every index label
print(df2, '\n')
print(df2.groupby(str.lower).mean())

     data    random
key                
A       0  0.191519
B       1  0.622109
C       2  0.437728
A       3  0.785359
B       4  0.779976
C       5  0.272593 

     data    random
key                
a     1.5  0.488439
b     2.5  0.701042
c     3.5  0.355160


In [44]:
# group by several keys at once (the lower-cased index label and the mapping);
# the result is indexed by a two-level MultiIndex
df20 = df2.groupby([str.lower, mapping]).mean()
df20

Unnamed: 0_level_0,Unnamed: 1_level_0,data,random
key,key,Unnamed: 2_level_1,Unnamed: 3_level_1
a,vowel,1.5,0.488439
b,consonant,2.5,0.701042
c,consonant,3.5,0.35516


In [45]:
# inspect the resulting MultiIndex (one level per grouping key)
df20.index

MultiIndex([('a',     'vowel'),
            ('b', 'consonant'),
            ('c', 'consonant')],
           names=['key', 'key'])

### Pivot Table

In [46]:
# example dataset
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): load_dataset presumably fetches/caches the data on first use —
# confirm that network access is available where this notebook runs
titanic = sns.load_dataset('titanic')
titanic.head(6)  # preview the first six rows

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True


In [47]:
# group by gender and class, select 'survived', and take the mean;
# the result is a Series with a hierarchical (sex, class) index
titanic.groupby(['sex', 'class'])['survived'].mean()

sex     class 
female  First     0.968085
        Second    0.921053
        Third     0.500000
male    First     0.368852
        Second    0.157407
        Third     0.135447
Name: survived, dtype: float64

In [48]:
# without an aggregation the column selection is just a SeriesGroupBy object
titanic.groupby(['sex', 'class'])['survived']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000019499ECCC40>

In [49]:
# unstack() moves the inner index level ('class') into the columns
titanic.groupby(['sex', 'class'])['survived'].mean().unstack()

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [50]:
# pivot table alternative: same table as the groupby/unstack chain, in one call
titanic.pivot_table(values='survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [51]:
# multilevel pivot tables
# a third dimension, as an example: age binned into (0, 18] and (18, 80]
age = pd.cut(titanic['age'], bins=[0, 18, 80])
titanic.pivot_table(values='survived', index=['sex', age], columns='class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


In [52]:
# multilevel columns as well: fare split into two quantile bins
fare = pd.qcut(titanic['fare'], q=2)
titanic.pivot_table(values='survived', index=['sex', age], columns=[fare, 'class'])

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",,1.0,0.714286,0.909091,1.0,0.318182
female,"(18, 80]",,0.88,0.444444,0.972973,0.914286,0.391304
male,"(0, 18]",,0.0,0.26087,0.8,0.818182,0.178571
male,"(18, 80]",0.0,0.098039,0.125,0.391304,0.030303,0.192308


In [53]:
# Show the docstring of DataFrame.pivot_table. help() is plain Python, so the
# cell also runs outside IPython (the `obj?` form is IPython-only syntax).
help(pd.DataFrame.pivot_table)

In [54]:
# check on the quantiles: the 0.5 quantile of 'fare' should match the
# boundary between the two qcut bins used above
titanic['fare'].quantile(0.5)

14.4542

In [55]:
# aggfunc: controls what type of aggregation is applied; a dict maps each
# value column to its own aggregation.
# The string 'sum' replaces the builtin sum, which recent pandas deprecates
# as an aggfunc (FutureWarning since 2.1); the result is the same.
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived': 'sum', 'fare': 'mean'})

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


In [56]:
# margins=True appends an 'All' row and column with the overall values
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    margins=True,
)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


### String Operations

In [57]:
# plain Python: capitalize every string, one element at a time
data = ['peter', 'Paul', 'MARY', 'gUIDO']
list(map(str.capitalize, data))

['Peter', 'Paul', 'Mary', 'Guido']

In [58]:
# The same comprehension breaks once the data contains a missing value:
# None has no .capitalize() method, so the commented line would raise AttributeError.
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
# [s.capitalize() for s in data]

In [59]:
# pandas is convenient: the .str accessor applies the string method to each
# element and passes the missing value through
import pandas as pd
names = pd.Series(data)
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [60]:
# other case conversions through .str; the missing value stays None
print(names,'\n')
print(names.str.upper(),'\n')
print(names.str.swapcase())

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object 

0    PETER
1     PAUL
2     None
3     MARY
4    GUIDO
dtype: object 

0    PETER
1     pAUL
2     None
3     mary
4    Guido
dtype: object


#### String methods available: 
len() lower() translate() islower() <br>
ljust() upper() startswith() isupper() <br>
rjust() find() endswith() isnumeric() <br>
center() rfind() isalnum() isdecimal() <br>
zfill() index() isalpha() split() <br>
strip() rindex() isdigit() rsplit() <br>
rstrip() capitalize() isspace() partition() <br>
lstrip() swapcase() istitle() rpartition() <br>

In [61]:
monte = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])
# number of characters in each name
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [62]:
# boolean result: does each name start with 'T'?
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [63]:
#pd.Series.str.split?
# split each name into a list of its words
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

#### Miscellaneous methods 
get() Index each element <br>
slice() Slice each element<br>
slice_replace() Replace slice in each element with passed value<br>
cat() Concatenate strings<br>
repeat() Repeat values<br>
normalize() Return Unicode form of string<br>
pad() Add whitespace to left, right, or both sides of strings<br>
wrap() Split long strings into lines with length less than a given width<br>
join() Join strings in each element of the Series with passed separator<br>
get_dummies() Extract dummy variables as a DataFrame

In [64]:
# vectorized element access: the first three characters of every name
monte.str.slice(0, 3)

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [65]:
# last element of each entry: split into words, then index the final one
monte.str.split().str[-1]

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [66]:
# pair every name with a '|'-separated string of indicator codes
full_monte = pd.DataFrame({
    'name': monte,
    'info': [
        'B|C|D',
        'B|D',
        'A|C',
        'B|D',
        'B|C',
        'B|C|D',
    ],
})
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [67]:
# quickly split these indicator codes out into one 0/1 column per code
full_monte['info'].str.get_dummies(sep='|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1
