In [2]:
import pandas as pd
import numpy as np

In [4]:
# What are vectorized operations
a = np.array([1,2,3,4])
a * 4

array([ 4,  8, 12, 16])

In [5]:
# problem in vectorized opertions in vanilla python
s = ['cat','mat',None,'rat']
# This will throw error because NoneType has no attribute 'startswith' .
[i.startswith('c') for i in s]

AttributeError: 'NoneType' object has no attribute 'startswith'

In [6]:
# How pandas solves this issue?

s = pd.Series(['cat','mat',None,'rat'])
# string accessor is used to access string methods and convert None to NaN
# so that it will not throw error
s.str.startswith('c')

# fast and optimized since in pandas it is implemented in C

0     True
1    False
2     None
3    False
dtype: object

In [7]:
# import titanic
df = pd.read_csv('../DataSets/titanic.csv')
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

## Common String Methods
- `lower` : Convert strings in the Series/Index to lowercase.
- `upper` : Convert strings in the Series/Index to uppercase.
- `capitalize` : Convert the first character of each string to uppercase.
- `title` : Convert each string to title case(first letter of each word capitalized).
- `len` : Compute the length of each string in the Series/Index.
- `strip` : Remove leading and trailing whitespace from each string in the Series/Index.
- `split` : Split strings around given separator/delimiter and return a DataFrame/MultiIndex expanding out the splits.
- `get` : Extract element from each component at specified position .
- `replace` : Replace occurrences of pattern/regex/string with some other string.
- `contains` : Test if pattern or regex is contained within a string of a Series or Index.
- `startswith` : Test if the start of each string element matches a pattern.
- `endswith` : Test if the end of each string element matches a pattern.
- `isdigit` : Check whether all characters in each string in the Series/Index are digits.
- `isalpha` : Check whether all characters in each string in the Series/Index are alphabetic.
- `slicing` : Slice each string in the Series/Index.

In [16]:
df['Name'].str.lower().head()

0                              braund, mr. owen harris
1    cumings, mrs. john bradley (florence briggs th...
2                               heikkinen, miss. laina
3         futrelle, mrs. jacques heath (lily may peel)
4                             allen, mr. william henry
Name: Name, dtype: object

In [17]:
df['Name'].str.upper().head()

0                              BRAUND, MR. OWEN HARRIS
1    CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
2                               HEIKKINEN, MISS. LAINA
3         FUTRELLE, MRS. JACQUES HEATH (LILY MAY PEEL)
4                             ALLEN, MR. WILLIAM HENRY
Name: Name, dtype: object

In [18]:
df['Name'].str.capitalize().head()

0                              Braund, mr. owen harris
1    Cumings, mrs. john bradley (florence briggs th...
2                               Heikkinen, miss. laina
3         Futrelle, mrs. jacques heath (lily may peel)
4                             Allen, mr. william henry
Name: Name, dtype: object

In [19]:
df['Name'].str.title().head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [22]:
df['Name'].str.len().max()

np.int64(82)

In [36]:
x = df['Name'].str.len().max() == 82
df['Name'][x]

KeyError: np.True_

In [43]:
"                       ritesh               ".strip()

'ritesh'

In [8]:
df['Name'] = df['Name'].str.strip()

In [9]:
# split -> get
df['lastname'] = df['Name'].str.split(',').str.get(0)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Allen


In [20]:
df['Name'].str.split(',').str.get(1).str.split(" ")
# without strip() we will get the space before the title too .

0                                  [, Mr., Owen, Harris]
1      [, Mrs., John, Bradley, (Florence, Briggs, Tha...
2                                       [, Miss., Laina]
3            [, Mrs., Jacques, Heath, (Lily, May, Peel)]
4                                [, Mr., William, Henry]
                             ...                        
886                                     [, Rev., Juozas]
887                           [, Miss., Margaret, Edith]
888                [, Miss., Catherine, Helen, "Carrie"]
889                                [, Mr., Karl, Howell]
890                                     [, Mr., Patrick]
Name: Name, Length: 891, dtype: object

In [22]:
df[['title','firstname']] = df['Name'].str.split(',').str.get(1).str.strip().str.split(' ', n=1, expand=True)
# here n is number of splits (since some people may have multiple first names) .
# expand = true is used to return dataframe
df.head()
# strip is used to remove leading space since after comma there is a space .

df['title'].value_counts()

title
Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Col.           2
Mlle.          2
Major.         2
Ms.            1
Mme.           1
Don.           1
Lady.          1
Sir.           1
Capt.          1
the            1
Jonkheer.      1
Name: count, dtype: int64

In [23]:
# replace
df['title'] = df['title'].str.replace('Ms.','Miss.')
df['title'] = df['title'].str.replace('Mlle.','Miss.')

In [24]:
df['title'].value_counts()

title
Mr.          517
Miss.        185
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Major.         2
Col.           2
Don.           1
Lady.          1
Mme.           1
Sir.           1
Capt.          1
the            1
Jonkheer.      1
Name: count, dtype: int64

In [26]:
# filtering
# startswith/endswith
df[df['firstname'].str.startswith('A')]




Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname,title,firstname
13,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.2750,,S,Andersson,Mr.,Anders Johan
22,23,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q,McGowan,Miss.,"Anna ""Annie"""
35,36,0,1,"Holverson, Mr. Alexander Oskar",male,42.0,1,0,113789,52.0000,,S,Holverson,Mr.,Alexander Oskar
38,39,0,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,345764,18.0000,,S,Vander Planke,Miss.,Augusta Maria
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0000,B28,,Icard,Miss.,Amelie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
842,843,1,1,"Serepeca, Miss. Augusta",female,30.0,0,0,113798,31.0000,,C,Serepeca,Miss.,Augusta
845,846,0,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.5500,,S,Abbing,Mr.,Anthony
866,867,1,2,"Duran y More, Miss. Asuncion",female,27.0,1,0,SC/PARIS 2149,13.8583,,C,Duran y More,Miss.,Asuncion
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C,Najib,Miss.,"Adele Kiamie ""Jane"""


In [27]:
df[df['firstname'].str.endswith('A')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname,title,firstname
64,65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C,Stewart,Mr.,Albert A
303,304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q,Keane,Miss.,Nora A


In [29]:
# isdigit/isalpha...   : check if all characters are digit/alpha
df[df['firstname'].str.isdigit()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname,title,firstname


In [None]:
# applying regex
# contains
# search john -> both case
df[df['firstname'].str.contains('john',case=False)]
# find lastnames with start and end char vowel
df[df['lastname'].str.contains('^[^aeiouAEIOU].+[^aeiouAEIOU]$')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname,title,firstname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Braund,Mr.,Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs.,John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Heikkinen,Miss.,Laina
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Moran,Mr.,James
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,McCarthy,Mr.,Timothy J
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,Sutehall,Mr.,Henry Jr
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Graham,Miss.,Margaret Edith
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Johnston,Miss.,"Catherine Helen ""Carrie"""
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Behr,Mr.,Karl Howell


In [None]:
# slicing
df['Name'].str[::-1]

0                                sirraH newO .rM ,dnuarB
1      )reyahT sggirB ecnerolF( yeldarB nhoJ .srM ,sg...
2                                 aniaL .ssiM ,nenikkieH
3           )leeP yaM yliL( htaeH seuqcaJ .srM ,ellertuF
4                               yrneH mailliW .rM ,nellA
                             ...                        
886                                sazouJ .veR ,alivtnoM
887                         htidE teragraM .ssiM ,maharG
888             "eirraC" neleH enirehtaC .ssiM ,notsnhoJ
889                                llewoH lraK .rM ,rheB
890                                  kcirtaP .rM ,yelooD
Name: Name, Length: 891, dtype: object