In [11]:
import pandas as pd
import numpy as np

In [12]:
# Demo frame used throughout the notebook.
#   A: strings with stray leading/trailing spaces plus one float (1.0)
#   B: underscore-delimited tokens with one missing value
#   C: values sharing the "pre_" prefix
#   D: values sharing the "_x" suffix
#   E: plain integers
df = pd.DataFrame(
    dict(
        A=[" x", "y1 ", " z 90 ", 1.0],
        B=["ab_ab_0", "c_d_a", np.nan, "g_h"],
        C=["pre_hello", "pre_world", "pre_love", "pre_python"],
        D=["1_x", "2_x", "3_x", "4_x"],
        E=[1, 10, 34, 5689],
    )
)
df

Unnamed: 0,A,B,C,D,E
0,x,ab_ab_0,pre_hello,1_x,1
1,y1,c_d_a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g_h,pre_python,4_x,5689


## 1) The new, experimental "string" dtype instead of "object"



In [13]:
# Print the per-column dtypes and non-null counts of the original frame;
# the text columns are stored with the generic "object" dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      object
 1   B       3 non-null      object
 2   C       4 non-null      object
 3   D       4 non-null      object
 4   E       4 non-null      int64 
dtypes: int64(1), object(4)
memory usage: 292.0+ bytes


In [17]:
# Inspect the Python type of a single cell (row label 1, column "A")
# inside the object-dtype column.
type(df.loc[1, "A"])

str

In [18]:
# Cast every column (including the integer column E) to the dedicated
# string dtype and keep the result as a separate frame, `df1`.
string_dtype = pd.StringDtype()
df1 = df.astype(string_dtype)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      string
 1   B       3 non-null      string
 2   C       4 non-null      string
 3   D       4 non-null      string
 4   E       4 non-null      string
dtypes: string(5)
memory usage: 292.0 bytes


In [22]:
# Display the frame after every column was cast to the string dtype.
df1

Unnamed: 0,A,B,C,D,E
0,x,ab_ab_0,pre_hello,1_x,1
1,y1,c_d_a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g_h,pre_python,4_x,5689


## 2) Capitalize, lowercase, uppercase



In [25]:
# Capitalize each string in column A. The column has object dtype and its
# last entry is the float 1.0, which comes back as NaN in the result.
df["A"].str.capitalize()

0         x
1       Y1 
2     z 90 
3       NaN
Name: A, dtype: object

In [28]:
# Upper-case every value in column C.
df["C"].str.upper()

0     PRE_HELLO
1     PRE_WORLD
2      PRE_LOVE
3    PRE_PYTHON
Name: C, dtype: object

In [30]:
# Upper-case every value in column D; only the letters change, digits and
# underscores are left as they are.
df["D"].str.upper()

0    1_X
1    2_X
2    3_X
3    4_X
Name: D, dtype: object

## 3) Concatenate



In [35]:
# Display the original frame again as a reference for the concatenation examples.
df

Unnamed: 0,A,B,C,D,E
0,x,ab_ab_0,pre_hello,1_x,1
1,y1,c_d_a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g_h,pre_python,4_x,5689


In [37]:
# Method 1: element-wise "+" on the two columns, with a single space
# inserted between the values.
df["C"] + " " + df["D"]

0     pre_hello 1_x
1     pre_world 2_x
2      pre_love 3_x
3    pre_python 4_x
dtype: object

In [38]:
# Method 2: the .str.cat accessor; without a separator the values are
# joined directly.
df["C"].str.cat(others=df["D"])

0     pre_hello1_x
1     pre_world2_x
2      pre_love3_x
3    pre_python4_x
Name: C, dtype: object

## 4) Split



In [39]:
# Display the original frame again as a reference for the split examples.
df

Unnamed: 0,A,B,C,D,E
0,x,ab_ab_0,pre_hello,1_x,1
1,y1,c_d_a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g_h,pre_python,4_x,5689


In [40]:
# Split each value of column B on "_"; every row becomes a list of tokens
# and the missing value stays NaN.
df["B"].str.split(pat="_")

0    [ab, ab, 0]
1      [c, d, a]
2            NaN
3         [g, h]
Name: B, dtype: object

In [46]:
# Same split, but expand=True spreads the tokens over separate columns
# (one column per token position) instead of returning lists.
df["B"].str.split(pat="_", expand=True)

Unnamed: 0,0,1,2
0,ab,ab,0
1,c,d,a
2,,,
3,g,h,


## 5) Replace parts of a string



In [47]:
# Display the string-typed frame before replacing characters in column B.
df1

Unnamed: 0,A,B,C,D,E
0,x,ab_ab_0,pre_hello,1_x,1
1,y1,c_d_a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g_h,pre_python,4_x,5689


In [49]:
# Replace every underscore in column B with a hyphen.
# - regex=False forces a literal match, so the result does not depend on the
#   default value of `regex`, which differs between pandas versions.
# - Bracket assignment always targets the column; attribute-style assignment
#   (df1.B = ...) is discouraged for setting columns.
df1["B"] = df1["B"].str.replace("_", "-", regex=False)

In [50]:
# Display the frame again: column B now uses "-" instead of "_".
df1

Unnamed: 0,A,B,C,D,E
0,x,ab-ab-0,pre_hello,1_x,1
1,y1,c-d-a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g-h,pre_python,4_x,5689


## 6) Strip leading and trailing whitespace



In [53]:
# Show column A as a plain list so the leading/trailing spaces are visible.
df1["A"].tolist()

[' x', 'y1 ', ' z 90 ', '1.0']

In [56]:
# Remove whitespace from both ends of every value; inner spaces are kept.
df1["A"].str.strip().tolist()

['x', 'y1', 'z 90', '1.0']

In [57]:
# Remove whitespace from the left end only.
df1["A"].str.lstrip().tolist()

['x', 'y1 ', 'z 90 ', '1.0']

In [58]:
# Remove whitespace from the right end only.
df1["A"].str.rstrip().tolist()

[' x', 'y1', ' z 90', '1.0']

## 7) Pad



In [59]:
# Display the string-typed frame as a reference for the padding examples.
df1

Unnamed: 0,A,B,C,D,E
0,x,ab-ab-0,pre_hello,1_x,1
1,y1,c-d-a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g-h,pre_python,4_x,5689


In [65]:
# Left-pad the string-typed column E with "0" up to a width of 4 characters;
# values that are already 4 characters long are returned unchanged.
df1["E"].str.pad(width=4, side="left", fillchar="0")

0    0001
1    0010
2    0034
3    5689
Name: E, dtype: string

In [66]:
# Left-pad column B with "-" up to a width of 10 characters; the missing
# value stays <NA>.
df1["B"].str.pad(width=10, side="left", fillchar="-")

0    ---ab-ab-0
1    -----c-d-a
2          <NA>
3    -------g-h
Name: B, dtype: string

## 8) Zero fill (zfill)



In [69]:
# Check the dtypes of the original frame: column E is still int64 here, so
# it has to be cast to a string dtype before the .str accessor can be used.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      object
 1   B       3 non-null      object
 2   C       4 non-null      object
 3   D       4 non-null      object
 4   E       4 non-null      int64 
dtypes: int64(1), object(4)
memory usage: 292.0+ bytes


In [70]:
# Cast the integer column E to the string dtype, then pad it on the left
# with zeros up to a width of 4 characters.
df["E"].astype("string").str.zfill(4)

0    0001
1    0010
2    0034
3    5689
Name: E, dtype: string

In [71]:
# Print the dtypes of the string-typed copy: every column, including E,
# already has the string dtype.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      string
 1   B       3 non-null      string
 2   C       4 non-null      string
 3   D       4 non-null      string
 4   E       4 non-null      string
dtypes: string(5)
memory usage: 292.0 bytes


## 9) Remove prefix and/or suffix



In [72]:
# Display the string-typed frame as a reference for the prefix/suffix examples.
df1

Unnamed: 0,A,B,C,D,E
0,x,ab-ab-0,pre_hello,1_x,1
1,y1,c-d-a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g-h,pre_python,4_x,5689


In [73]:
# Drop the leading "pre_" from every value in column C.
df1["C"].str.removeprefix(prefix="pre_")

0     hello
1     world
2      love
3    python
Name: C, dtype: string

In [74]:
# Drop the trailing "_x" from every value in column D.
df1["D"].str.removesuffix(suffix="_x")

0    1
1    2
2    3
3    4
Name: D, dtype: string

## 10) Slice each string (forward and backward)



In [77]:
# Display the string-typed frame as a reference for the slicing examples.
df1

Unnamed: 0,A,B,C,D,E
0,x,ab-ab-0,pre_hello,1_x,1
1,y1,c-d-a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g-h,pre_python,4_x,5689


In [80]:
# Forward slice: keep the characters at positions 2 up to (not including) 7
# of every value in column C.
df1["C"].str.slice(start=2, stop=7)

0    e_hel
1    e_wor
2    e_lov
3    e_pyt
Name: C, dtype: string

In [85]:
# Backward slice: keep the last five characters of every value in column C.
df1["C"].str.slice(start=-5)

0    hello
1    world
2    _love
3    ython
Name: C, dtype: string

## 11) len() vs count("pattern")



In [86]:
# Display the original frame as a reference for the len/count examples.
df

Unnamed: 0,A,B,C,D,E
0,x,ab_ab_0,pre_hello,1_x,1
1,y1,c_d_a,pre_world,2_x,10
2,z 90,,pre_love,3_x,34
3,1.0,g_h,pre_python,4_x,5689


In [87]:
# Number of characters in each value of column B. The missing value stays
# NaN, so the result is reported as float64.
df["B"].str.len()

0    7.0
1    5.0
2    NaN
3    3.0
Name: B, dtype: float64

In [88]:
# Number of occurrences of the pattern "a" in each value of column B; the
# missing value stays NaN.
df["B"].str.count(pat="a")

0    2.0
1    1.0
2    NaN
3    0.0
Name: B, dtype: float64