# Cheat Sheet: Working with text data in Python

## Example data

In [26]:
import pandas as pd

# Card suits in mixed capitalisation; reused by the length, slicing, padding,
# and case-conversion examples below.
suits = pd.Series(["clubs", "Diamonds", "hearts", "Spades"])
# Values with stray leading/trailing spaces; used by the .str.strip() example.
dirty = pd.Series(["rock ", "  paper", "scissors"])

## String lengths and substrings

#### Get the number of characters (code points) in a string with .str.len() 

In [27]:
suits.str.len()

0    5
1    8
2    6
3    6
dtype: int64

#### Take a substring by position with .str[]

In [28]:
suits.str[2:5]

0    ubs
1    amo
2    art
3    ade
dtype: object

#### Take a substring by negative position with .str[]

In [29]:
suits.str[:-3]

0       cl
1    Diamo
2      hea
3      Spa
dtype: object

#### Trim whitespace from the start and end of a string with .str.strip()

In [30]:
dirty.str.strip()

0        rock
1       paper
2    scissors
dtype: object

#### Pad strings to a given length with .str.pad()

In [31]:
suits.str.pad(10, fillchar="_", side="right")

0    clubs_____
1    Diamonds__
2    hearts____
3    Spades____
dtype: object

## Changing case

#### Converting to lower case with .str.lower()

In [32]:
suits.str.lower()

0       clubs
1    diamonds
2      hearts
3      spades
dtype: object

#### Converting to upper case with .str.upper()

In [33]:
suits.str.upper()

0       CLUBS
1    DIAMONDS
2      HEARTS
3      SPADES
dtype: object

#### Converting to title case with .str.title()

In [34]:
suits.str.title()

0       Clubs
1    Diamonds
2      Hearts
3      Spades
dtype: object

#### Converting to sentence case with .str.capitalize()

In [35]:
suits.str.capitalize()

0       Clubs
1    Diamonds
2      Hearts
3      Spades
dtype: object

## Formatting strings

#### Formatting numbers with .style.format()

In [8]:
import numpy as np
import pandas as pd

# 3x3 array from np.random.rand, wrapped in a DataFrame with named columns.
# NOTE(review): no seed is set in the visible cells, so re-running will not
# reproduce the numbers shown in the saved output.
nums = np.random.rand(3, 3)
df = pd.DataFrame(nums, columns=["f1", "f2", "f3"])
# print() emits the plain-text repr of the unformatted frame; the next cell
# shows the same frame through df.style.format(...).
print(df)

         f1        f2        f3
0  0.299888  0.375619  0.185181
1  0.543743  0.272028  0.635070
2  0.465982  0.414964  0.727664


In [11]:
df.style.format(precision=1)

Unnamed: 0,f1,f2,f3
0,0.3,0.4,0.2
1,0.5,0.3,0.6
2,0.5,0.4,0.7


## Splitting strings

#### Splitting strings by a separator with .str.split()

In [16]:
# Each element is one comma-separated answer string. The Series is built
# directly so the name `answers` never refers to a plain list.
answers = pd.Series(
    [
        "cat, rat, dog",
        "bunny, fly, tiger",
        "mice, shield, teach",
    ]
)

In [17]:
answers.str.split(", ")

0          [cat, rat, dog]
1      [bunny, fly, tiger]
2    [mice, shield, teach]
dtype: object

## Joining/concatenating strings

#### Combining strings with +

In [21]:
"Let" + "'s " + "join"

"Let's join"

#### Combining lists of strings with a separator using .str.join()

In [23]:
# Each element is a list of strings; .str.join() glues every list into a
# single string using the given separator.
groceries = pd.Series(
    [
        ["tomato", "potato", "onion"],
        ["chips", "bread", "butter"],
    ]
)

groceries.str.join(", ")

0    tomato, potato, onion
1     chips, bread, butter
dtype: object

#### Collapsing strings with .str.cat()

In [25]:
quality = pd.Series(["fair", "good", "premium"])

# Called with only `sep`, .str.cat() joins every element of the Series into a
# single string instead of returning a Series.
quality.str.cat(sep=" ")

'fair good premium'

#### Repeating strings with .str.repeat()

In [26]:
s = pd.Series(["x", "y", "z"])

# A single integer `repeats` applies the same repeat count to every element.
s.str.repeat(repeats=2)

0    xx
1    yy
2    zz
dtype: object

In [27]:
s.str.repeat(repeats=[3, 2, 1])

0    xxx
1     yy
2      z
dtype: object

## Detecting matches

In [37]:
# Sample strings for the matching examples: rows 0 and 2 contain a digit, and
# every row contains the substring "ng" at least once.
actions = pd.Series(["hanging1", "bringing", "2claning"])

#### Detecting if a string contains a match of a regular expression with .str.contains()

In [38]:
# Raw string keeps the regex escape readable: r"\d" matches any single digit.
actions.str.contains(r"\d", regex=True)

0     True
1    False
2     True
dtype: bool

#### Finding the index of the first occurrence of a substring with .str.find()

In [39]:
actions.str.find("ng")

0    2
1    3
2    6
dtype: int64

#### Finding the index of the last occurrence of a substring with .str.rfind()

In [40]:
actions.str.rfind("ng")

0    5
1    6
2    6
dtype: int64

#### Counting the number of matches to a regex with .str.count()

In [41]:
actions.str.count(pat="ng")

0    2
1    2
2    1
dtype: int64

## Extracting matches

## Replacing matches