# pandas and text methods

We will create a DataFrame to practice with:

In [None]:
import pandas as pd

names = ["Erika Schumacher", "Javi López", "Maria Rovira", "Ana Garamond", 
         "Shekhar Biswas", "Muriel Adams", "Saira Polom", "Alex Edwin", 
         "Kit Ching", "Dog Woof"]
ages = [22, 50, 23, 29, 44, 30, 25, 71, 35, 2]
nations = ["DE", "ES", "ES", "ES", "IN", "DE", "IN", "UK", "UK", "XX"]
sibilings = [2, 0, 4, 1, 1, 2, 3, 7, 0, 9]
colors = ["Red", "Yellow", "Yellow", "Blue", "Red", "Yellow", "Blue", "Blue", "Red", "Gray"]



people = pd.DataFrame({"name":names,
                       "age":ages,
                       "country":nations,
                       "sibilings":sibilings,
                       "favourite_color":colors
                      })

people.head()

Unnamed: 0,name,age,country,sibilings,favourite_color
0,Erika Schumacher,22,DE,2,Red
1,Javi López,50,ES,0,Yellow
2,Maria Rovira,23,ES,4,Yellow
3,Ana Garamond,29,ES,1,Blue
4,Shekhar Biswas,44,IN,1,Red


In [None]:
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             10 non-null     object
 1   age              10 non-null     int64 
 2   country          10 non-null     object
 3   sibilings        10 non-null     int64 
 4   favourite_color  10 non-null     object
dtypes: int64(2), object(3)
memory usage: 528.0+ bytes


## String Operations

You have already learned how to filter data with simple conditions, like getting all people whose favourite colour is "Red":

In [None]:
people.loc[people["favourite_color"]=="Red",:]

Unnamed: 0,name,age,country,sibilings,favourite_color
0,Erika Schumacher,22,DE,2,Red
4,Shekhar Biswas,44,IN,1,Red
8,Kit Ching,35,UK,0,Red


In [None]:
people.loc[(people["favourite_color"]=="Red")|(people["favourite_color"]=="Blue"),:]

Unnamed: 0,name,age,country,sibilings,favourite_color
0,Erika Schumacher,22,DE,2,Red
3,Ana Garamond,29,ES,1,Blue
4,Shekhar Biswas,44,IN,1,Red
6,Saira Polom,25,IN,3,Blue
7,Alex Edwin,71,UK,7,Blue
8,Kit Ching,35,UK,0,Red


In [None]:
people.loc[(people["favourite_color"]=="Red")|(people["favourite_color"]=="Blue")|(people["favourite_color"]=="Gray"),:]

Unnamed: 0,name,age,country,sibilings,favourite_color
0,Erika Schumacher,22,DE,2,Red
3,Ana Garamond,29,ES,1,Blue
4,Shekhar Biswas,44,IN,1,Red
6,Saira Polom,25,IN,3,Blue
7,Alex Edwin,71,UK,7,Blue
8,Kit Ching,35,UK,0,Red
9,Dog Woof,2,XX,9,Gray


In [None]:
people.loc[people['favourite_color'].isin(['Red', 'Blue', 'Gray']), :]

Unnamed: 0,name,age,country,sibilings,favourite_color
0,Erika Schumacher,22,DE,2,Red
3,Ana Garamond,29,ES,1,Blue
4,Shekhar Biswas,44,IN,1,Red
6,Saira Polom,25,IN,3,Blue
7,Alex Edwin,71,UK,7,Blue
8,Kit Ching,35,UK,0,Red
9,Dog Woof,2,XX,9,Gray


In [None]:
people.loc[~people['favourite_color'].isin(['Red', 'Blue', 'Gray']), :]

Unnamed: 0,name,age,country,sibilings,favourite_color
1,Javi López,50,ES,0,Yellow
2,Maria Rovira,23,ES,4,Yellow
5,Muriel Adams,30,DE,2,Yellow


In [None]:
people.loc[people['age'].between(25, 50), :]

Unnamed: 0,name,age,country,sibilings,favourite_color
1,Javi López,50,ES,0,Yellow
3,Ana Garamond,29,ES,1,Blue
4,Shekhar Biswas,44,IN,1,Red
5,Muriel Adams,30,DE,2,Yellow
6,Saira Polom,25,IN,3,Blue
8,Kit Ching,35,UK,0,Red


In [None]:
people["name"]

0    Erika Schumacher
1          Javi López
2        Maria Rovira
3        Ana Garamond
4      Shekhar Biswas
5        Muriel Adams
6         Saira Polom
7          Alex Edwin
8           Kit Ching
9            Dog Woof
Name: name, dtype: object

In [None]:
the_filter = people["name"].str.startswith('A')
the_filter

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7     True
8    False
9    False
Name: name, dtype: bool

In [None]:
people.loc[the_filter, :]

Unnamed: 0,name,age,country,sibilings,favourite_color
3,Ana Garamond,29,ES,1,Blue
7,Alex Edwin,71,UK,7,Blue


In [None]:
people.loc[people["name"].str.endswith("ms"), :]

Unnamed: 0,name,age,country,sibilings,favourite_color
5,Muriel Adams,30,DE,2,Yellow


In [None]:
the_filter = people["name"].str.contains("m")
people.loc[the_filter, :]

Unnamed: 0,name,age,country,sibilings,favourite_color
0,Erika Schumacher,22,DE,2,Red
3,Ana Garamond,29,ES,1,Blue
5,Muriel Adams,30,DE,2,Yellow
6,Saira Polom,25,IN,3,Blue


In [None]:
people.name.str.lower()

0    erika schumacher
1          javi lópez
2        maria rovira
3        ana garamond
4      shekhar biswas
5        muriel adams
6         saira polom
7          alex edwin
8           kit ching
9            dog woof
Name: name, dtype: object

In [None]:
people.loc[people.name.str.lower().str.contains("maria"), :]

Unnamed: 0,name,age,country,sibilings,favourite_color
2,maria rovira,23,ES,4,Yellow


In [None]:
people.name.str.upper()

0    ERIKA SCHUMACHER
1          JAVI LÓPEZ
2        MARIA ROVIRA
3        ANA GARAMOND
4      SHEKHAR BISWAS
5        MURIEL ADAMS
6         SAIRA POLOM
7          ALEX EDWIN
8           KIT CHING
9            DOG WOOF
Name: name, dtype: object

In [None]:
people.name

0    Erika Schumacher
1          Javi López
2        Maria Rovira
3        Ana Garamond
4      Shekhar Biswas
5        Muriel Adams
6         Saira Polom
7          Alex Edwin
8           Kit Ching
9            Dog Woof
Name: name, dtype: object

In [None]:
people["name"] = people.name.str.lower()
people

Unnamed: 0,name,age,country,sibilings,favourite_color
0,erika schumacher,22,DE,2,Red
1,javi lópez,50,ES,0,Yellow
2,maria rovira,23,ES,4,Yellow
3,ana garamond,29,ES,1,Blue
4,shekhar biswas,44,IN,1,Red
5,muriel adams,30,DE,2,Yellow
6,saira polom,25,IN,3,Blue
7,alex edwin,71,UK,7,Blue
8,kit ching,35,UK,0,Red
9,dog woof,2,XX,9,Gray


In [None]:
people.name.str.title()

0    Erika Schumacher
1          Javi López
2        Maria Rovira
3        Ana Garamond
4      Shekhar Biswas
5        Muriel Adams
6         Saira Polom
7          Alex Edwin
8           Kit Ching
9            Dog Woof
Name: name, dtype: object

In [None]:
people.name.str.len()

0    16
1    10
2    12
3    12
4    14
5    12
6    11
7    10
8     9
9     8
Name: name, dtype: int64

In [None]:
people.name.str[:2]

0    er
1    ja
2    ma
3    an
4    sh
5    mu
6    sa
7    al
8    ki
9    do
Name: name, dtype: object

In [None]:
people['first_two'] = people.name.str[:2]
people

Unnamed: 0,name,age,country,sibilings,favourite_color,first_two
0,erika schumacher,22,DE,2,Red,er
1,javi lópez,50,ES,0,Yellow,ja
2,maria rovira,23,ES,4,Yellow,ma
3,ana garamond,29,ES,1,Blue,an
4,shekhar biswas,44,IN,1,Red,sh
5,muriel adams,30,DE,2,Yellow,mu
6,saira polom,25,IN,3,Blue,sa
7,alex edwin,71,UK,7,Blue,al
8,kit ching,35,UK,0,Red,ki
9,dog woof,2,XX,9,Gray,do


In [None]:
people['last_two'] = people.name.str[-2:]
people

Unnamed: 0,name,age,country,sibilings,favourite_color,first_two,last_two
0,erika schumacher,22,DE,2,Red,er,er
1,javi lópez,50,ES,0,Yellow,ja,ez
2,maria rovira,23,ES,4,Yellow,ma,ra
3,ana garamond,29,ES,1,Blue,an,nd
4,shekhar biswas,44,IN,1,Red,sh,as
5,muriel adams,30,DE,2,Yellow,mu,ms
6,saira polom,25,IN,3,Blue,sa,om
7,alex edwin,71,UK,7,Blue,al,in
8,kit ching,35,UK,0,Red,ki,ng
9,dog woof,2,XX,9,Gray,do,of


In [None]:
people.name.str.cat(sep=",")

'erika schumacher,javi lópez,maria rovira,ana garamond,shekhar biswas,muriel adams,saira polom,alex edwin,kit ching,dog woof'

In [None]:
people.name + " " + people.country

0    erika schumacher DE
1          javi lópez ES
2        maria rovira ES
3        ana garamond ES
4      shekhar biswas IN
5        muriel adams DE
6         saira polom IN
7          alex edwin UK
8           kit ching UK
9            dog woof XX
dtype: object

In [None]:
print("Ali", " Elkassas")

Ali  Elkassas


In [None]:
people.name.str.split(" ", expand=True)

Unnamed: 0,0,1
0,erika,schumacher
1,javi,lópez
2,maria,rovira
3,ana,garamond
4,shekhar,biswas
5,muriel,adams
6,saira,polom
7,alex,edwin
8,kit,ching
9,dog,woof


In [None]:
word = "        I love     mango       "
word[0:5:2] #[start:end:step]

'   '

In [None]:
word[::2]

'mno'

In [None]:
word.strip()

'I love     mango'

In [None]:
word.replace("m", "z")

'        I love     zango       '

In [None]:
print(f"the dataset has the size {people.size} and the shape is the column names are:\n {people.shape}")

the dataset has the size 70 and the shape is the column names are:
 (10, 7)


In [None]:
people.sample(2)

Unnamed: 0,name,age,country,sibilings,favourite_color,first_two,last_two
2,maria rovira,23,ES,4,Yellow,ma,ra
8,kit ching,35,UK,0,Red,ki,ng


In [None]:
test_str = "abcdef"

test_str[4:6]

'ef'

In [None]:
test_str[-2:]

'ef'

In [None]:
test_str[-1]

'f'

When it comes to text data, sometimes the conditions are more complex. How would we select all the people whose name starts with a certain letter? 

This is where pandas String Operations are really helpful. Go through [this user guide](https://pandas.pydata.org/docs/user_guide/text.html#string-methods) from Pandas' documentation, it's a good introduction. Here are some examples:

Filtering rows with name starting with A:

- first we generate the boolean expression

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.Series.str.startswith.html

people.name.str.startswith("A")

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7     True
8    False
9    False
Name: name, dtype: bool

- and then pass it to `loc[]`

In [None]:
people.loc[people.name.str.startswith("A"),]

Unnamed: 0,name,age,country,sibilings,favourite_color
3,Ana Garamond,29,ES,1,Blue
7,Alex Edwin,71,UK,7,Blue


String methods can also change text:

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.Series.str.lower.html

people.name.str.lower()

0    erika schumacher
1          javi lópez
2        maria rovira
3        ana garamond
4      shekhar biswas
5        muriel adams
6         saira polom
7          alex edwin
8           kit ching
9            dog woof
Name: name, dtype: object

Note that we have just outputted these names, but we have not changed the original dataframe:

In [None]:
people.head(2)

Unnamed: 0,name,age,country,sibilings,favourite_color
0,Erika Schumacher,22,DE,2,Red
1,Javi López,50,ES,0,Yellow


pandas will not make changes to the original data unless you explicitly tell it to do so. If we wanted to change the original dataframe, we would have assign this output (the names in lower case) to the column in the dataframe we want to change. When doing that, it is important that you select that column using `loc[]`, and not simply `DataFrame.column`:

In [None]:
people.loc[:,"name"] = people.name.str.lower()

In [None]:
# now the original dataframe has been modified:
people.head(2)

Unnamed: 0,name,age,country,sibilings,favourite_color
0,erika schumacher,22,DE,2,Red
1,javi lópez,50,ES,0,Yellow


###### **Exercise 1:**
select all people whose name contains (either in the first name or the surname) the letter `p`.

In [None]:
# your code here

###### **Exercise 2:**
select all people whose full name + surname has more than 12 characters.

In [None]:
# your code here

###### **Exercise 3:**
select all people whose surname starts with the letter `e`:

In [None]:
# your code here

###### **Exercise 4:**
Create a new dataframe, `people_names`, where the first name and the last name are split into two different columns, `first_name` and `last_name`. The first row of the new dataframe should look like this:

`name           	first_name	last_name	age	country 	sibilings	favourite_color`

`erika schumacher	erika    	schumacher	22	DE      	2       	Red`

In [None]:
# your code here


In [None]:
#df[for any selection/slicing even with loc use square brackets].method()


people[['first_name', 'last_name']] = people['name'].str.split(expand=True)
people = people.iloc[:, [0, 5, 6, 1, 2, 3, 4]]
people

Unnamed: 0,name,first_name,last_name,age,country,sibilings,favourite_color
0,Erika Schumacher,Erika,Schumacher,22,DE,2,Red
1,Javi López,Javi,López,50,ES,0,Yellow
2,Maria Rovira,Maria,Rovira,23,ES,4,Yellow
3,Ana Garamond,Ana,Garamond,29,ES,1,Blue
4,Shekhar Biswas,Shekhar,Biswas,44,IN,1,Red
5,Muriel Adams,Muriel,Adams,30,DE,2,Yellow
6,Saira Polom,Saira,Polom,25,IN,3,Blue
7,Alex Edwin,Alex,Edwin,71,UK,7,Blue
8,Kit Ching,Kit,Ching,35,UK,0,Red
9,Dog Woof,Dog,Woof,2,XX,9,Gray


Unnamed: 0,name,first_name,last_name,age,country,sibilings,favourite_color
0,Erika Schumacher,Erika,Schumacher,22,DE,2,Red
1,Javi López,Javi,López,50,ES,0,Yellow
2,Maria Rovira,Maria,Rovira,23,ES,4,Yellow
3,Ana Garamond,Ana,Garamond,29,ES,1,Blue
4,Shekhar Biswas,Shekhar,Biswas,44,IN,1,Red
5,Muriel Adams,Muriel,Adams,30,DE,2,Yellow
6,Saira Polom,Saira,Polom,25,IN,3,Blue
7,Alex Edwin,Alex,Edwin,71,UK,7,Blue
8,Kit Ching,Kit,Ching,35,UK,0,Red
9,Dog Woof,Dog,Woof,2,XX,9,Gray


Unnamed: 0,name,first_name,last_name,age,country,sibilings,favourite_color
0,Erika Schumacher,Erika,Schumacher,22,DE,2,Red
1,Javi López,Javi,López,50,ES,0,Yellow
2,Maria Rovira,Maria,Rovira,23,ES,4,Yellow
3,Ana Garamond,Ana,Garamond,29,ES,1,Blue
4,Shekhar Biswas,Shekhar,Biswas,44,IN,1,Red
5,Muriel Adams,Muriel,Adams,30,DE,2,Yellow
6,Saira Polom,Saira,Polom,25,IN,3,Blue
7,Alex Edwin,Alex,Edwin,71,UK,7,Blue
8,Kit Ching,Kit,Ching,35,UK,0,Red
9,Dog Woof,Dog,Woof,2,XX,9,Gray


In [None]:
people

Unnamed: 0,name,age,country,sibilings,favourite_color,first_name,last_name
0,Erika Schumacher,22,DE,2,Red,Erika,Schumacher
1,Javi López,50,ES,0,Yellow,Javi,López
2,Maria Rovira,23,ES,4,Yellow,Maria,Rovira
3,Ana Garamond,29,ES,1,Blue,Ana,Garamond
4,Shekhar Biswas,44,IN,1,Red,Shekhar,Biswas
5,Muriel Adams,30,DE,2,Yellow,Muriel,Adams
6,Saira Polom,25,IN,3,Blue,Saira,Polom
7,Alex Edwin,71,UK,7,Blue,Alex,Edwin
8,Kit Ching,35,UK,0,Red,Kit,Ching
9,Dog Woof,2,XX,9,Gray,Dog,Woof


In [None]:
#phelp = people.name.str.split(" ",expand=True)
#phelp.columns = ['fname','lname']
%%timeit
phelp = pd.DataFrame()
phelp[['first_name', 'last_name']] = people['name'].str.split(expand=True)
people_names = pd.concat([people.iloc[:, :1], phelp, people.iloc[:, 1:]], axis=1)
people_names

Unnamed: 0,name,first_name,last_name,age,country,sibilings,favourite_color,first_name.1,last_name.1
0,Erika Schumacher,Erika,Schumacher,22,DE,2,Red,Erika,Schumacher
1,Javi López,Javi,López,50,ES,0,Yellow,Javi,López
2,Maria Rovira,Maria,Rovira,23,ES,4,Yellow,Maria,Rovira
3,Ana Garamond,Ana,Garamond,29,ES,1,Blue,Ana,Garamond
4,Shekhar Biswas,Shekhar,Biswas,44,IN,1,Red,Shekhar,Biswas
5,Muriel Adams,Muriel,Adams,30,DE,2,Yellow,Muriel,Adams
6,Saira Polom,Saira,Polom,25,IN,3,Blue,Saira,Polom
7,Alex Edwin,Alex,Edwin,71,UK,7,Blue,Alex,Edwin
8,Kit Ching,Kit,Ching,35,UK,0,Red,Kit,Ching
9,Dog Woof,Dog,Woof,2,XX,9,Gray,Dog,Woof
