[Reference](https://medium.com/@sayahfares19/text-analysis-with-pandas-guide-af8b12873b04)

# 1. Manipulate Case

In [1]:
import pandas as pd
import numpy as np

# Sample names; the NaN entry shows how the .str methods treat a missing value.
serie = pd.Series(["lev gor'kov", np.nan, 'brillouin', 'albert einstein', 'carl m. bender'])

# Each entry pairs the printed heading with the bound .str method that produces it.
case_demos = [
    ('Lowercase all letters', serie.str.lower),
    ('Uppercase all letters', serie.str.upper),
    # capitalize() upper-cases only the first character of each string
    ('Uppercase the first letter', serie.str.capitalize),
    # title() upper-cases the first character of every word
    ('Uppercase the first letter of each word', serie.str.title),
]

for heading, convert in case_demos:
    print(f'{heading}:\n{convert()}\n')

Lowercase all letters:
0        lev gor'kov
1                NaN
2          brillouin
3    albert einstein
4     carl m. bender
dtype: object

Uppercase all letters:
0        LEV GOR'KOV
1                NaN
2          BRILLOUIN
3    ALBERT EINSTEIN
4     CARL M. BENDER
dtype: object

Uppercase the first letter:
0        Lev gor'kov
1                NaN
2          Brillouin
3    Albert einstein
4     Carl m. bender
dtype: object

Uppercase the first letter of each word:
0        Lev Gor'Kov
1                NaN
2          Brillouin
3    Albert Einstein
4     Carl M. Bender
dtype: object



# 2. Split Strings

In [2]:
import pandas as pd
import numpy as np

# Raw names: some have one word, some several, and one is missing.
serie = pd.Series(["lev gor'kov", np.nan, 'brillouin', 'albert einstein', 'carl m. bender'])

print(f'Before Splitting:\n{serie}\n')

# Title-case first so both resulting columns are already capitalised.
titled_names = serie.str.title()

# n=1 splits on the first space only, so everything after it stays together
# in the second column; expand=True returns the pieces as DataFrame columns.
name_parts = titled_names.str.split(' ', expand=True, n=1)

# Note: despite its name, `new_serie` is a two-column DataFrame.
new_serie = name_parts.rename(columns={0: 'First Name', 1: 'Last Name'})

print(f'After Splitting:\n{new_serie}')

Before Splitting:
0        lev gor'kov
1                NaN
2          brillouin
3    albert einstein
4     carl m. bender
dtype: object

After Splitting:
  First Name  Last Name
0        Lev    Gor'Kov
1        NaN        NaN
2  Brillouin       None
3     Albert   Einstein
4       Carl  M. Bender


# 3. Replace String

In [3]:
import pandas as pd
import numpy as np

# Same names as before, but two of them now carry a title prefix.
serie = pd.Series(["lev gor'kov", np.nan, 'Dr. brillouin', 'Pr. albert einstein', 'carl m. bender'])

print(f'Before Replacing:\n{serie}\n')

# Remove each title as a literal substring (regex=False, so '.' is not a wildcard).
without_titles = serie
for title_prefix in ('Dr.', 'Pr.'):
    without_titles = without_titles.str.replace(title_prefix, '', regex=False)

# strip() drops the space left behind where a title was removed.
tidy_names = without_titles.str.strip().str.title()

# Split on the first space only and label the two resulting columns.
name_parts = tidy_names.str.split(' ', expand=True, n=1)
new_serie = name_parts.rename(columns={0: 'First Name', 1: 'Last Name'})

print(f'After Replacing:\n{new_serie}')

Before Replacing:
0            lev gor'kov
1                    NaN
2          Dr. brillouin
3    Pr. albert einstein
4         carl m. bender
dtype: object

After Replacing:
  First Name  Last Name
0        Lev    Gor'Kov
1        NaN        NaN
2  Brillouin       None
3     Albert   Einstein
4       Carl  M. Bender


# 4. Concatenate

In [4]:
import pandas as pd
import numpy as np

# First and last names as nullable "string" Series; s_1 has one missing entry.
s_1 = pd.Series(["Albert", "John", "Robert", np.nan, "Jack"], dtype="string")
s_2 = pd.Series(["Doe", "Piter", "David", "Eden", "Carl"], dtype="string")

# Element-wise concatenation with a separator. Without `na_rep`, a row with a
# missing value on either side is missing in the result as well.
joined_default = s_1.str.cat(s_2, sep=" ")
print(f'Concatinate and ignore missing values:\n{joined_default}\n')

# `na_rep` substitutes a placeholder for missing values, so the row is kept.
joined_filled = s_1.str.cat(s_2, sep=" ", na_rep="-")
print()
print(f'Concatinate and replace missing values with "-":\n{joined_filled}\n')

Concatinate and ignore missing values:
0      Albert Doe
1      John Piter
2    Robert David
3            <NA>
4       Jack Carl
dtype: string


Concatinate and replace missing values with "-":
0      Albert Doe
1      John Piter
2    Robert David
3          - Eden
4       Jack Carl
dtype: string



# 5. Additional Methods:
-    .startswith(pattern): Returns True for each element of the Series/Index whose string starts with the pattern.
-    .endswith(pattern): Returns True for each element of the Series/Index whose string ends with the pattern.
-    .repeat(value): Repeats each string the given number of times; for example, with a value of 2 every string appears twice in a row ('ab' becomes 'abab').
-    .find(pattern): Returns the lowest index at which the pattern (a plain substring) occurs in each string, or -1 if it is not found.

# 6. Information Extraction from Text

In [5]:
import pandas as pd


# Example notes, one per weekday; some mention one or more clock times.
time_sentences = ["Saturday: Weekend (Not working day)",
                  "Sunday: Weekend (Not working day)",
                  "Monday: The doctor's appointment is at 2:45pm.",
                  "Tuesday: The dentist's appointment is at 11:30 am.",
                  "Wednesday: At 7:00pm, there is a basketball game!",
                  "Thursday: Be back home by 11:15 pm at the latest.",
                  "Friday: Take the train at 08:10 am, arrive at 09:00am."]

df = pd.DataFrame(time_sentences, columns=['text'])

# Derive simple text features. Every feature is computed from the original
# (mixed-case) `df.text`; only the displayed `text` column is lower-cased.
text_features = df.assign(
    text=df.text.str.lower(),
    text_len=df.text.str.len(),
    # Count whitespace-separated tokens instead of "number of spaces + 1", so
    # repeated, leading or trailing spaces (or an empty string) do not inflate
    # the count.
    word_count=df.text.str.split().str.len(),
    # case=False lets the capitalised day names match the lower-case pattern.
    weekend=df.text.str.contains("saturday|sunday", case=False),
    # Two capture groups: findall yields a list of (hour, minute) string tuples
    # per row, and an empty list when the text contains no time.
    appointment_time=df.text.str.findall(r"(\d?\d):(\d\d)"),
)

# Bare last expression so the notebook renders the resulting frame.
text_features

Unnamed: 0,text,text_len,word_count,weekend,appointment_time
0,saturday: weekend (not working day),35,5,True,[]
1,sunday: weekend (not working day),33,5,True,[]
2,monday: the doctor's appointment is at 2:45pm.,46,7,False,"[(2, 45)]"
3,tuesday: the dentist's appointment is at 11:30...,50,8,False,"[(11, 30)]"
4,"wednesday: at 7:00pm, there is a basketball game!",49,8,False,"[(7, 00)]"
5,thursday: be back home by 11:15 pm at the latest.,49,10,False,"[(11, 15)]"
6,"friday: take the train at 08:10 am, arrive at ...",54,10,False,"[(08, 10), (09, 00)]"
