# 1. String Operations

### Changing case

In [1]:
# Mixed-case sample string used by the case-conversion cells below
my_string = "tHis Is a niCe StriNg"

In [2]:
# Converting to lowercase: .lower() returns a new string with every letter lowercased
print(my_string.lower())

this is a nice string


In [3]:
# Converting to uppercase: .upper() returns a new string with every letter uppercased
print(my_string.upper())

THIS IS A NICE STRING


In [4]:
# Capitalizing the first character; all remaining characters are lowercased
print(my_string.capitalize())

This is a nice string


### Splitting

In [5]:
# Splitting a string into a list of substrings
# Sample sentence used by the split()/rsplit() cells below
my_string = "This string will be split"

In [6]:
# 'split' starts splitting at the left
print(my_string.split(sep=" ", maxsplit=2)) # maxsplit caps the number of splits performed, so at most maxsplit + 1 substrings are returned

['This', 'string', 'will be split']


In [7]:
# 'rsplit' begins at the right of the string
print(my_string.rsplit(sep=" ", maxsplit=2)) # maxsplit caps the number of splits performed, so at most maxsplit + 1 substrings are returned

['This string will', 'be', 'split']


In [8]:
# Escape sequences
my_string_1 = "This string will be split\nin two" # \n is a newline: the text following it starts on a new line.

my_string_2 = "This string will be split\rin two" # \r is a carriage return: it sends the cursor back to the beginning of the line; how that is displayed depends on where the text is printed.

In [9]:
# Newline: the text is printed across two lines
print(my_string_1)

# Carriage return: in the output recorded below, both parts appear on one line
print(my_string_2)

This string will be split
in two
This string will be splitin two


In [10]:
# Split the string at line boundaries; the single \n here yields a list of 2 elements
print(my_string_1.splitlines())

['This string will be split', 'in two']


### Joining

In [11]:
# Words that the join() cell below glues back into one string
my_list = ["this", "would", "be", "a", "string"]

In [12]:
# Concatenate strings from list or another iterable: the string that join() is
# called on is placed between consecutive items (first a space, then an underscore)
print(" ".join(my_list), "_".join(my_list), sep="\n")

this would be a string
this_would_be_a_string


### Stripping & Trimming

In [13]:
# Sample with one leading space and a trailing newline for the strip demos below
my_string = " This string will be stripped\n"

In [14]:
# Remove whitespace from both ends of the string: .strip()
print(my_string.strip())

This string will be stripped


In [15]:
# Remove whitespace from the right end only: .rstrip() (the leading space is kept)
print(my_string.rstrip())

 This string will be stripped


In [16]:
# Remove whitespace from the left end only: .lstrip() (the trailing \n is kept, hence the extra blank line in the output)
print(my_string.lstrip())

This string will be stripped



## Example #1: Normalizing reviews

In [17]:
# Raw review text, wrapped in $ signs that the following cells strip away
movie = "$I supposed that coming from MTV Films I should expect no less$"

In [18]:
# Convert the string in the variable movie to lowercase. Print the result.
# .lower() returns a new string, so movie itself is left untouched.
movie_lower = movie.lower()
print(movie_lower)

$i supposed that coming from mtv films i should expect no less$


In [19]:
# Remove the $ that occur at the start and at the end of the string contained in movie_lower. Print the results.
# Passing "$" makes strip() remove that character from both ends instead of whitespace.
movie_no_sign = movie_lower.strip("$")
print(movie_no_sign)

i supposed that coming from mtv films i should expect no less


In [20]:
# Split the string contained in movie_no_sign into as many substrings as possible. Print the results.
# With no arguments, split() separates on whitespace and applies no limit on the number of splits.
movie_split = movie_no_sign.split()
print(movie_split)

['i', 'supposed', 'that', 'coming', 'from', 'mtv', 'films', 'i', 'should', 'expect', 'no', 'less']


In [21]:
# To get the root of the second word contained in movie_split, select all the characters except the last one.
# Index 1 picks the second word ('supposed'); the slice [:-1] drops its final character.
word_root = movie_split[1][:-1]
print(word_root)

suppose


## Example #2: Time to join!

In [22]:
# Review text ending in the tag <\i>. The raw-string prefix keeps the backslash
# literal without relying on "\i", which is not a valid escape sequence.
movie = r"the film,however,is all good<\i>"

In [23]:
# Remove tag <\i> from the end of the string. Print the results.
# The raw string keeps the backslash literal without relying on the invalid "\i" escape.
# Note: rstrip() treats its argument as a set of characters ('<', '\', 'i', '>') to
# strip from the right, not as a suffix; that is enough here because the text just
# before the tag ends in 'd', which is not in that set.
movie_tag = movie.rstrip(r'<\i>')
print(movie_tag)

the film,however,is all good


In [24]:
# Split the string using commas and print results
# The separator itself is not kept in the resulting substrings.
movie_no_comma = movie_tag.split(sep=",")
print(movie_no_comma)

['the film', 'however', 'is all good']


In [25]:
# Join back together and print results
# The single space that join() is called on is inserted between the list items.
movie_join = " ".join(movie_no_comma)
print(movie_join)

the film however is all good


## Example #3: Split lines or split the line?

In [26]:
# Sample text on a single line: it contains commas but no line breaks
file = "mtv films election, a high school comedy, is a current example from there, director steven spielberg wastes no time, taking us into the water on a midnight swim"

In [27]:
# Split the string file into many substrings at line boundaries.
# file contains no line breaks, so the result is a one-element list holding the whole text.
file_split = file.splitlines()
print(file_split)

['mtv films election, a high school comedy, is a current example from there, director steven spielberg wastes no time, taking us into the water on a midnight swim']


In [28]:
# Complete the for-loop to split the strings into many substrings using commas as a separator element.
# The space that follows each comma stays at the start of the next substring.

for substring in file_split:
    substring_split = substring.split(",")
    print(substring_split)

['mtv films election', ' a high school comedy', ' is a current example from there', ' director steven spielberg wastes no time', ' taking us into the water on a midnight swim']


# 2. Finding and Replacing

### Find

In [29]:
# Sample string searched by the find()/index() cells below
my_string = "Where's Waldo"

In [30]:
# Finding the lowest index of a specified substring
my_string.find("Waldo")

8

In [31]:
# Searches for "Waldo" between indices 0 (inclusive) and 6 (exclusive). Returns -1 because "Waldo" is not in this range.
print(my_string.find("Waldo", 0, 6))  

# Searches for "Waldo" between indices 7 (inclusive) and 13 (exclusive). Returns 8 because "Waldo" starts at index 8 and fits entirely within this range.
print(my_string.find("Waldo", 7, 13))  

-1
8


### Index

In [32]:
# The index method raises an Exception, different from .find() that returns -1
# "Wenda" does not occur in my_string, so .index() raises ValueError, which is caught here.
try:
    my_string.index("Wenda")
except ValueError:
    print("Not found")

Not found


### Count

In [33]:
# How many times the substring appears in a string
my_string = "How many fruits do you have in your fruit basket"
# First the whole string, then only indices 0 up to (not including) 16
print(my_string.count("fruit"), my_string.count("fruit",0,16), sep="\n")

2
1


### Replace

In [34]:
# Replace substrings with new substrings
my_string = "The red house is between the blue house and the old house"
# Without a count every occurrence is replaced; a count of 2 replaces only the first two
print(my_string.replace("house","car"), my_string.replace("house","car",2), sep="\n")

The red car is between the blue car and the old car
The red car is between the blue car and the old house


## Example #1: Finding a substring

In [35]:
import pandas as pd

# Read the reviews, keep the rows at positions 200, 201 and 202, then select
# the "text" column so that movies ends up as a pandas Series
movies = pd.read_csv('short_movies.csv').iloc[200:203]["text"]

In [36]:
# Display the selected reviews (a bare last expression is rendered as the cell output)
movies

200    it's clear that he's passionate about his beli...
201    I believe you I always said that the actor act...
202    it's astonishing how frightening the actor act...
Name: text, dtype: object

In [37]:
for movie in movies:
    # "actor" must lie entirely within characters 37 to 41 (the end index 42 is exclusive)
    if movie.find("actor", 37, 42) == -1:
        print("Word not found")
        continue
    # Exactly two occurrences: collapse "actor actor" into one word;
    # otherwise collapse the tripled "actor actor actor" into one word
    print(
        movie.replace(
            "actor actor" if movie.count("actor") == 2 else "actor actor actor",
            "actor",
        )
    )

Word not found
I believe you I always said that the actor is amazing in every movie he has played
it's astonishing how frightening the actor norton looks with a shaved head and a swastika on his chest.


## Example #2: Where's the word?

In [38]:
# Find the index where money occurs between characters with index 12 and 50. If not found, the method should return -1.
for movie in movies:
  # Find the first occurrence of word; the end argument 51 is exclusive, so index 50 is the last one searched
  print(movie.find("money", 12, 51))

-1
-1
-1


In [39]:
# Find the index where money occurs between characters with index 12 and 50. If not found, it should raise an error.
for movie in movies:
    try:
        # Find the first occurrence of word; unlike .find(), .index() raises
        # ValueError when the substring is absent from the searched range
        print(movie.index("money", 12, 51))
    except ValueError:
        print("substring not found")

substring not found
substring not found
substring not found


## Example #3: Replacing negations

In [40]:
# Single review text; note that this rebinds movies from the pandas Series used above to a plain string
movies = "the rest of the story isn't important because all it does is serve as a mere backdrop for the two stars to share the screen ."

In [41]:
# Replace the substring isn't with the word is.
movies_no_negation = movies.replace("isn't", "is")

# Replace the substring important with the word insignificant.
# This works on the result of the previous replacement, so both changes end up in movies_antonym.
movies_antonym = movies_no_negation.replace("important", "insignificant")

# Print out the result contained in the variable movies_antonym
print(movies_antonym)

the rest of the story is insignificant because all it does is serve as a mere backdrop for the two stars to share the screen .


# 2. Formatting Strings

In [42]:
# f-string preview: the expression inside the braces is replaced by its value
custom_string = "String formatting"
print(f"{custom_string} is a powerful technique")

String formatting is a powerful technique


### Positional formatting

In [43]:
# Placeholders are replaced by values - 'text{}'.format(value); empty {} are filled in argument order
print("Machine learning provides {} the ability to learn {}".format("systems", "automaticaly"))

Machine learning provides systems the ability to learn automaticaly


In [44]:
# Variables for the initial string and for the values passed into the method
my_string = "{} rely on {} datasets"
method = "Supervised algorithms"
condition = "labeled"
print(my_string.format(method, condition))

Supervised algorithms rely on labeled datasets


### Reordering values

In [45]:
# Include an index number into the placeholders to reorder values
# First line: empty {} are filled in argument order. Second line: {2}, {0} and {1} pick arguments by position.
print("{} has a friend called {} and a sister called {}".format("Betty","Linda","Daisy"))
print("{2} has a friend called {0} and a sister called {1}".format("Betty","Linda","Daisy"))

Betty has a friend called Linda and a sister called Daisy
Daisy has a friend called Betty and a sister called Linda


### Named placeholders

In [46]:
# Specify a name for the placeholders
# The keyword names passed to format() (title, aim) must match the placeholder names in the string.
tool = "Unsupervised algorithms"
goal = "patterns"
print("{title} try to find {aim} in the dataset".format(title=tool, aim=goal))

Unsupervised algorithms try to find patterns in the dataset


In [47]:
# Named placeholder combined with a key lookup: data[tool] reads the "tool" key of the
# dictionary passed as data (no quotes around the key inside the braces with .format())
my_methods = {"tool": "Unsupervised algorithms", "goal":"patterns"}
print('{data[tool]} try to find {data[goal]} in the dataset'.format(data=my_methods))

Unsupervised algorithms try to find patterns in the dataset


### Format specifier

In [48]:
# Specify data type to be used: {index:specifier}
# f shows six decimal places by default; .2f rounds to two decimal places
print("Only {0:f}% of the {1} produced worldwide is {2}!".format(0.5155675,"data","analyzed"))
print("Only {0:.2f}% of the {1} produced worldwide is {2}!".format(0.5155675,"data","analyzed"))

Only 0.515567% of the data produced worldwide is analyzed!
Only 0.52% of the data produced worldwide is analyzed!


### Formatting datetime

In [49]:
from datetime import datetime
# Default string form of the current date and time
print(datetime.now())
# The strftime-style codes after the colon control how the datetime is rendered
print("Today's date is {:%Y-%m-%d %H:%M}".format(datetime.now()))

2025-01-05 09:33:27.004726
Today's date is 2025-01-05 09:33


## Example #1: Put it in order!

In [50]:
# Source sentence that the next cell slices into two substrings
wikipedia_article = "In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals."
print(wikipedia_article)

In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals.


- Assign the substrings going from the 4th to the 19th character inclusive, and from the 22nd to the 44th character inclusive of `wikipedia_article` to the variables `first_pos` and `second_pos`, respectively. Adjust the strings to be lowercase.

In [51]:
# Assign the substrings to the variables
# Slices are zero-based with an exclusive end: [3:19] covers characters 4 to 19, [21:44] covers characters 22 to 44
first_pos = wikipedia_article[3:19].lower()
second_pos = wikipedia_article[21:44].lower()
print(first_pos, second_pos)

computer science artificial intelligence


- Define a string with the text `"The tool is used in"`, adding placeholders after the word `tool` and the word `in` for future positional formatting. Append it to the list `my_list`.

In [52]:
# Define string with placeholders
# The list starts out holding just the template with two empty positional placeholders
my_list = ["The tool {} is used in {}"]

- Now reorder them so the second argument passed to the method will replace the first placeholder. Append to the list `my_list`.

In [53]:
# Define string with rearranged placeholders
# {1} takes the second argument passed to format(), {0} the first
my_list.append("The tool {1} is used in {0}")

- Complete the for-loop so that it uses the `.format()` method and the variables `first_pos` and `second_pos` to print out every string in `my_list`.

In [54]:
# Use format to print strings
# Both templates receive the same arguments; only the placeholder indices change the order
for my_string in my_list:
    print(my_string.format(first_pos, second_pos))

The tool computer science is used in artificial intelligence
The tool artificial intelligence is used in computer science


## Example #2: Calling by its name

In [55]:
# Two course names used to build the plan dictionary below
courses = ['artificial intelligence', 'neural networks']

- Create a dictionary assigning the first and second element appearing in the list `courses` to the keys `"field"` and `"tool"` respectively.

In [56]:
# Create a dictionary
# "field" maps to the first course and "tool" to the second
plan = dict(field=courses[0], tool=courses[1])
print(plan)

{'field': 'artificial intelligence', 'tool': 'neural networks'}


- Print out the resulting message using the `.format()` method, passing the `plan` dictionary to replace the `data` placeholders.

In [57]:
# Complete the placeholders accessing elements of field and tool keys in the data dictionary
# Inside .format() placeholders the keys are written without quotes: data[field], data[tool]
my_message = "If you are interested in {data[field]}, you can take the course related to {data[tool]}"

# Use the plan dictionary to replace placeholders
print(my_message.format(data=plan))

If you are interested in artificial intelligence, you can take the course related to neural networks


## Example #3: What day is today?

You write down some specifiers to help you: `%d` (day), `%B` (month name), `%m` (month number), `%Y` (year), `%H` (hour) and `%M` (minutes).

In [58]:
# Import datetime 
from datetime import datetime

# Assign date to get_date
get_date = datetime.now()

# Add named placeholders with format specifiers
# The same named placeholder (today) is used twice, each time with a different format specifier
message = "Good morning. Today is {today:%B %d, %Y}. It's {today:%H:%M} ... time to work!"

# Use the format method replacing the placeholder with get_date
print(message.format(today=get_date))

Good morning. Today is January 05, 2025. It's 09:33 ... time to work!


# 3. Formatted string literal

`f"literal string {expression}"`

In [59]:
# Variables referenced by name inside the f-string braces
way = "code"
method = "learning Python faster"
print(f"Practicing how to {way} is the best method for {method}")

Practicing how to code is the best method for learning Python faster


### Conversions allowed:

- `!s`: (string version)

- `!r`: (string with printable representation with quotes)

- `!a`: (same as !r but escape the non-ASCII characters)

In [60]:
# !r (string with printable representation with quotes)
# The conversion is written after the expression, inside the braces
name = "Python"
print(f"Python is called {name!r} due to a comedy series")

Python is called 'Python' due to a comedy series


### Format specifiers:

- `e`: scientific notation, like 5 × 10^3 (displayed as `5.000000e+03`)

- `d`: decimal integer

- `f`: float

- `datetime`

In [61]:
# .2f after the colon rounds the float to two decimal places in the output
number = 90.458513812842
print(f"In the last 2 years, {number:.2f}% of the data was produced worldwide")

In the last 2 years, 90.46% of the data was produced worldwide


In [62]:
from datetime import datetime
my_today = datetime.now()
# %B is the full month name, %d the zero-padded day of the month, %Y the four-digit year
print(f"Today's date is {my_today:%B %d, %Y}")

Today's date is January 05, 2025


### Index lookups

In [63]:
# With .format(), the key inside the placeholder is written without quotes: family[dad]
family = {"dad": "John", "siblings": "Peter"}
print("Is your dad called {family[dad]}?".format(family=family))

Is your dad called John?


In [64]:
# Inside an f-string the braces hold a real Python expression, so family[dad] looks up
# a variable named dad instead of the key "dad".
# Use quotes for index lookups: family["dad"]
# The failing call is left commented out; the traceback it produced is kept in the string below.
# print(f"Is your dad called {family[dad]}?")
"""
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[64], line 2
      1 # Use quotes for index lookups: family["dad"]
----> 2 print(f"Is your dad called {family[dad]}?")

NameError: name 'dad' is not defined
"""

'\n---------------------------------------------------------------------------\nNameError                                 Traceback (most recent call last)\nCell In[64], line 2\n      1 # Use quotes for index lookups: family["dad"]\n----> 2 print(f"Is your dad called {family[dad]}?")\n\nNameError: name \'dad\' is not defined\n'

In [65]:
# Use quotes for index lookups: family["dad"]
# Using a different quote type inside the braces than around the f-string works on every Python version that supports f-strings
print(f"Is your dad called {family['dad']}?")
print(f'Is your dad called {family["dad"]}?')

Is your dad called John?
Is your dad called John?


### Escape sequences

In [66]:
# Backslashes: \" puts a double quote inside a double-quoted string without ending it
print("My dad is called \"John\"")

My dad is called "John"


### Inline operations
- Advantage: evaluate expressions and call functions inline

In [67]:
# The product is computed inside the braces when the f-string is evaluated
my_number = 4
my_multiplier = 7
print(f'{my_number} multiplied by {my_multiplier} is {my_number * my_multiplier}')

4 multiplied by 7 is 28


### Calling functions

In [68]:
def my_function(a, b):
    """Return the result of ``a + b``."""
    total = a + b
    return total


# The function call is evaluated inline, inside the f-string braces
print(f"If you sum up 10 and 20, the result is {my_function(10,20)}")

If you sum up 10 and 20, the result is 30


## Example #1: Literally formatting

In [69]:
# Text fragments (field*) and numbers (fact*) formatted by the f-strings in the cells below
field1 = "sexiest job"
field2 = "data is produced daily"
field3 = "Individuals"
fact1 = 21
fact2 = 2500000000000000000
fact3 = 72.41415415151
fact4 = 1.09

- Complete the f-string to include the variable `field1` with quotes and the variable `fact1` as a digit.

In [70]:
# Complete the f-string
# !r shows field1 with quotes; fact1 is inserted as is, directly followed by "st"
print(f"Data science is considered {field1!r} in the {fact1}st century")

Data science is considered 'sexiest job' in the 21st century


- Complete the f-string to include the variable `fact2` using exponential notation, and the variable `field2`.

In [71]:
# Complete the f-string
# :e renders fact2 in exponential notation (2.500000e+18 in the output below)
print(f"About {fact2:e} of {field2} in the world")

About 2.500000e+18 of data is produced daily in the world


- Complete the f-string to include `field3`, `fact3` rounded to 2 decimals, and `fact4` rounded to one decimal.

In [72]:
# Complete the f-string
# .2f rounds fact3 to two decimals and .1f rounds fact4 to one decimal
print(f"{field3} create around {fact3:.2f}% of the data but only {fact4:.1f}% is analyzed")

Individuals create around 72.41% of the data but only 1.1% is analyzed


## Example #2: Make this function

In [73]:
# Inputs for the inline-expression exercises in the cells below
number1 = 120
number2 = 7
string1 = 'httpswww.regex.com'
list_links = ['www.news.com','www.google.com','www.yahoo.com','www.bbc.com',
              'www.msn.com','www.facebook.com', 'www.news.google.com']

- Inside the f-string, include `number1`, `number2` and the result of dividing `number1` by `number2` rounded to one decimal.

In [74]:
# Include both variables and the result of dividing them 
# The division happens inside the braces and .1f rounds the quotient to one decimal
print(f"{number1} tweets were downloaded in {number2} minutes indicating a speed of {number1/number2:.1f} tweets per min")

120 tweets were downloaded in 7 minutes indicating a speed of 17.1 tweets per min


- Inside the f-string, use `.replace()` to replace the substring `https` with an empty substring in string1.

In [75]:
# Replace the substring https by an empty string
# Single quotes are used inside the braces because the f-string itself is delimited by double quotes
print(f"{string1.replace('https', '')}")

www.regex.com


- Inside the f-string, get `list_links` length, multiply it by 100 and divide it by 120. Round the result to two decimals.

In [76]:
# Multiply the length of the list by 100, divide it by 120 and round the result to two decimals
print(f"Only {len(list_links)*100/120:.2f}% of the posts contain links")

Only 5.83% of the posts contain links


## Example #3: On time

In [78]:
from datetime import datetime
# One dictionary per neighborhood, each holding a sale date and a price
east = {'date': datetime(2007, 4, 20, 0, 0), 'price': 1232443}
west = {'date': datetime(2006, 5, 26, 0, 0), 'price': 1432673}

- Inside the f-string, access the values of the keys `price` and `date` in `east` dictionary. Format the date to `month-day-year`.

In [79]:
# Access values of date and price in east dictionary
# %m-%d-%Y renders the date as month-day-year; the $ before the brace is a literal dollar sign
print(f"The price for a house in the east neighborhood was ${east['price']} in {east['date']:%m-%d-%Y}")

The price for a house in the east neighborhood was $1232443 in 04-20-2007


- Inside the f-string, access the values of the keys `price` and `date` in `west` dictionary. Format the date to `month-day-year`.

In [80]:
# Access values of date and price in west dictionary
# %m-%d-%Y renders the date as month-day-year; the $ before the brace is a literal dollar sign
print(f"The price for a house in the west neighborhood was ${west['price']} in {west['date']:%m-%d-%Y}.")

The price for a house in the west neighborhood was $1432673 in 05-26-2006.


# 4. Template strings

In [1]:
from string import Template

### Substitution

In [3]:
# $identifier marks a placeholder; substitute() fills it from the keyword argument of the same name
my_string = Template('Data science has been called $identifier') # '$' as placeholders or identifiers
my_string.substitute(identifier="sexiest job of the 21st century")

'Data science has been called sexiest job of the 21st century'

In [4]:
# Many $identifier and variables
# The keyword names (title, description) must match the identifiers used in the template
job = "Data science"
name = "sexiest job of the 21st century"
my_string = Template('$title has been called $description')
my_string.substitute(title=job,description=name)

'Data science has been called sexiest job of the 21st century'

In [5]:
# Use ${identifier} when valid identifier characters follow the placeholder name
# The braces separate noun from the "ing" that comes right after it
my_string = Template('I find Python very ${noun}ing but my sister has lost $noun')
my_string.substitute(noun="interest")

'I find Python very interesting but my sister has lost interest'

In [7]:
# Working with numbers, using $$ to escape the dollar sign
# $$ produces a single literal $ in the output, while $price is still substituted
my_string = Template('I paid for the Python course only $$ $price, amazing!')
my_string.substitute(price="12.50")

'I paid for the Python course only $ 12.50, amazing!'

### Safe substitution

In [9]:
# Raise error when placeholder is missing
favorite = dict(flavor="chocolate")
my_string = Template('I love $flavor $cake very much')
# my_string.substitute(favorite) # KeyError: 'cake'
# favorite has no "cake" key, so substitute() raises KeyError; it is caught so the cell still completes
try:
    my_string.substitute(favorite)
except KeyError:
    print("missing information")

missing information


In [10]:
# Safe substitution
# safe_substitute() leaves a placeholder without a value ($cake) unchanged instead of raising KeyError
favorite = dict(flavor="chocolate")
my_string = Template('I love $flavor $cake very much')
my_string.safe_substitute(favorite)

'I love chocolate $cake very much'

## Example #1: Preparing a report

In [11]:
# Three tool names and their matching descriptions, substituted into the report template below
tool1 = 'Natural Language Toolkit'
tool2 = 'TextBlob'
tool3 = 'Gensim'
description1 = 'suite of libraries and programs for symbolic and statistical natural language processing (NLP) for English written in the Python programming language. It was developed by Steven Bird and Edward Loper in the Department of Computer and Information Science at the University of Pennsylvania.'
description2 = 'Python library for processing textual data. It provides a simple API for diving into common natural language processing tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more.'
description3 = 'robust open-source vector space modeling and topic modeling toolkit implemented in Python. It uses NumPy, SciPy and optionally Cython for performance. Gensim is specifically designed to handle large text collections, using data streaming and efficient incremental algorithms, which differentiates it from most other scientific software packages that only target batch and in-memory processing.'

In [12]:
from string import Template

- Complete the template using `$tool` and `$description` identifiers.



In [14]:
# Create a template
# $tool and $description are the identifiers replaced by substitute() in the next cell
wikipedia = Template("$tool is a $description")

- Substitute identifiers with the correct tool and description variables in the template and print out the results.

In [15]:
# Substitute variables in template
# The same template is reused with each tool/description pair
print(wikipedia.substitute(tool=tool1, description=description1))
print(wikipedia.substitute(tool=tool2, description=description2))
print(wikipedia.substitute(tool=tool3, description=description3))

Natural Language Toolkit is a suite of libraries and programs for symbolic and statistical natural language processing (NLP) for English written in the Python programming language. It was developed by Steven Bird and Edward Loper in the Department of Computer and Information Science at the University of Pennsylvania.
TextBlob is a Python library for processing textual data. It provides a simple API for diving into common natural language processing tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more.
Gensim is a robust open-source vector space modeling and topic modeling toolkit implemented in Python. It uses NumPy, SciPy and optionally Cython for performance. Gensim is specifically designed to handle large text collections, using data streaming and efficient incremental algorithms, which differentiates it from most other scientific software packages that only target batch and in-memory processing.


## Example #2: Identifying prices

In [16]:
# Tool name, fee and payment period, in that order (all stored as strings)
tools = ['Natural Language Toolkit', '20', 'month']

- Assign the first, second, and third element of `tools` to the variables `our_tool`, `our_fee` and `our_pay` respectively.

In [17]:
# Import template
from string import Template

# Select variables: first, second and third element of tools, in that order
our_tool, our_fee, our_pay = tools[0], tools[1], tools[2]

- Complete the template string using `$tool`, `$fee`, and `$pay` as identifiers. Add the dollar sign before the `$fee` identifier and add the characters `ly` directly after the `$pay` identifier.

In [20]:
# Create template
# $$ yields a literal dollar sign; ${pay} uses braces so that "ly" can follow the identifier directly
course = Template("We are offering a 3-month beginner course on $tool just for $$ $fee ${pay}ly")

- Substitute identifiers with the three variables you created and print out the results.

In [21]:
# Substitute identifiers with three variables
# Each keyword name matches an identifier in the course template
print(course.substitute(tool=our_tool, fee=our_fee, pay=our_pay))

We are offering a 3-month beginner course on Natural Language Toolkit just for $ 20 monthly


## Example #3: Playing safe

In [22]:
# Only answer1 is provided; there is deliberately no answer2 key
answers = {'answer1': 'I really like the app. But there are some features that can be improved'}

- Complete the template string using `$answer1` and `$answer2` as identifiers.

In [23]:
# Complete template string using identifiers
# The template expects two values: $answer1 and $answer2
the_answers = Template("Check your answer 1: $answer1, and your answer 2: $answer2")

- Use the method `.substitute()` to replace the identifiers with the values in `answers` in the predefined template.

In [24]:
# Use substitute to replace identifiers
# answers has no "answer2" key, so substitute() raises KeyError, which is caught here
try:
    print(the_answers.substitute(answers))
except KeyError:
    print("Missing information")

Missing information


- Use the method `.safe_substitute()` to replace the identifiers with the values in `answers` in the predefined template.

In [25]:
# Use safe_substitute to replace identifiers
# safe_substitute() leaves a placeholder without a value (here $answer2) in place
# instead of raising KeyError, so no try/except is needed around the call.
print(the_answers.safe_substitute(answers))

Check your answer 1: I really like the app. But there are some features that can be improved, and your answer 2: $answer2
