In [1]:
# Import pandas for data manipulation and numpy for numerical operations
import pandas as pd
import numpy as np

In [2]:
# Vectorized operations: one expression is applied to every element of the
# array at once, with no explicit Python loop.
a = np.arange(1, 5)  # the integers 1, 2, 3, 4
4 * a  # the scalar is applied element-wise: each element is multiplied by 4

array([ 4,  8, 12, 16])

In [3]:
# Demonstrate why element-wise string methods break on plain Python lists
# that contain missing values.
s = ['cat', 'mat', None, 'rat']

# None has no 'startswith' method, so the comprehension fails on the third
# element. The exception is the point of this cell, so it is caught and
# printed: the failure is still shown, but it no longer aborts a
# "Restart Kernel -> Run All" of the notebook.
try:
    [i.startswith('c') for i in s]
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'NoneType' object has no attribute 'startswith'

In [4]:
# How does pandas solve this?

# The same values, now held in a Series; the None entry is a missing value.
s = pd.Series(['cat', 'mat', None, 'rat'])

# The .str accessor applies the string method element by element and copes
# with the missing entry instead of raising, so no explicit loop or None
# check is needed.
s.str.startswith(pat='c')

0     True
1    False
2     None
3    False
dtype: object

In [5]:
# Load the Titanic dataset from the project-relative CSV file
df = pd.read_csv(filepath_or_buffer='DataSets/titanic_expense/titanic.csv')

# Show the passenger names; the string examples below all work on this column
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [None]:
# Common string functions available through the .str accessor.
# Only the last expression of the cell is rendered; the earlier expressions
# are evaluated and their results discarded.

# --- lower / upper / capitalize / title ---
# Upper-case every name
df['Name'].str.upper()
# Capitalize the first character of each name
df['Name'].str.capitalize()
# Title case: applied to every word of each name
df['Name'].str.title()

# --- len ---
# Pick out the name whose length is exactly 82 characters
df.loc[df['Name'].str.len() == 82, 'Name'].values[0]

# --- strip ---
# Plain Python: trim leading and trailing spaces from a single string
"                   Shyam                             ".strip()
# pandas: trim leading and trailing spaces from every name in the column
df['Name'].str.strip()

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [7]:
# The text before the first comma in 'Name' is stored as the last name:
# split each name on ',' and keep element 0 of the resulting list.
df['lastname'] = df['Name'].str.split(',').str[0]

# Preview the first rows, now including the new 'lastname' column
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Allen


In [None]:
# Derive 'title' and 'firstname' from the part of 'Name' after the comma.
# NOTE(review): splitting on the first space assumes a one-word title; the
# counts below include a 'the' entry, which suggests at least one name does
# not fit that assumption - confirm before relying on 'title'.
df[['title', 'firstname']] = (
    df['Name']
    .str.split(',')                    # [lastname, rest]
    .str.get(1)                        # keep the part after the comma
    .str.strip()                       # drop the space that follows the comma
    .str.split(' ', n=1, expand=True)  # n=1: split once; expand=True: return two columns
)
# Preview the updated DataFrame (not rendered: it is not the cell's last expression)
df.head()

# How often does each title occur?
df['title'].value_counts()

title
Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Mlle.          2
Major.         2
Col.           2
the            1
Capt.          1
Ms.            1
Sir.           1
Lady.          1
Mme.           1
Don.           1
Jonkheer.      1
Name: count, dtype: int64

In [9]:
# Normalize equivalent titles so they are counted together: 'Ms.' and
# 'Mlle.' are both mapped to 'Miss.'.
# regex=False makes the match literal. The '.' in each pattern is a regex
# metacharacter, and the default value of `regex` differs between pandas
# versions, so being explicit keeps the result the same everywhere.
df['title'] = df['title'].str.replace('Ms.', 'Miss.', regex=False)
df['title'] = df['title'].str.replace('Mlle.', 'Miss.', regex=False)

In [10]:
# Recount the titles after the replacement above, to check that the 'Ms.' and
# 'Mlle.' rows are now counted under 'Miss.'
df['title'].value_counts()

title
Mr.          517
Miss.        185
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Major.         2
Col.           2
Don.           1
Mme.           1
Lady.          1
Sir.           1
Capt.          1
the            1
Jonkheer.      1
Name: count, dtype: int64

In [11]:
# Filtering rows with boolean string predicates.
# Only the last expression of the cell is rendered.

# --- startswith / endswith ---
# Rows whose firstname ends with a capital 'A'
df.loc[df['firstname'].str.endswith('A')]

# --- isdigit / isalpha / ... ---
# Rows whose firstname consists only of digits (unlikely in this dataset)
df.loc[df['firstname'].str.isdigit()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname,title,firstname


In [None]:
# Substring and regular-expression matching with .str.contains.
# Only the last expression of the cell is rendered.

# Rows whose firstname contains 'john', ignoring case
df.loc[df['firstname'].str.contains('john', case=False)]

# Rows whose lastname both starts and ends with a non-vowel character:
#   ^[^aeiouAEIOU]  first character is not a vowel
#   .+              at least one character in between
#   [^aeiouAEIOU]$  last character is not a vowel
df.loc[df['lastname'].str.contains('^[^aeiouAEIOU].+[^aeiouAEIOU]$')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname,title,firstname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Braund,Mr.,Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs.,John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Heikkinen,Miss.,Laina
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Moran,Mr.,James
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,McCarthy,Mr.,Timothy J
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,Sutehall,Mr.,Henry Jr
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Graham,Miss.,Margaret Edith
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Johnston,Miss.,"Catherine Helen ""Carrie"""
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Behr,Mr.,Karl Howell


In [None]:
# Reverse every string in the 'Name' column: a slice with step -1 walks each
# string from its last character to its first.
df['Name'].str.slice(step=-1)

0                                sirraH newO .rM ,dnuarB
1      )reyahT sggirB ecnerolF( yeldarB nhoJ .srM ,sg...
2                                 aniaL .ssiM ,nenikkieH
3           )leeP yaM yliL( htaeH seuqcaJ .srM ,ellertuF
4                               yrneH mailliW .rM ,nellA
                             ...                        
886                                sazouJ .veR ,alivtnoM
887                         htidE teragraM .ssiM ,maharG
888             "eirraC" neleH enirehtaC .ssiM ,notsnhoJ
889                                llewoH lraK .rM ,rheB
890                                  kcirtaP .rM ,yelooD
Name: Name, Length: 891, dtype: object

In [None]:
# (This cell is intentionally left blank for future code or notes)