#String Operations with .str

In [2]:
import pandas as pd

In [4]:
df = pd.read_csv("data_cleaning_sample.csv")

In [5]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


In [7]:
df["Name"].str.lower() # convert all names into lowercase

0      alice
1    charlie
2        bob
3    charlie
4      david
5        NaN
6      alice
7      alice
8    charlie
Name: Name, dtype: object

In [8]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


In [10]:
# checks if delhi is in the city name, case-insenitive
df["City"].str.contains("delhi",case=False)

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7    False
8     True
Name: City, dtype: bool

In [12]:
# outputs a pandas series where each element is a list of strings(the split parts)
# this is where a python list comes into play, but the outer object is still a pandas series
df["Email"].str.split("@")

0    [alice, example.com]
1      [charlie, example]
2      [bob, example.com]
3      [charlie, example]
4    [david, example.com]
5       [eve, domain.com]
6    [alice, example.com]
7    [alice, example.com]
8      [charlie, example]
Name: Email, dtype: object

In [13]:
type(df["Email"].str.split("@")[0])

list

#Type Conversions with .astype()

In [15]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


In [16]:
df2 = df.dropna().copy()

In [20]:
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25,New York,F,alice@example.com,01-05-2021
2,Bob,30,Los Angeles,M,bob@example.com,15-06-2020
4,David,22,Mumbai,M,david@example.com,12-11-2019
6,Alice,25,New York,F,alice@example.com,01-05-2021
7,Alice,25,New York,F,alice@example.com,01-05-2021


In [18]:
df2["Age"] = df2["Age"].astype(int)

In [19]:
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25,New York,F,alice@example.com,01-05-2021
2,Bob,30,Los Angeles,M,bob@example.com,15-06-2020
4,David,22,Mumbai,M,david@example.com,12-11-2019
6,Alice,25,New York,F,alice@example.com,01-05-2021
7,Alice,25,New York,F,alice@example.com,01-05-2021


In [28]:
df2.dtypes

Name                 object
Age                   int64
City                 object
Gender               object
Email                object
Join Date    datetime64[ns]
dtype: object

#Why is pd.to_datetime() special?

Unlike astype(), which works on simple data types (like integers, strings, etc.), pd.to_datetime() is designed to:

Handle different date formats (e.g., "YYYY-MM-DD", "MM/DD/YYYY", etc.).

Handle mixed types (e.g., some date strings, some NaT, or missing values).

Convert integer timestamps (e.g., UNIX time) into datetime objects.

Recognize timezones if provided.

In [29]:
df2["Join Date"] = pd.to_datetime(df2["Join Date"])

In [30]:
df2.dtypes

Name                 object
Age                   int64
City                 object
Gender               object
Email                object
Join Date    datetime64[ns]
dtype: object

#Applying Functions

.apply() → Apply any function to rows or columns

.map() → Element-wise mapping for Series

.replace() → Replace specific values


In [44]:
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25,New York,F,alice@example.com,2021-01-05
2,Bob,30,Los Angeles,M,bob@example.com,2020-06-15
4,David,22,Mumbai,M,david@example.com,2019-12-11
6,Alice,25,New York,F,alice@example.com,2021-01-05
7,Alice,25,New York,F,alice@example.com,2021-01-05


In [55]:
#df2["Age Group"] = df2["Age"].apply(lambda x:"Adult" if x>=23 else "minor")
# or
def isminor(x):
    return "Adult" if x>=23 else "Minor"
df2["Age Group"] = df2["Age"].apply(isminor)

In [56]:
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25,New York,Female,alice@example.com,2021-01-05,Adult
2,Bob,30,Los Angeles,Male,bob@example.com,2020-06-15,Adult
4,David,22,Mumbai,Male,david@example.com,2019-12-11,Minor
6,Alice,25,New York,Female,alice@example.com,2021-01-05,Adult
7,Alice,25,New York,Female,alice@example.com,2021-01-05,Adult


In [57]:
gender_map = {"M":"Male","F":"Female"}

df2["Gender"] = df2["Gender"].map(gender_map)

In [58]:
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25,New York,,alice@example.com,2021-01-05,Adult
2,Bob,30,Los Angeles,,bob@example.com,2020-06-15,Adult
4,David,22,Mumbai,,david@example.com,2019-12-11,Minor
6,Alice,25,New York,,alice@example.com,2021-01-05,Adult
7,Alice,25,New York,,alice@example.com,2021-01-05,Adult


In [59]:
df2["City"].replace({"Los Angeles":"LA","Mumbai":"Mum"})

0    New York
2          LA
4         Mum
6    New York
7    New York
Name: City, dtype: object