## 4. Text Data Cleaning (Beyond Basics)
- Lowercasing, stripping spaces, and removing special characters using regex
- Handling unicode, emojis, and symbols
- Splitting & extracting features using regex (str.extract, str.contains)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read data/cleaned_dataset_1.csv into the working DataFrame `ds`.
ds = pd.read_csv(filepath_or_buffer="data/cleaned_dataset_1.csv")

In [None]:
# Preview 10 randomly chosen rows. The fixed seed makes the preview
# identical on every Restart & Run All instead of reshuffling each run.
ds.sample(n=10, random_state=42)


Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
6,1015,Toby,Flenderson_,304-762-2467,214 HR Avenue,No,No
2,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,No,No
9,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,No
3,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No
7,1016,Ron,Weasley,123-545-5421,2395 Hogwarts Avenue,No,No
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No
10,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,No
8,1017,Michael,Scott,123/643/9775,"121 Paper Avenue, Pennsylvania",Yes,No
4,1013,Don,Draper,123-543-2345,2039 Main Street,Yes,No
5,1014,Leslie,Knope,876|678|3469,343 City Parkway,Yes,No


In [None]:
# Trim leading/trailing underscores from surnames (e.g. "Flenderson_" -> "Flenderson").
ds["Last_Name"] = ds["Last_Name"].str.strip(to_strip="_")

In [None]:
# Keep only the digits of each phone number so the different separator
# styles in the data ("-", "|", "/") collapse to one digits-only form.
# The pattern is a raw string: in a normal string literal "\D" is an invalid
# escape sequence and triggers a warning on modern Python versions.
ds["Phone_Number"] = ds["Phone_Number"].str.replace(r"\D", "", regex=True)
# Fixed seed so the preview is reproducible on re-run.
ds.sample(n=10, random_state=42)

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
2,1008,Sherlock,Holmes,8766783469,98 Clue Drive,No,No
7,1016,Ron,Weasley,1235455421,2395 Hogwarts Avenue,No,No
6,1015,Toby,Flenderson,3047622467,214 HR Avenue,No,No
8,1017,Michael,Scott,1236439775,"121 Paper Avenue, Pennsylvania",Yes,No
5,1014,Leslie,Knope,8766783469,343 City Parkway,Yes,No
10,1020,Anakin,Skywalker,8766783469,"910 Tatooine Road, Tatooine",Yes,No
1,1005,Jon,Snow,8766783469,123 Dragons Road,Yes,No
0,1001,Frodo,Baggins,1235455421,"123 Shire Lane, Shire",Yes,No
4,1013,Don,Draper,1235432345,2039 Main Street,Yes,No
3,1010,Peter,Parker,1235455421,"25th Main Street, New York",Yes,No


In [None]:
# Remove every digit from the address text. The pattern is a raw string
# because "\d" in a normal string literal is an invalid escape sequence.
# NOTE(review): this also strips digits inside ordinals
# ("25th Main Street" -> "th Main Street") and leaves behind the whitespace
# that followed the house number - confirm that is the intended result.
ds["Address"] = ds["Address"].str.replace(r"\d", "", regex=True)
# Fixed seed so the preview is reproducible on re-run.
ds.sample(n=10, random_state=42)

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
1,1005,Jon,Snow,8766783469,Dragons Road,Yes,No
6,1015,Toby,Flenderson,3047622467,HR Avenue,No,No
7,1016,Ron,Weasley,1235455421,Hogwarts Avenue,No,No
3,1010,Peter,Parker,1235455421,"th Main Street, New York",Yes,No
10,1020,Anakin,Skywalker,8766783469,"Tatooine Road, Tatooine",Yes,No
5,1014,Leslie,Knope,8766783469,City Parkway,Yes,No
2,1008,Sherlock,Holmes,8766783469,Clue Drive,No,No
9,1020,Anakin,Skywalker,8766783469,"Tatooine Road, Tatooine",Yes,No
0,1001,Frodo,Baggins,1235455421,"Shire Lane, Shire",Yes,No
4,1013,Don,Draper,1235432345,Main Street,Yes,No


In [None]:
# Display the first names in upper case; the result is not written back to `ds`.
# NOTE(review): the rendered output shows a trailing space on one value
# ("MICHAEL "), so First_Name appears not to be whitespace-stripped - verify.
ds.loc[:, "First_Name"].str.upper()

0        FRODO
1          JON
2     SHERLOCK
3        PETER
4          DON
5       LESLIE
6         TOBY
7          RON
8     MICHAEL 
9       ANAKIN
10      ANAKIN
Name: First_Name, dtype: object

In [None]:
# Display the first names in lower case; the result is not written back to `ds`.
ds.loc[:, "First_Name"].str.lower()

0        frodo
1          jon
2     sherlock
3        peter
4          don
5       leslie
6         toby
7          ron
8     michael 
9       anakin
10      anakin
Name: First_Name, dtype: object

In [None]:
# Split each address on commas into a list of parts (display only).
# The parts keep whatever whitespace surrounded the comma.
ds["Address"].str.split(pat=",")

0              [ Shire Lane,  Shire]
1                    [ Dragons Road]
2                      [ Clue Drive]
3        [th Main Street,  New York]
4                     [ Main Street]
5                    [ City Parkway]
6                       [ HR Avenue]
7                 [ Hogwarts Avenue]
8     [ Paper Avenue,  Pennsylvania]
9        [ Tatooine Road,  Tatooine]
10       [ Tatooine Road,  Tatooine]
Name: Address, dtype: object

In [None]:
# Select the rows whose first name matches the regex "on" anywhere in the value.
ds.loc[ds["First_Name"].str.contains(r"on", regex=True)]

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
1,1005,Jon,Snow,8766783469,Dragons Road,Yes,No
4,1013,Don,Draper,1235432345,Main Street,Yes,No
7,1016,Ron,Weasley,1235455421,Hogwarts Avenue,No,No


In [None]:
# Select the rows whose phone number begins with "8".
# Uses the .str accessor, so Phone_Number must still hold strings here.
ds.loc[ds["Phone_Number"].str.startswith("8")]

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
1,1005,Jon,Snow,8766783469,Dragons Road,Yes,No
2,1008,Sherlock,Holmes,8766783469,Clue Drive,No,No
5,1014,Leslie,Knope,8766783469,City Parkway,Yes,No
9,1020,Anakin,Skywalker,8766783469,"Tatooine Road, Tatooine",Yes,No
10,1020,Anakin,Skywalker,8766783469,"Tatooine Road, Tatooine",Yes,No


In [None]:
# Show the first five rows of the frame.
ds.head(n=5)

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,1235455421,"Shire Lane, Shire",Yes,No
1,1005,Jon,Snow,8766783469,Dragons Road,Yes,No
2,1008,Sherlock,Holmes,8766783469,Clue Drive,No,No
3,1010,Peter,Parker,1235455421,"th Main Street, New York",Yes,No
4,1013,Don,Draper,1235432345,Main Street,Yes,No


In [None]:
# Capture, per address, the first run of word characters that is
# immediately followed by a whitespace character (display only).
ds["Address"].str.extract(pat=r"(\w+)\s", expand=True)

Unnamed: 0,0
0,Shire
1,Dragons
2,Clue
3,th
4,Main
5,City
6,HR
7,Hogwarts
8,Paper
9,Tatooine


In [None]:
# Convert the digits-only phone strings to the smallest integer dtype that
# fits; values that cannot be parsed become NaN instead of raising.
# NOTE(review): an integer representation drops any leading zeros and the
# .str accessor no longer works on this column afterwards - confirm that a
# numeric phone column is really wanted.
ds["Phone_Number"] = pd.to_numeric(
    ds["Phone_Number"],
    downcast="integer",
    errors="coerce",
)

In [None]:
# Print the column dtypes, non-null counts and memory usage of `ds`.
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CustomerID       11 non-null     int64 
 1   First_Name       11 non-null     object
 2   Last_Name        11 non-null     object
 3   Phone_Number     11 non-null     int64 
 4   Address          11 non-null     object
 5   Paying Customer  11 non-null     object
 6   Do_Not_Contact   11 non-null     object
dtypes: int64(2), object(5)
memory usage: 744.0+ bytes
