In [1]:
import pandas as pd
with open("customer_list.csv", "r") as file:
    data = file.read()

In [2]:
rows = data.strip().split("\n")

In [3]:
columns = rows[0].split("|")
data_rows = [row.split("|") for row in rows[1:]]

In [4]:
customer_data_split = pd.DataFrame(data_rows, columns=columns)

In [5]:
customer_data_split.head()

Unnamed: 0,ï»¿cust_id,date,time,name,email,phone,sms-opt-out
0,1,2023-03-15,08:45:12,Rachel,rachel@centralperk.coffee,212-555-1001,N
1,2,2023-05-22,12:30:45,R. Geller,rossg@centralperk.coffee,212-555-1002,N
2,3,2023-07-09,18:15:27,Monica Geller,chefmonica@centralperk.coffee,212-555-1003,N
3,4,2023-09-01,21:05:33,Chandler Bing,chandlerb@centralperk.coffee,212-555-1004,Y
4,5,2023-11-18,14:22:10,Joey,howyoudoing@centralperk.coffee,212-555-1005,N


In [6]:
customer_data_split.rename(columns={"ï»¿cust_id": "cust_id", "sms-opt-out ": "sms_opt_out"}, inplace=True) 

# Renames cust_id and sms_opt_out columns accordingly

print(customer_data_split.columns) 

Index(['cust_id', 'date', 'time', 'name', 'email', 'phone', 'sms_opt_out'], dtype='object')


In [7]:
customer_data_split.loc[customer_data_split['sms_opt_out'].isnull(), 'sms_opt_out'] = "N" #missing values are replaced with "N" in the sms_opt_out

In [8]:
customer_data_split.head()

Unnamed: 0,cust_id,date,time,name,email,phone,sms_opt_out
0,1,2023-03-15,08:45:12,Rachel,rachel@centralperk.coffee,212-555-1001,N
1,2,2023-05-22,12:30:45,R. Geller,rossg@centralperk.coffee,212-555-1002,N
2,3,2023-07-09,18:15:27,Monica Geller,chefmonica@centralperk.coffee,212-555-1003,N
3,4,2023-09-01,21:05:33,Chandler Bing,chandlerb@centralperk.coffee,212-555-1004,Y
4,5,2023-11-18,14:22:10,Joey,howyoudoing@centralperk.coffee,212-555-1005,N


In [9]:
customer_data_split['phone'] = customer_data_split['phone'].apply(
    lambda x: ''.join(filter(str.isdigit, x))[-10:] if x else None  # Remove non-numeric characters and keep last 10 digits
)
customer_data_split['phone'] = customer_data_split['phone'].apply(
    lambda x: f"{x[:3]}-{x[3:6]}-{x[6:]}" if x and len(x) == 10 else None  # Format only if x has exactly 10 digits
)

In [10]:
customer_data_split['email'] = customer_data_split['email'].str.lower().str.strip() #Convert email to lowercase and strip any whitespace

In [11]:
customer_data_split['date'] = pd.to_datetime(customer_data_split['date'], errors='coerce') # # Convert date column to a consistent datetime format

In [12]:
customer_data_split['name'] = customer_data_split['name'].str.replace(r"[^a-zA-Z-.' ]", '', regex=True).str.title().str.strip()
#Removes characters that are not letters, hyphens, or periods

In [13]:
customer_data_split.drop_duplicates(subset=['cust_id', 'email'], inplace=True) # Removes duplicates based on name and email

In [14]:
customer_data_split.to_csv("cleaned_customer_data.csv", index=False)