In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_excel('../data/Customer Call List.xlsx')
df.head(3)

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True


In [3]:
# df.dtypes
df.shape

(21, 8)

### Data Format Issues:

- Phone_Number: The format is inconsistent with a mix of hyphens, slashes, and pipes as delimiters.
- Address: Some addresses lack zip codes or have state information embedded within the address field.
- Paying Customer: Values are inconsistent, using both "Yes/No" and "Y/N" variations.
- Do_Not_Contact: Similar to "Paying Customer," uses both "Yes/No" and "Y/N" variations. Also, some entries are blank, which may need to be treated as a separate category or standardized.
- Not_Useful_Column: This column seems irrelevant and can be removed.

### Data Quality Issues:
- Duplicate Entry: CustomerID 1020 appears twice with identical information.
- Missing Values: Several fields have missing values, notably in Last_Name for customer 1003 and Phone_Number for customer 1007.
- Inconsistent Naming: First and last names sometimes include extra spaces (e.g., Jeff Winger, Michael Scott).

### Recommendations:
#### 1. Data Cleaning:
- Standardize phone number format.
- Separate state and zip code in the address field.
- Unify "Paying Customer" and "Do_Not_Contact" values to either "Yes/No" or "Y/N".
- Address missing values through further investigation or appropriate imputation methods.
- Remove extra spaces from names and ensure consistent capitalization.
- Remove duplicate entry for customer 1020.
- Delete the "Not_Useful_Column".

#### 2. Data Validation:
- Implement data validation rules to prevent future inconsistencies.
- For instance, enforce format restrictions on phone numbers and ensure mandatory fields are filled.

#### Additional Notes:
- Consider the purpose of the data and how it will be used. This will guide further cleaning and analysis steps.
- Explore potential relationships between variables (e.g., Paying Customer and Do_Not_Contact).
- Depending on the intended use, you might explore data enrichment by appending additional information such as email addresses or demographic data.

By addressing these issues, you can improve the quality and reliability of the customer data for better decision-making and analysis.

In [4]:
# # Steps
# remove duplicates
# drop columns
# standardize the data
# Null values or blank values
# remove any columns

In [5]:
# create a copy of the dataframe
df_copy = df.copy()
df_copy.head(3)

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True


In [6]:
# check for duplicates and show the rows
df_copy[df_copy.duplicated()]

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
20,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,N,True


In [7]:
# remove duplicates
df_copy.drop_duplicates(inplace=True)

# preview df
df_copy.head(3)

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True


In [8]:
# drop columns
df_copy.drop(columns= 'Not_Useful_Column')

# preview df
df_copy

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True
6,1007,Jeff,Winger,,1209 South Street,No,No,False
7,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No,False
8,1009,Gandalf,,N/a,123 Middle Earth,Yes,,False
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True


In [9]:
# standardize the data "Last_Name" by removing white space
# df_copy["Last_Name"].str.strip() # remove white space by default
# df_copy["Last_Name"] = df_copy["Last_Name"].str.lstrip("...")
# df_copy["Last_Name"] = df_copy["Last_Name"].str.lstrip("/") 
# df_copy["Last_Name"] = df_copy["Last_Name"].str.rstrip("_")

df_copy['Last_Name'] = df_copy['Last_Name'].str.strip("123._/")
df_copy

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True
6,1007,Jeff,Winger,,1209 South Street,No,No,False
7,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No,False
8,1009,Gandalf,,N/a,123 Middle Earth,Yes,,False
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True


In [10]:
df_copy['Phone_Number']

0     123-545-5421
1     123/643/9775
2       7066950392
3     123-543-2345
4     876|678|3469
5     304-762-2467
6              NaN
7     876|678|3469
8              N/a
9     123-545-5421
10             NaN
11      7066950392
12    123-543-2345
13    876|678|3469
14    304-762-2467
15    123-545-5421
16    123/643/9775
17      7066950392
18             N/a
19    876|678|3469
Name: Phone_Number, dtype: object

In [11]:
# Remove non-numeric characters from the phone numbers
df_copy['Phone_Number'] = df_copy['Phone_Number'].str.replace(r'[^0-9]+', '', regex=True)

# Add hyphens to format the phone numbers as XXX-XXX-XXXX
df_copy['Phone_Number'] = df_copy['Phone_Number'].str.replace(r'(\d{3})(\d{3})(\d{4})', r'\1-\2-\3')

# Display the standardized phone numbers
print(df_copy['Phone_Number'])

0     1235455421
1     1236439775
2            NaN
3     1235432345
4     8766783469
5     3047622467
6            NaN
7     8766783469
8               
9     1235455421
10           NaN
11           NaN
12    1235432345
13    8766783469
14    3047622467
15    1235455421
16    1236439775
17           NaN
18              
19    8766783469
Name: Phone_Number, dtype: object
