In [1]:
import pandas as pd
import re

In [2]:
# Read in CSV and convert to dataframe
df = pd.read_csv('ChemBook_final_phone.csv')
df = df.drop('Unnamed: 1', axis=1)

In [3]:
# Processing functions for different scenarios where phone numbers deviate from the standard phone
def fix_scientific_notation(phone):
    try:
        if 'E' in str(phone) or 'e' in str(phone):
            phone = '{:.0f}'.format(float(phone))
    except ValueError:
        pass
    return phone

def standardize_phone_number(phone):
    phone = fix_scientific_notation(phone)
    phone = re.sub(r'[^\d+]', '', str(phone))
    if not phone.startswith('+86'):
        if phone.startswith('86'):
            phone = '+' + phone
        elif not phone.startswith('+'):
            phone = '+86' + phone
    if len(phone) > 3 and len(phone[3:]) < 11:
        phone = phone[:3] + phone[3:].zfill(11)
    return phone

def handle_multiple_phone_numbers(phone):
    phone_numbers = re.split(r'[\s,/-]+', phone)
    standardized_numbers = [standardize_phone_number(num) for num in phone_numbers if num.strip()]
    return ', '.join(standardized_numbers)

def remove_standalone_country_code(phone_list):
    phone_numbers = phone_list.split(', ')
    phone_numbers = [num for num in phone_numbers if num != '+86']
    return ', '.join(phone_numbers)

def ensure_single_plus(phone_list):
    phone_numbers = phone_list.split(', ')
    cleaned_numbers = ['+' + re.sub(r'[^\d]', '', num) if not num.startswith('+') else num for num in phone_numbers]
    return ', '.join(cleaned_numbers)

def remove_single_plus(phone_list):
    phone_numbers = phone_list.split(', ')
    cleaned_numbers = [num for num in phone_numbers if num != '+']
    return ', '.join(cleaned_numbers)

def clean_extra_plus_symbols(phone_list):
    phone_numbers = phone_list.split(', ')
    cleaned_numbers = ['+' + re.sub(r'[^\d]', '', num) for num in phone_numbers if num]
    return ', '.join(cleaned_numbers)

def flag_incomplete_numbers(phone):
    if phone == '+86' or len(phone) <= 3:
        return 'Incomplete'
    return 'Complete'

In [4]:
# Create column with cleaned phone numbers and add it to the dataframe
df['Cleaned Phone Number (s)'] = (
    df['Phone']
    .apply(handle_multiple_phone_numbers)          
    .apply(remove_standalone_country_code)         
    .apply(ensure_single_plus)                     
    .apply(remove_single_plus)                     
    .apply(clean_extra_plus_symbols)               
)

In [5]:
# Creating a column called Flag that shows if the number is complete or incomplete
df['Flag'] = df['Cleaned Phone Number (s)'].apply(flag_incomplete_numbers)

In [6]:
# Display dataframe with cleaned phone numbers
df 

Unnamed: 0,ID,Phone,Cleaned Phone Number (s),Flag
0,CB_Phone_ID_1,8.62E+12,+8620000000000,Complete
1,CB_Phone_ID_2,+86+8615028179902 15028179902,"+868615028179902, +8615028179902",Complete
2,CB_Phone_ID_3,+86-19937530512 0371-55170693,"+8619937530512, +8600000000371, +8600055170693",Complete
3,CB_Phone_ID_4,+undefined 21-51877795,"+8600000000021, +8600051877795",Complete
4,CB_Phone_ID_5,+86 0371-86658258,"+8600000000371, +8600000658258",Complete
...,...,...,...,...
1864,CB_Phone_ID_1865,029-63685358,"+8600000000029, +8600063685358",Complete
1865,CB_Phone_ID_1866,13564518121 021-58446131,"+8613564518121, +8600000000021, +8600058446131",Complete
1866,CB_Phone_ID_1867,8.61E+12,+8610000000000,Complete
1867,CB_Phone_ID_1868,027-88877052,"+8600000000027, +8600088877052",Complete


In [7]:
# Save cleaned dataframe to csv
df.to_csv('ChemBook_final_phone_cleaned.csv', index=False)