In [1]:
## Import everything ##

In [3]:
import pandas as pd
import re
import numpy as np
import os

In [4]:
# Lift pandas' display truncation: passing None means no limit on the rows / columns shown
# when a DataFrame or Series is printed or displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [47]:
## Import our Main file and the master SCP list from Mehmet ##
# main_df: the client export that every cell below cleans in place.
# scp_df: the 'State, Country, Phone Codes' lookup workbook.
# NOTE(review): the chained assignment also binds the lookup frame to the name `df`; no visible cell
# reads `df`, so the alias looks accidental -- confirm before removing it.
# NOTE(review): with read_csv's default NA handling the literal text 'NA' is loaded as NaN and all-numeric
# columns lose leading zeros -- confirm whether dtype=str / keep_default_na=False is wanted here.
main_df = pd.read_csv('C:/Users/Desktop/111023_All MSN Clients (excl. those with portal IDs).csv', low_memory = False)
scp_df = df = pd.read_excel('C:/Users/Desktop/Work/Platform Migration/Data Cleaning/State, Country, Phone Codes.xlsx')

In [48]:
## REMOVE INSTRUCTIONS AND CREATE DATE; ADD RESIDENTIAL_ADDRESS, LOADING_DOCK AND GROUP_NAME; RENAME ID ##
# ('accountno' is kept: it is part of the column order requested further down.)

# errors='ignore' keeps this cell safe to re-run: on a second run the two columns are already gone and a
# plain drop would raise KeyError.
main_df.drop(columns=['instructions', 'create_date'], inplace=True, errors='ignore')

# New columns required by the target layout, filled with a constant default for every row.
main_df['Residential_Address'] = 'No'
main_df['Loading_Dock'] = 'No'
main_df['Group_Name'] = ''

# rename() ignores keys that are not present, so this line is already safe to re-run.
main_df.rename(columns={'id': 'Portal_ID'}, inplace=True)


In [49]:
## EMAIL ADDRESS CLEAN ##
def process_email(email_address):
    """Return the first e-mail address found in a contact-email cell.

    A cell may hold several addresses separated by commas, semicolons or line breaks.
    The first non-empty entry is returned with surrounding whitespace removed.

    Parameters
    ----------
    email_address : any
        Raw cell value; NaN / None is treated as "no address".

    Returns
    -------
    str
        The first address, or '' when the cell is missing or holds no address.
    """
    if pd.isna(email_address):
        return ''

    # Split first and strip each piece afterwards: stripping only the whole cell would leave
    # "a@b.com , c@d.com" as "a@b.com " (trailing space).
    for candidate in re.split(r"[,;\r\n]", str(email_address)):
        candidate = candidate.strip()
        if candidate:
            return candidate

    return ''

# Series.apply calls process_email once per cell; the cleaned values overwrite the column.
main_df['Contact_Email'] = main_df['Contact_Email'].apply(process_email)

'''
Something to learn: you originally wrote "email_address = email_address.str.strip()", borrowing the Series syntax 
"df['Column'] = df['Column'].str.strip()". The difference is that inside process_email, 'email_address' is a single Python string, whereas 
df['Column'] is a Series. The '.str' accessor exists only on a Series: it tells pandas to apply the string method to every value in the Series.
A plain string has no '.str' attribute, so inside the function you call the string method directly: email_address.strip().
'''

In [50]:
## CONTACT NAME BLANK HANDLING ##

def blank_name(row):
    """Return the contact name for a row, falling back to the company name.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Contact_Name' and 'Company_Name'.

    Returns
    -------
    str
        The contact name when present, otherwise the company name, otherwise ''
        (never the literal text 'nan').
    """
    def _is_missing(value):
        # NaN / None, or the text 'nan' that str() produces from a NaN.
        return bool(pd.isna(value)) or str(value) == 'nan'

    contact_name = row['Contact_Name']
    company_name = row['Company_Name']

    if not _is_missing(contact_name):
        return str(contact_name)
    if not _is_missing(company_name):
        return str(company_name)

    # Both are missing: return a real blank so 'nan' does not end up in the export.
    return ''

# axis=1 passes each row to blank_name, because it needs two columns ('Contact_Name' and 'Company_Name') at once.
main_df['Contact_Name'] = main_df.apply(blank_name, axis=1)


In [51]:
## ADDRESS CONCATENATION ##

'''
This function takes a whole 'row' because it needs three columns at once. Calling main_df.apply(..., axis=1) hands the function one row at a 
time (as a Series), whereas the email clean only needs a single column, so Series.apply hands that function one value at a time.
'''
def address_concat(row):
    """Join the extra address lines of a row into one comma-separated string.

    Reads 'Address_Line_2', 'address3' and 'address4' (in that order), trims each value,
    skips blanks and the text 'nan' (what str() yields for a missing value), and joins the
    remaining parts with ', '. Returns '' when nothing is left.
    """
    kept_parts = []
    for column in ('Address_Line_2', 'address3', 'address4'):
        text = str(row[column]).strip()
        # An empty string or the stringified NaN means "no value" for this line.
        if text and text != 'nan':
            kept_parts.append(text)

    return ', '.join(kept_parts)

# Apply the address_concat function to overwrite the 'Address_Line_2' column
# (axis=1: one call per row, because the function reads three columns).
main_df['Address_Line_2'] = main_df.apply(address_concat, axis=1)

# Drop the columns that have now been merged into 'Address_Line_2'.
# NOTE(review): this cell cannot be re-run as-is -- address_concat reads 'address3'/'address4',
# which no longer exist after this drop.
main_df.drop(columns=['address3', 'address4'], inplace=True)


In [52]:
## PHONE NUMBER CLEAN ##
def remove_non_numeric_chars(phone_number):
    """Reduce a phone-number cell to its digits.

    Parameters
    ----------
    phone_number : any
        Raw cell value (text or number); NaN / None is treated as blank.

    Returns
    -------
    str
        Only the characters 0-9 from the input, or '' for a missing value.
    """
    if pd.isna(phone_number):
        return ''

    # A numeric column that contains blanks is loaded as float, so 5551234567 arrives as
    # 5551234567.0; converting to int first stops the '.0' from adding an extra digit.
    if isinstance(phone_number, float) and phone_number.is_integer():
        phone_number = int(phone_number)

    # Drop everything that is not a digit (punctuation, spaces, letters, '/', '#', ...).
    return re.sub(r"[^0-9]", "", str(phone_number))

# Clean every phone number; missing values become '' so later cells can treat the column as text.
main_df['Phone_Number'] = main_df['Phone_Number'].apply(remove_non_numeric_chars)

In [53]:
## INTERNATIONAL DIAL CODE CLEAN ##
'''
First, sanitize the Dial field in our scp file so it can match the phone numbers in main
'''

# Create a copy of Dial so we can come back to it later.
# Only copy once: this cell overwrites 'Dial' with digits-only values further down, so re-running it
# without the guard would replace the original prefixes in 'Prefix' with the sanitized ones.
if 'Prefix' not in scp_df.columns:
    scp_df['Prefix'] = scp_df['Dial']

def process_dial_code(dial_code):
    """Reduce a dial-code cell to the digits of its first code.

    Parameters
    ----------
    dial_code : any
        Raw 'Dial' value; may list several codes separated by commas.

    Returns
    -------
    str
        Digits of the first listed code; '' when the cell holds no digits (including NaN).
    """
    # An integral float such as 44.0 would otherwise stringify to '44.0' and become '440'.
    if isinstance(dial_code, float) and dial_code.is_integer():
        dial_code = int(dial_code)

    # Only the first code of a comma-separated list is used.
    first_code = str(dial_code).split(',')[0]

    # Keeping digits only also removes any whitespace, so no separate strip is needed.
    return re.sub(r"[^0-9]", "", first_code)

# Apply the processing function to the 'Dial' column of scp_df, overwriting it with digits-only codes
# (the original text was copied to 'Prefix' earlier in this cell).
scp_df['Dial'] = scp_df['Dial'].apply(process_dial_code)

In [55]:
'''
Second, because we're evaluating Series, we need to use a mapping. 
This will create a dictionary of value pairs between country names and codes, where name is the key and codes is the value
'''
# Lookup from upper-cased country name to upper-cased country code, built from the master list.
country_mapping = {
    name: code
    for name, code in zip(scp_df['country_name'].str.upper(), scp_df['country_code'].str.upper())
}

# Normalize the main file's values the same way: trim surrounding white space, then upper-case.
main_df['Country_Code'] = main_df['Country_Code'].str.strip().str.upper()

'''
Map non-standard values (e.g. 'UK') to the country name used in the master list ('UNITED KINGDOM'); the name is converted to its code further 
below. The reason "for row in main_df['Country_Code']: if row == 'UK': ..." doesn't work is that you would only be rebinding the local loop 
variable; the change wouldn't propagate back to the DataFrame. To fix this, we use .loc with a Boolean mask to tell pandas which rows to modify.
'''
# Non-standard 'Country_Code' values -> the value to store instead. The rules are applied one at a
# time, in the order listed, each through a Boolean mask with .loc.
# Notes:
#   * there is no 'Ie' rule: the column is upper-cased before this point, so a mixed-case key can never match;
#   * 'HU' is left untouched; a 'HU' -> 'HUNGARY' rule would only be undone by 'HUNGARY' -> 'HU' below.
# NOTE(review): several two-letter rules (e.g. 'HI', 'SF', 'NB') look specific to this dataset -- confirm
# them against the source rows before reusing the table on another file.
COUNTRY_ALIASES = {
    'UK': 'UNITED KINGDOM',
    'GREAT BRITAIN': 'UNITED KINGDOM',
    'HONG KONG': 'HONG KONG SAR CHINA',
    'CZECH REPUBLIC': 'CZECHIA',
    'SP': 'SPAIN',
    'SW': 'SWITZERLAND',
    'RUSSIAN FEDERATION': 'RUSSIA',
    'FRANCE (EUROPEAN TERRITORY)': 'FRANCE',
    'US VIRGIN ISLANDS': 'U.S. VIRGIN ISLANDS',
    'SWAZILAND': 'ESWATINI',
    'VATICAN CITY STATE': 'VATICAN CITY',
    'ON': 'CANADA',
    'QC': 'CANADA',
    'NJ': 'UNITED STATES',
    'SF': 'UNITED KINGDOM',
    'NB': 'UNITED KINGDOM',
    'FX': 'FRANCE',
    'LX': 'LUXEMBOURG',
    'KO': 'SOUTH KOREA',
    'BC': 'CANADA',
    'EN': 'UNITED KINGDOM',
    'NT': 'NETHERLANDS',
    'DD': 'DENMARK',
    'AB': 'CANADA',
    'HI': 'ARMENIA',
    'AN': 'ST. MARTIN',
    'TA': 'FRANCE',
    'PO': 'POLAND',
    'HUNGARY': 'HU',
    'HG': 'HUNGARY',   # listed after 'HUNGARY' on purpose, so these rows keep the name
    'BOSNIA-HERZEGOVINA': 'BOSNIA & HERZEGOVINA',
    'SLOVAK REPUBLIC': 'SLOVAKIA',
    '-1': 'CANADA',
    'C': 'CANADA',
}

# Rows whose 'City' (exact, case-sensitive match) identifies the country regardless of what was entered
# in 'Country_Code'. Applied after the aliases; none of the targets is itself an alias key, so the
# result does not depend on how the two groups are interleaved.
CITY_COUNTRY_OVERRIDES = {
    'Curacao': 'CURAÇAO',
    'Willemstad': 'CURAÇAO',
    'St. Maarten': 'SINT MAARTEN',
    'Philipsburg': 'SINT MAARTEN',
    'SINT MAARTEN': 'SINT MAARTEN',
    'SIMPSON BAY': 'SINT MAARTEN',
    'Zagreb': 'CROATIA',
}

for alias, replacement in COUNTRY_ALIASES.items():
    main_df.loc[main_df['Country_Code'] == alias, 'Country_Code'] = replacement

for city, replacement in CITY_COUNTRY_OVERRIDES.items():
    main_df.loc[main_df['City'] == city, 'Country_Code'] = replacement

'''
After that, we use this mapping to replace every country name in our main df with its country code. Any value that is not a key of the mapping 
is left unchanged (e.g., 'UK' originally matched neither 'GB' nor 'UNITED KINGDOM', which is why we handled it explicitly above).
Note: it doesn't matter whether 'Country_Code' already holds the code (e.g., 'DK') or still holds the name ('DENMARK'): names are converted 
to codes and existing codes pass through untouched.
'''
# Replace country names with their codes. Values that are not keys of country_mapping (e.g. entries
# that already hold a code) are left as they are, so wrapping this in a .where() condition adds nothing:
# a value that already equals its own mapping is replaced by itself.
main_df['Country_Code'] = main_df['Country_Code'].replace(country_mapping)

# Safety net: list every row whose 'Country_Code' is still not one of the codes in the master list.
unmatched_mask = ~main_df['Country_Code'].isin(country_mapping.values())
unmatched_country_codes = main_df['Country_Code'][unmatched_mask]
print("Unmatched country codes:", unmatched_country_codes, sep="\n")



Unmatched country codes:
Series([], Name: Country_Code, dtype: object)


In [56]:
'''
Finally, run through each row in our main_df; check if there's a match on country code, then strip out everything up until the prefix.
'''

# Create a new, empty column called 'Phone_Country_Code'; process_intcode below fills it in
# for every row whose country code is found in scp_df.
main_df['Phone_Country_Code'] = None
def process_intcode(df=None, codes=None):
    """Strip the international dial code from each phone number and record the country's prefix.

    For every row of ``df`` the country code is looked up in ``codes``:

    * when the phone number starts with that country's sanitized dial code ('Dial'), the
      dial code is removed from the front of the number;
    * 'Phone_Country_Code' receives the country's unformatted prefix ('Prefix'), or None
      when the country code is not present in ``codes``.

    Missing phone numbers (NaN, '' or the text 'nan') are stored as ''.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with 'Country_Code' and 'Phone_Number'; modified in place.
        Defaults to the notebook's ``main_df``.
    codes : pandas.DataFrame, optional
        Lookup with 'country_code', 'Dial' and 'Prefix'. Defaults to the notebook's ``scp_df``.

    Returns
    -------
    None
    """
    if df is None:
        df = main_df
    if codes is None:
        codes = scp_df

    # Build the lookups once instead of filtering the lookup frame for every row.
    # Keeping the first row per country matches taking the first of all matching rows.
    per_country = codes.dropna(subset=['country_code']).drop_duplicates(subset='country_code', keep='first')
    dial_by_country = dict(zip(per_country['country_code'], per_country['Dial']))
    prefix_by_country = dict(zip(per_country['country_code'], per_country['Prefix']))

    cleaned_numbers = []
    prefixes = []
    for country_code, phone_number in zip(df['Country_Code'], df['Phone_Number']):
        # Normalize blanks first so the startswith() check below always sees a string.
        if pd.isna(phone_number) or phone_number in ('', 'nan'):
            phone_number = ''
        else:
            phone_number = str(phone_number)

        dial_code = dial_by_country.get(country_code)
        # Skip empty / non-text dial codes: every string "starts with" '', which would make the
        # strip a pointless rewrite of the number.
        if phone_number and isinstance(dial_code, str) and dial_code and phone_number.startswith(dial_code):
            phone_number = phone_number[len(dial_code):]

        cleaned_numbers.append(phone_number)
        # Add back the UNFORMATTED international prefix; None when the country is unknown.
        prefixes.append(prefix_by_country.get(country_code))

    df['Phone_Number'] = cleaned_numbers
    df['Phone_Country_Code'] = prefixes

# Call the function: it updates main_df['Phone_Number'] and main_df['Phone_Country_Code'] in place.
process_intcode()

In [57]:
## STATE CODE CLEANING ##

def process_state_code(state_code):
    """Return a state code reduced to upper-case letters.

    Missing values give ''. Otherwise every character that is not an ASCII letter is
    removed; if what remains spells 'nan' (in any case) the result is '', else the
    letters are returned upper-cased.
    """
    if pd.isna(state_code):
        return ''

    letters_only = re.sub(r"[^a-zA-Z]", "", str(state_code).strip())

    # A stringified NaN counts as blank.
    if letters_only.lower() == 'nan':
        return ''

    return letters_only.upper()

# Apply the processing function to the 'State_Code' column (one call per value; result overwrites the column)
main_df['State_Code'] = main_df['State_Code'].apply(process_state_code)

In [62]:
## PROVINCE CODE CLEANING ##

'''
If the province field(from our main df) matches anything in our 'state name' field (from our scp df), then replace the value with what's in 
'state code' (from our scp df)
'''
def process_province(province, states=None):
    """Clean a province value and convert a full state/province name to its code.

    The value is reduced to letters and spaces, a space is inserted where a lower-case
    letter is directly followed by an upper-case one ('NorthCarolina' -> 'North Carolina'),
    and runs of white space are collapsed. The cleaned name is then compared with the
    'state_name' column of ``states`` ignoring case, spaces and punctuation.

    Parameters
    ----------
    province : any
        Raw cell value. Anything that is not a string (NaN, numbers) gives ''.
    states : pandas.DataFrame, optional
        Lookup with 'state_name' and 'state_code'. Defaults to the notebook's ``scp_df``.

    Returns
    -------
    str
        The matching 'state_code'; '' when the match has no code or the input is blank;
        otherwise the cleaned input (e.g. a value that already is a code).
    """
    if states is None:
        states = scp_df

    if not isinstance(province, str):
        return ''

    # Keep spaces while removing punctuation and digits, so multi-word names survive.
    cleaned = re.sub(r"[^a-zA-Z ]", "", province)
    cleaned = re.sub(r"([a-z])([A-Z])", r"\1 \2", cleaned)
    cleaned = " ".join(cleaned.split())
    if not cleaned:
        return ''

    # Compare on letters only, so 'new brunswick', 'NEW BRUNSWICK' and 'NewBrunswick' all match.
    wanted = cleaned.replace(' ', '').lower()
    known_names = states['state_name'].str.replace(r"[^a-zA-Z]", "", regex=True).str.lower()
    mask = known_names == wanted

    if mask.any():
        code = states.loc[mask, 'state_code'].iloc[0]
        # A blank code in the lookup is NaN (a float); return '' so the column stays text.
        return '' if pd.isna(code) else code

    return cleaned

# Apply the processing function to the 'province' column (full names become codes where scp_df has a match)
main_df['province'] = main_df['province'].apply(process_province)

In [59]:
## ZIP CODE CLEANING ##

def process_zip_code(zip_code):
    """Reduce a zip-code cell to its digits.

    Parameters
    ----------
    zip_code : any
        Raw cell value (text or number); NaN / None is treated as blank.

    Returns
    -------
    str
        Only the characters 0-9 from the input, or '' for a missing value.
    """
    if pd.isna(zip_code):
        return ''

    # A numeric column that contains blanks is loaded as float: 10001 arrives as 10001.0 and
    # would otherwise turn into '100010' once the '.' is removed.
    if isinstance(zip_code, float) and zip_code.is_integer():
        zip_code = int(zip_code)

    return re.sub(r"[^0-9]", "", str(zip_code))

# Apply the processing function to the 'Zip_Code' column (digits only; missing values become '')
main_df['Zip_Code'] = main_df['Zip_Code'].apply(process_zip_code)

In [60]:
## POSTAL CODE CLEANING ##

def process_postal_code(postal_code):
    """Reduce a postal-code cell to its letters and digits.

    Parameters
    ----------
    postal_code : any
        Raw cell value (text or number); NaN / None is treated as blank.

    Returns
    -------
    str
        The input without any character other than a-z, A-Z and 0-9 (case is kept);
        '' for a missing value or the text 'nan'.
    """
    if pd.isna(postal_code):
        return ''

    # An integral float such as 12345.0 would otherwise become '123450'.
    if isinstance(postal_code, float) and postal_code.is_integer():
        postal_code = int(postal_code)

    # Remove non-alphanumeric characters (spaces, dashes, ...).
    alphanumeric = re.sub(r"[^a-zA-Z0-9]", "", str(postal_code))

    # A stringified NaN counts as blank.
    return '' if alphanumeric.lower() == 'nan' else alphanumeric

# Apply the processing function to the 'postalcode' column (letters and digits only; missing values become '')
main_df['postalcode'] = main_df['postalcode'].apply(process_postal_code)

In [None]:
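'''
Check for float types in province. process_province returns the matching 'state_code' from scp_df as-is, so a blank state_code there comes back 
as NaN (a float); that is presumably why the 'province' clean usually has to be run twice.
'''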

# Select the 'province' entries that are floats rather than text, then show all of them.
is_float_value = main_df['province'].apply(lambda value: isinstance(value, float))
float_province_values = main_df.loc[is_float_value, 'province']

print(float_province_values)

In [63]:
## CREATE NEW STATE/PROVINCE + ZIP/POSTAL FIELDS ##
# Create a new field in df that has EITHER -- not both -- of state code / province code, and zip code / postal code

def _is_blank_value(value):
    """Return True for NaN / None, an empty or white-space-only string, or the text 'nan' (any case)."""
    if not isinstance(value, str):
        return bool(pd.isna(value))
    return value.strip().lower() in ('', 'nan')


def merge_state_province(row):
    """Return the row's 'State_Code', or its 'province' when the state code is blank.

    Returns '' when both are blank. Non-string blanks (NaN) are handled, so a float in
    'province' no longer raises AttributeError.
    """
    if not _is_blank_value(row['State_Code']):
        return row['State_Code']
    if not _is_blank_value(row['province']):
        return row['province']
    return ''


def merge_zip_postal(row):
    """Return the row's 'Zip_Code', or its 'postalcode' when the zip code is blank.

    Returns '' when both are blank. Uses the same blank test as merge_state_province.
    """
    if not _is_blank_value(row['Zip_Code']):
        return row['Zip_Code']
    if not _is_blank_value(row['postalcode']):
        return row['postalcode']
    return ''

# Run each merge function once per row (axis=1) and write the result straight back to the column that
# is kept. merge_zip_postal does not read 'State_Code', so overwriting 'State_Code' first is safe.
main_df['State_Code'] = main_df.apply(merge_state_province, axis=1)
main_df['Zip_Code'] = main_df.apply(merge_zip_postal, axis=1)

# Drop the source columns that have now been folded into 'State_Code' / 'Zip_Code'
main_df.drop(columns=['postalcode', 'province'], inplace=True)

In [64]:
## EXTENSION NUMBER CLEANING ##
def process_ext_code(ext_code):
    """Reduce a phone-extension cell to its digits.

    Parameters
    ----------
    ext_code : any
        Raw cell value (text or number); NaN / None is treated as blank.

    Returns
    -------
    str
        Only the characters 0-9 from the input, or '' when there are none.
    """
    if pd.isna(ext_code):
        return ''

    # Extensions are usually numeric with many blanks, which pandas loads as float:
    # 123 arrives as 123.0 and would otherwise become '1230'.
    if isinstance(ext_code, float) and ext_code.is_integer():
        ext_code = int(ext_code)

    # Remove non-numeric characters (this also removes white space).
    return re.sub(r"[^0-9]", "", str(ext_code))

# Apply the processing function to the 'Phone_Number_Ext' column (digits only; missing values become '')
main_df['Phone_Number_Ext'] = main_df['Phone_Number_Ext'].apply(process_ext_code)

In [65]:
## REORDER THE COLUMNS IN THE WAY SPECIFIED BY THE DEV TEAM ##

# Column order before the change, kept for reference.
current_order = list(main_df.columns)

# Target layout: one name per output column, in the order the dev team asked for.
desired_order = [
    'Portal_ID',
    'accountno',
    'Client',
    'Group_Name',
    'City',
    'Company_Name',
    'Contact_Email',
    'Contact_Name',
    'Phone_Number',
    'Country_Code',
    'State_Code',
    'Zip_Code',
    'Phone_Country_Code',
    'Phone_Number_Ext',
    'Street_Address',
    'Address_Line_2',
    'Residential_Address',
    'Loading_Dock',
]

# Rebuild the frame with exactly these columns in this order.
main_df = main_df.reindex(columns=desired_order)

In [80]:
'''
Trying to export main_df to Excel is giving us an 'illegal character' error; we need to ID which column contains the error.
'''

# The control character being searched for in main_df
illegal_char = '\x1f'

# Look at every column in turn, comparing on the text form of each cell
for column in main_df.columns:
    has_illegal_char = bool(main_df[column].astype(str).str.contains(illegal_char).any())

    # Report only the columns where at least one cell contains the character
    if has_illegal_char:
        print(f"Column '{column}' contains the illegal character.")


Column 'Contact_Name' contains the illegal character.
Column 'Street_Address' contains the illegal character.


In [None]:
'''
As a redundancy, I'd like to ID the exact row where the issue occurs.
'''

# Pull out every row of main_df whose company name is exactly 'Sams Club' and show it in full
problematic_rows = main_df.loc[main_df['Company_Name'] == 'Sams Club']
print(problematic_rows)

In [None]:
'''
We're still identifying areas where the field isn't capturing all illegal characters. Let's try to print all the rows where problematic characters 
exist.
'''

# Scan every non-numeric column rather than 'Company_Name' alone: the '\x1f' check earlier in this
# notebook reported the character in 'Contact_Name' and 'Street_Address' as well.
for column_name in main_df.columns:
    # Numeric columns cannot hold control characters, so skip them to save time on a big file.
    if pd.api.types.is_numeric_dtype(main_df[column_name]):
        continue

    # Iterate through each cell of the column
    for index, value in main_df[column_name].items():
        # Check if the value contains non-printable characters
        if any(not char.isprintable() for char in str(value)):
            print(f"Non-printable characters found in '{column_name}' at index {index}: {repr(value)}")

In [89]:
'''
Now that we've ID'd which column and which row have the illegal character, let's just replace them with a blank so we can export 
our main_df to Excel and be done. Because its such a big file, let's make a new df to ensure if we make a mistake, we can revert back.
'''

# Create a copy of the original DataFrame so main_df can still be used to revert
main_1_df = main_df.copy()


def strip_non_printable(value):
    """Return ``value`` without non-printable characters; non-string values are returned unchanged.

    Leaving non-strings alone keeps missing values missing: running str() on a NaN would
    write the literal text 'nan' into the cell.
    Note that str.isprintable() also treats tabs and line breaks as non-printable.
    """
    if not isinstance(value, str):
        return value
    return ''.join(char for char in value if char.isprintable())


# Clean every column, not only 'Company_Name': the '\x1f' check earlier in this notebook reported the
# character in 'Contact_Name' and 'Street_Address' too.
for column_name in main_1_df.columns:
    main_1_df[column_name] = main_1_df[column_name].map(strip_non_printable)





In [92]:
'''
Let's check to make sure there are no more unprintable characters, if this runs smoothly, we're set.
'''
# Verify every non-numeric column of the cleaned copy, not only 'Company_Name': the '\x1f' check
# earlier in this notebook reported the character in 'Contact_Name' and 'Street_Address' as well.
# Nothing printed means the copy is clean.
for column_name in main_1_df.columns:
    if pd.api.types.is_numeric_dtype(main_1_df[column_name]):
        continue

    # Iterate through each cell of the column
    for index, value in main_1_df[column_name].items():
        # Check if the value contains non-printable characters
        if any(not char.isprintable() for char in str(value)):
            print(f"Non-printable characters found in '{column_name}' at index {index}: {repr(value)}")

In [94]:
# Export the cleaned copy as CSV; the path is relative, so the file lands in the kernel's current
# working directory. index = False leaves the DataFrame index out of the file.
main_1_df.to_csv('111523_Cleaned MSN Clients.csv', index = False)

In [None]:
## SPLIT THE MAIN FILE INTO SEPARATE CHILD CSVS 

def split_and_save_csv_by_portal_id(df, output_directory):
    """Write one CSV per distinct 'Portal_ID' into ``output_directory``.

    Each file is named ``cid_<portal id>.csv`` and holds every row of ``df`` with that
    portal id, without the index. Rows that have no 'Portal_ID' are skipped and counted
    in a printed message.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Portal_ID' and 'Client'.
    output_directory : str
        Folder to write to; created (including parents) when it does not exist.

    Returns
    -------
    None
    """
    os.makedirs(output_directory, exist_ok=True)

    # A missing id never equals itself, so filtering on it would select no rows and the
    # client-name lookup would fail; report such rows instead.
    rows_without_id = int(df['Portal_ID'].isna().sum())
    if rows_without_id:
        print(f"Skipped {rows_without_id} row(s) without a Portal_ID")

    # groupby splits the frame in one pass; sort=False keeps the ids in order of first appearance.
    for portal_id, client_rows in df.groupby('Portal_ID', sort=False):
        # An id column that contains blanks is loaded as float; avoid file names like 'cid_12.0.csv'.
        if isinstance(portal_id, float) and portal_id.is_integer():
            portal_id = int(portal_id)

        csv_filename = os.path.join(output_directory, f"cid_{portal_id}.csv")
        client_rows.to_csv(csv_filename, index=False)

        # The first client name in the group is used for the log line only.
        client_name = client_rows['Client'].iloc[0]
        print(f"Saved CSV for client {portal_id}_{client_name} to {csv_filename}")

# Usage example
# NOTE(review): this splits main_df, not main_1_df (the copy with non-printable characters removed),
# so the child files are written from the uncleaned frame -- confirm that this is intended.
output_dir = 'C:/Users/Desktop/Work/Coding/CSVs to Upload'  # Specify the output directory
split_and_save_csv_by_portal_id(main_df, output_dir)

In [91]:
## WITH OUR CHILD FILES CREATED, NOW, DELETE THE FIRST TWO COLUMNS FROM EVERY CSV  ##

directory = 'C:/Users/Desktop/Work/Coding/CSVs to Upload'

'''
This is called a 'list comprehension', and it is a more Pythonic way of building a list. The logic below is exactly the same as:
csv_file_list = []
for file in os.listdir(directory):
    if file.endswith('.csv'):
         csv_file_list.append(file)
It builds the list in one line of code instead of four. 'f' is the loop variable: it takes the name of each entry in the directory in turn,
and the entry is kept only if it ends with '.csv'.
Note: 'f' could have been named anything, but the expression before 'for' and the variable after 'for' must use the same name, because the
comprehension defines that variable itself. If the leading expression used a different name, e.g. 'e', Python would raise a NameError unless
'e' had been defined beforehand.
'''
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# The first two columns of every child file are 'Portal_ID' and 'accountno' (see the column order set
# earlier). Removing them by name instead of by position makes this cell safe to re-run: a positional
# slice would cut two more columns off each file on every run.
columns_to_remove = ['Portal_ID', 'accountno']

# Open each CSV file in the list, remove the two columns, and save it back to the original location.
for file in csv_files:
    file_path = os.path.join(directory, file)
    # dtype=str and keep_default_na=False read every cell back exactly as written; default type
    # inference would drop leading zeros (zip codes, phone numbers) and turn the text 'NA' into a blank.
    csv_df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    csv_df = csv_df.drop(columns=columns_to_remove, errors='ignore')
    csv_df.to_csv(file_path, index = False)

In [92]:
## CREATE A ZIPPED FILE OF ALL THE CSV'S ##

import zipfile

def zip_directory(source_folder, zip_filename):
    """Write every file under ``source_folder`` (recursively) into a deflate-compressed zip.

    Entries are stored under their path relative to ``source_folder``; ``zip_filename`` is
    created or overwritten.
    """
    with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, filenames in os.walk(source_folder):
            for filename in filenames:
                absolute_path = os.path.join(current_dir, filename)
                # Store the path relative to the source folder so the archive has no leading directories.
                archive.write(absolute_path, os.path.relpath(absolute_path, source_folder))

# Both paths are relative to the kernel's current working directory.
source_folder = 'CSVs to Upload'
zip_filename = 'my_files.zip'

# os.walk yields nothing for a folder that does not exist, which would silently produce an empty
# archive; fail loudly instead so a wrong working directory is noticed.
if not os.path.isdir(source_folder):
    raise FileNotFoundError(f"Folder to zip not found: {os.path.abspath(source_folder)}")

zip_directory(source_folder, zip_filename)


In [None]:
# Scratch: filter the whole df on a single condition and peek at the first two matching rows.
# (In a notebook only the last expression of a cell is displayed.)
main_df.loc[main_df['Country_Code'] != 'US'].head(2)
main_df.loc[main_df['Country_Code'].isna()].head(2)

In [362]:
# Load the 'Clients to Migrate' sheet of the migration update workbook; its 'Account Number' column
# is turned into a quoted, comma-separated list below.
query_list = pd.read_excel('C:/Users/Desktop/Work/Platform Migration/Data Cleaning/Migration Update 10-17-23.xlsx', sheet_name = 'Clients to Migrate')

In [374]:
def q_list(account):
    """Return the account number as trimmed text in single quotes, followed by a comma and a space."""
    trimmed = str(account).strip()
    return "'" + trimmed + "', "

# Apply the function to each element in the 'Account Number' column
concatenated_values = query_list['Account Number'].apply(q_list)

# Join the results into a single string
concatenated_values = concatenated_values.str.cat()

# Remove the trailing separator. q_list ends every entry with ", " (a comma AND a space), so cutting a
# single character would only remove the space and leave a dangling comma; strip both instead.
concatenated_values = concatenated_values.rstrip(', ')

# Print the concatenated values or return them as needed
print(concatenated_values)


'4532', '4480', '78989', '79132', '7712', '4124', '7081', '7059', '18011', '86929', '80965', '30660', '74821', '7558', '71223', '71132', '73391', '6887', '72914', '7164', '4137', '7597', '7542', '7646', '71006', '1803', '83396', '80460', '7705', '73881', '7731', '82222', '79978', '72911', '85995', '7771', '5966', '7617', '71789', '5562', '7501', '7100', '5667', '7790', '6799', '7691', '75341', '9701', '7539', '3569', '7614', '75923', '75821', '4756', '5130', '5168', '5192', '5226', '5252', '5510', '5573', '5776', '5960', '6787', '6882', '7038', '7078', '7080', '7126', '7146', '7154', '7158', '7221', '7227', '7233', '7292', '7295', '7344', '7424', '7440', '7471', '7519', '7550', '7572', '7592', '7598', '7615', '7623', '7624', '7632', '7635', '7671', '7681', '7719', '7759', '7760', '7770', '7773', '7774', '7854', '7906', '7919', '7926', '8560', '9390', '9880', '18003', '18018', '30005', '30750', '31123', '40060', '41003', '41236', '70006', '70013', '70032', '70054', '70063', '70106', '70

In [None]:
'''
Think about the way pandas handles a Series here: '.apply' takes a function as its argument and calls it once for every element of the column 
you specify (in this case, "Client Name"); the function's parameter, "client_name", receives each element in turn. 'comma_list' is the resulting 
Series of quoted names, and '.str.cat()' concatenates all of its elements into one string. We then remove the very last comma with [:-1]: 
the slice keeps everything from the start of the string up to, but not including, the final character.
'''

def format_list(client_name):
    """Return the client name as trimmed text in single quotes, followed by a comma."""
    trimmed = str(client_name).strip()
    return "'" + trimmed + "',"

# Load the tracker in this cell so it does not depend on a cell further down having been run first
# (on a fresh kernel, Run All would otherwise stop here with a NameError).
migration_list = pd.read_excel('C:/Users/Desktop/Work/Platform Migration/Migration Tracker/110323_Migration Tracker.xlsx')

# Quote every client name, join the pieces into one string, then cut off the final comma.
comma_list = migration_list['Client Name'].apply(format_list)
concat_comma = comma_list.str.cat()
concat_comma = concat_comma[:-1]
print(concat_comma)


In [104]:
# Load the migration tracker workbook (first sheet).
# NOTE(review): the cell above this one reads migration_list['Client Name'], so this load has to run
# before it -- the cells are out of order for a top-to-bottom run.
migration_list = pd.read_excel('C:/Users/Desktop/Work/Platform Migration/Migration Tracker/110323_Migration Tracker.xlsx')