In [50]:
import pandas as pd
import random
from faker import Faker

# Create a Faker instance
fake = Faker()

# Function to create an individual
def create_individual():
    return [fake.unique.random_number(digits=5), fake.first_name(), fake.last_name(), 
            fake.date_of_birth(minimum_age=18, maximum_age=90), fake.phone_number(), fake.zipcode()]

# Function to introduce typos
def introduce_typos(value, num_typos):
    for _ in range(num_typos):
        typo_type = random.randint(1, 3)
        if typo_type == 1 and len(value) > 1:  # deletion
            pos = random.randint(0, len(value) - 1)
            value = value[:pos] + value[pos+1:]
        elif typo_type == 2:  # insertion
            pos = random.randint(0, len(value))
            value = value[:pos] + random.choice('abcdefghijklmnopqrstuvwxyz') + value[pos:]
        else:  # swap
            pos = random.randint(0, len(value) - 2)
            value = value[:pos] + value[pos+1] + value[pos] + value[pos+2:]
    return value

# Function to introduce numeric typos
def introduce_numeric_typos(value, num_typos):
    for _ in range(num_typos):
        typo_type = random.randint(1, 3)
        if typo_type == 1 and len(value) > 1:  # deletion
            pos = random.randint(0, len(value) - 1)
            value = value[:pos] + value[pos+1:]
        elif typo_type == 2:  # insertion
            pos = random.randint(0, len(value))
            value = value[:pos] + str(random.randint(0, 9)) + value[pos:]
        else:  # swap
            pos = random.randint(0, len(value) - 2)
            value = value[:pos] + value[pos+1] + value[pos] + value[pos+2:]
    return value

# Function to swap month and day in date
def swap_month_day(date):
    date_parts = date.split('-')
    return '-'.join([date_parts[0], date_parts[2], date_parts[1]])

# Create df1
data1 = [create_individual() for _ in range(5)]
df1 = pd.DataFrame(data1, columns=['unique_id', 'first_name', 'last_name', 
                                   'date_of_birth', 'phone_number', 'zip_code'])

# Create df2
data2 = [list(df1.loc[i]) for i in random.choices(range(5), k=6)]  # choose 6 individuals from df1
data2 += [create_individual() for _ in range(4)]  # add 4 new individuals

# Introduce typos, formatting errors, and NULL values
for person in data2[:6]:
    error_count = 0
    error_fields = sorted(random.sample(range(1, 7), k=random.randint(2, 5)))  # select fields to introduce errors
    untouched_fields = list(set(list(range(1,6))) - set(error_fields)) # determine what fields do not have errors 
    for field in error_fields:
        if field == 6:
            random_null = random.choice(untouched_fields)
            person[random_null] = None
            untouched_fields.remove(random_null)
            error_count += 1
        elif field in [1, 2]:  # first_name, last_name fields
            person[field] = introduce_typos(person[field], random.randint(0, 2))
            error_count += 1
        elif field == 4:  # phone_number field
            person[field] = person[field].replace('-', '')  # formatting error
            error_count += 1
        elif field == 5:  # zip_code field
            person[field] = introduce_numeric_typos(person[field], random.randint(0, 2))
            error_count += 1
        elif field == 3: # date_of_birth field
            if random.choice([True, False]):
                person[field] = swap_month_day(str(person[field])) # swap month and day
                error_count += 1
        else:
            person[random.randint(1,7)] = None # NULL value
            error_count += 1
    person.append(error_count)

for person in data2[6:]:
    person.append(0)

df2 = pd.DataFrame(data2, columns=['unique_id', 'first_name', 'last_name',
'date_of_birth', 'phone_number', 'zip_code', 'number_of_errors'])



In [53]:
df1

Unnamed: 0,unique_id,first_name,last_name,date_of_birth,phone_number,zip_code
0,32626,Amanda,Cameron,1989-03-27,960.342.3182,72462
1,40966,Briana,Bennett,1997-07-13,429-286-9862x92729,9143
2,33466,Luke,Sanchez,1992-05-16,4380449549,78742
3,61541,Stephen,Wilson,1938-07-26,001-536-623-6876,17287
4,77230,Ellen,Rose,1979-12-14,806.754.3462,93545


In [54]:
df2

Unnamed: 0,unique_id,first_name,last_name,date_of_birth,phone_number,zip_code,number_of_errors
0,61541,Stephen,Wislon,1938-07-26,0015366236876,1272087,3
1,77230,Ellne,,1979-12-14,806.754.3462,93545,4
2,33466,Luke,Sanche,1992-05-16,4380449549,7874,3
3,33466,,Sanchezv,1992-16-05,4380449549,78742,5
4,33466,Luke,Sanchez,1992-05-16,4380449549,787482,2
5,32626,Amaunda,amerron,,960.342.3182,67462,5
6,88185,Ryan,Callahan,1944-12-22,+1-367-179-5936,48240,0
7,63454,Donald,Gonzalez,1967-03-06,339-397-0646x25798,73068,0
8,61486,Ashley,Dillon,1949-02-27,001-086-776-0295x2790,32317,0
9,52426,Kimberly,English,1959-03-28,277.846.8408,50210,0
