In [1]:
import numpy as np
import pandas as pd
import random

# Columns copied from the customers table into the blacklist, in output order.
col_names = ['first_name','last_name','country','city','street','housenumber','date_of_birth','tax_id']
rs = 50 # random-state

# `rs` alone only makes DataFrame.sample reproducible. The helpers further down
# draw from NumPy's global generator via np.random.choice, so seed it as well;
# otherwise the generated blacklist differs on every Restart & Run All.
np.random.seed(rs)

In [2]:
# Load the customer master data, keyed by the `id` column.
customers = pd.read_csv('customers.csv', header=0).set_index('id')
# Parse birth dates into real datetimes.
customers['date_of_birth'] = pd.to_datetime(customers['date_of_birth'])
customers.head(3)

Unnamed: 0_level_0,first_name,last_name,country,city,street,housenumber,date_of_birth,tax_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1111,Kenneth,Tucker,Serbia,Niš,Coleman Trail,70,1987-12-27,704561284
1112,Lawrence,Brooks,Philippines,Paitan Este,Onsgard Park,93,1944-10-10,347398604
1113,Carolyn,Patterson,Brazil,Corumbá,Golf Course Avenue,196,1989-07-14,755983430


In [3]:
# Draw 10% of the customers (rows, without replacement) as the basis for the blacklist.
cust_sample = customers[col_names].sample(
    frac=0.1,
    replace=False,
    random_state=rs,
    axis=0,
)

In [4]:
# Alert scenarios: each row is a boolean mask over the eight blacklist columns
# (first_name, last_name, country, city, street, housenumber, date_of_birth,
# tax_id). True keeps the sampled customer's own value for that field; False
# marks a field that is dropped from the entry.
alerts = [
    # full
    [True, True, True, True, True, True, True, True],
    # address_dob
    [True, True, True, True, True, True, True, False],
    # address_tax
    [True, True, True, True, True, True, False, True],
    # city_dob
    [True, True, True, True, False, False, True, False],
    # city_tax
    [True, True, True, True, False, False, False, True],
    # dob_tax
    [True, True, False, False, False, False, True, True],
]

# Relative weight of each scenario (same order as `alerts`), normalised so the
# resulting probabilities sum to one.
weight_of_alerts = [1, 2, 2, 4, 3, 4]
s = sum(weight_of_alerts)
prob_of_alerts = [weight / s for weight in weight_of_alerts]

In [5]:
def subset_entry(sr, filters, probs):
    """Keep only the fields of one record selected by a randomly chosen mask.

    Parameters
    ----------
    sr : pandas.Series
        A single record; it is indexed with the chosen boolean mask.
    filters : sequence of boolean lists
        Candidate masks, one per scenario.
    probs : sequence of float
        Probability of picking each mask, passed to ``np.random.choice`` as ``p``.

    Returns
    -------
    pandas.Series
        The entries of ``sr`` at the positions where the chosen mask is True.
    """
    candidate_ids = np.arange(len(filters))
    picked = np.random.choice(candidate_ids, p=probs)
    return sr[filters[picked]]

In [6]:
# Apply a randomly chosen alert mask to every sampled customer; fields that the
# mask drops come back as NaN in the resulting frame.
blacklist = cust_sample.apply(subset_entry, axis=1, args=(alerts, prob_of_alerts))

In [7]:
# Put the columns back into the canonical `col_names` order.
blacklist = blacklist[col_names]

In [8]:
def fill_na_random(col, from_df):
    """Fill the missing entries of a column with random values from another frame.

    Parameters
    ----------
    col : pandas.Series
        Column to fill; its ``name`` selects the source column in ``from_df``.
    from_df : pandas.DataFrame
        Frame whose column ``col.name`` supplies the pool of replacement values.

    Returns
    -------
    pandas.Series
        ``col`` with every null replaced by a value drawn (with replacement,
        via ``np.random.choice``) from ``from_df[col.name]``.
    """
    missing = col.isnull()
    pool = from_df[col.name].tolist()
    draws = np.random.choice(pool, missing.sum())
    # Index the draws by the missing positions so fillna aligns them correctly.
    replacements = pd.Series(draws, index=col[missing].index)
    return col.fillna(replacements)

# Column by column, replace the NaNs with random values taken from the
# original customers table.
blacklist = blacklist.apply(fill_na_random, axis=0, from_df=customers)

In [9]:
# Convert values back to datetime/int after the masking and refill steps.
# Vectorised to_datetime replaces the per-element apply.
blacklist['date_of_birth'] = pd.to_datetime(blacklist['date_of_birth'])
# Assign the columns directly rather than through `.loc[:, cols] = ...`:
# since pandas 2.0 the .loc form writes into the existing columns in place and
# keeps their old dtype, and `applymap` is deprecated since pandas 2.1.
blacklist[['housenumber', 'tax_id']] = blacklist[['housenumber', 'tax_id']].astype('int64')

In [10]:
# Sanity check: show each blacklist entry (suffix `_bl`) next to the customer
# it was derived from, matched on the shared index.
(
    customers
    .join(blacklist, how='inner', rsuffix='_bl')
    .head(10)
)

Unnamed: 0_level_0,first_name,last_name,country,city,street,housenumber,date_of_birth,tax_id,first_name_bl,last_name_bl,country_bl,city_bl,street_bl,housenumber_bl,date_of_birth_bl,tax_id_bl
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1427,Daniel,Lane,Jordan,Jarash,Sundown Crossing,57,1981-04-19,204539767,Daniel,Lane,Jordan,Jarash,Nelson Trail,5,1981-04-19,606392090
1570,Heather,Morrison,Philippines,Hinlayagan Ilaud,Victoria Plaza,6,1967-11-24,209677027,Heather,Morrison,Philippines,Hinlayagan Ilaud,Victoria Plaza,6,1967-11-24,417515331
1965,Teresa,Mills,Armenia,Saratak,Artisan Parkway,366,1989-08-27,695173048,Teresa,Mills,Armenia,Saratak,Alpine Hill,1,1963-12-04,695173048
1122,Judith,Grant,Argentina,Capilla del Monte,Hauk Alley,5184,1966-10-12,430983044,Judith,Grant,Ukraine,Youdunjie,Roth Junction,6,1966-10-12,430983044
2089,Jack,Price,Nigeria,Yola,Nevada Hill,4,1966-07-09,137747837,Jack,Price,Nigeria,Yola,Nevada Hill,4,1966-07-09,975791247
1838,Brenda,Rivera,Indonesia,Girihieum,Acker Center,428,1954-02-12,946234197,Brenda,Rivera,Indonesia,Girihieum,Acker Center,428,1935-01-14,946234197
1882,Carlos,Rodriguez,Tunisia,Tajerouine,Mcguire Point,7,1978-10-10,530891550,Carlos,Rodriguez,Tunisia,Tajerouine,Mcguire Point,7,1978-10-10,530891550
1674,Jesse,Perez,Indonesia,Cangkuang,Blackbird Hill,58020,1951-06-26,150034545,Jesse,Perez,Indonesia,Cangkuang,Holmberg Parkway,5434,1951-06-26,403199834
1652,Cynthia,Hart,Sweden,Skene,Eliot Terrace,78994,1947-05-15,356456459,Cynthia,Hart,Sweden,Skene,Eliot Terrace,78994,1947-05-15,646019179
1858,Scott,Willis,Sweden,Solna,Moland Point,8311,1938-10-18,593469151,Scott,Willis,Sweden,Solna,Union Street,4,1938-10-18,874469650


In [11]:
# Persist the generated blacklist (index included) as UTF-8 encoded CSV.
blacklist.to_csv(path_or_buf='blacklist.csv', encoding='utf8')