In [189]:
from faker import Faker
import numpy as np
import pandas as pd
import random as rd
import matplotlib.pyplot as plt
import seaborn as sns

In [190]:
# Run configuration: everything a reader might want to tune lives here.
SEED = 42  # single value used to seed every random source below
n = 10000 # data-points (default row count of the fabricate_* helpers)
datafield = "VPN Usage"  # name of the generated column; also part of the CSV filename
fake = Faker() # Faker instance
# The notebook draws from three independent random sources (Faker, NumPy's
# global RNG and the stdlib `random` module); seed all of them so a run from
# a fresh kernel is repeatable.
Faker.seed(SEED)
np.random.seed(SEED)
rd.seed(SEED)

In [191]:
# (mean, standard deviation) passed positionally to np.random.normal by
# fabricate_vpn_usage; the drawn values are then rounded and floored at 0.
vpn_stat = (8, 4)

In [198]:
def fabricate_base_data(num_rows=n, id_range=(10**11, 10**12-1), age_stats=(36.17, 12.68),
                        age_bounds=(18, 65)):
    """Build a DataFrame of synthetic people indexed by a unique ID number.

    Parameters
    ----------
    num_rows : int
        Number of rows to generate.
    id_range : tuple of (int, int)
        Inclusive lower and upper bound of the 'Aadhar No.' values. IDs are
        drawn with ``random.sample`` (without replacement), so they are unique.
    age_stats : tuple of (float, float)
        Mean and standard deviation of the normal distribution ages are
        drawn from.
    age_bounds : tuple of (number, number)
        Inclusive (min, max) the drawn ages are clipped to before rounding
        to whole years.

    Returns
    -------
    pandas.DataFrame
        Columns 'Name', 'Age', 'E-mail' and 'Phone No.', indexed by
        'Aadhar No.'.

    Raises
    ------
    ValueError
        From ``random.sample`` when ``num_rows`` is negative or larger than
        the number of IDs available in ``id_range``.
    """
    id_low, id_high = id_range[0], id_range[1]
    age_min, age_max = age_bounds[0], age_bounds[1]

    aadhar_numbers = rd.sample(range(id_low, id_high + 1), num_rows)

    ages = np.random.normal(loc=age_stats[0], scale=age_stats[1], size=num_rows)
    # Clip first, then round, so the final integer ages stay inside the bounds.
    ages = np.round(np.clip(ages, age_min, age_max)).astype(int)

    # Name, e-mail and phone are generated in this fixed order for every row:
    # all three come from the same seeded Faker generator, so the call order
    # determines which values each row receives.
    records = [
        [aadhar_no, fake.name(), age, fake.email(), fake.phone_number()]
        for aadhar_no, age in zip(aadhar_numbers, ages)
    ]

    people = pd.DataFrame(records, columns=['Aadhar No.', 'Name', 'Age', 'E-mail', 'Phone No.'])
    return people.set_index('Aadhar No.')

In [199]:
def fabricate_vpn_usage(vpn_stats, num_rows=n, precision=2, nan_probability=0.05):
    """Generate a list of synthetic VPN-usage readings with random gaps.

    Every reading is a single draw from a normal distribution with mean
    ``vpn_stats[0]`` and standard deviation ``vpn_stats[1]``, rounded to
    ``precision`` decimals and floored at 0. Each reading is then replaced
    by NaN with probability ``nan_probability``.

    Returns a list with ``num_rows`` entries.
    """
    def draw_reading():
        # Draw order is significant for seeded runs: the normal sample is
        # taken first, the uniform sample deciding "missing" second.
        reading = round(np.random.normal(vpn_stats[0], vpn_stats[1]), precision)
        reading = reading if reading > 0 else 0
        is_missing = np.random.rand() < nan_probability
        return np.nan if is_missing else reading

    return [draw_reading() for _ in range(num_rows)]

In [200]:
# Build the base table first and attach the generated column second; both
# helpers draw from NumPy's global RNG, so this call order fixes the values.
df = fabricate_base_data().assign(**{datafield: fabricate_vpn_usage(vpn_stat)})

In [201]:
# Bare expression: the notebook renders the assembled frame as this cell's output.
df

Unnamed: 0_level_0,Name,Age,E-mail,Phone No.,VPN Usage
Aadhar No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
670577406348,Kristen Rodriguez,44,derek78@example.org,454-860-4165x222,3.17
540581591865,Taylor Gordon,21,erika94@example.net,+1-256-382-3575x85291,15.18
734054232680,Jon Peters,73,pearsoncraig@example.org,+1-542-495-0791,7.48
529407098278,Rebecca May,49,devin54@example.net,+1-370-700-6208,7.06
386675292107,Barbara Williams,43,ujohnson@example.net,(393)200-0838x51909,7.89
...,...,...,...,...,...
112512634478,Emily Cruz,27,wtate@example.org,(204)718-2310x625,0.00
226515786535,James Hartman,32,gardnerjennifer@example.com,001-472-484-8825x14445,8.40
363855739265,Matthew Gilbert,17,christopher53@example.com,(266)743-2966,7.24
715335644861,Mario Gomez,57,fitzpatrickerin@example.org,+1-513-774-8391,1.63


In [196]:
def plot_heatmap(data, vmax):
    """Draw `data` as an annotated heatmap on a 15x8 figure and show it.

    data -- forwarded unchanged to ``sns.heatmap`` as the values to plot.
    vmax -- forwarded to ``sns.heatmap`` as the upper end of the colour scale.
    """
    figure_size = (15, 8)
    annotation_style = {'size': 15}

    plt.figure(figsize=figure_size)
    # sns.set adjusts seaborn's global theme, so the font scale also carries
    # over to plots drawn after this call.
    sns.set(font_scale=1.4)
    sns.heatmap(
        data,
        cmap='Reds',
        annot=True,
        annot_kws=annotation_style,
        vmax=vmax,
    )
    plt.show()

In [176]:
# 'Aadhar No.' is the frame's index (set in fabricate_base_data), so the index
# has to be written out; with index=False the ID column is dropped from the CSV.
df.to_csv(f'Fabricated Data - {datafield}.csv', index=True)