In [None]:
from faker import Faker
import numpy as np
import pandas as pd
import random as rd
from lib import analyze, fabricate

In [None]:
# --- Notebook configuration ---------------------------------------------
n = 10000                   # data-points
datafield = "VPN Usage"     # column being fabricated; also part of the CSV file name
fake = Faker('en_IN')       # Faker instance (English / India locale)

# Pin every random source used here to the same fixed seed so that a
# Restart & Run All reproduces the same fabricated values.
Faker.seed(42)
rd.seed(42)
np.random.seed(42)

In [None]:
# Distribution parameters for the single VPN usage column; passed as the first
# argument to fabricate.create_positive_norm_distribution further down.
# NOTE(review): presumably (mean, std) — confirm against lib.fabricate.
vpn_stat = (8, 4)

In [None]:
# Start from the base frame, then attach the fabricated column under the
# name held in `datafield`.
df = fabricate.fabricate_base_data()
vpn_usage = fabricate.create_positive_norm_distribution(
    vpn_stat,
    2,      # NOTE(review): presumably rounding precision — confirm against lib.fabricate
    0.05,   # NOTE(review): presumably probability of a NaN entry — confirm against lib.fabricate
)
df[datafield] = vpn_usage

In [None]:
# Preview the first 100 rows of the fabricated frame.
df.head(100)

In [None]:
# Run the project's analysis helper on the current frame.
# What it reports is defined in lib.analyze, not in this notebook.
analyze.analyze_dataframe(df)

In [None]:
# Persist the frame as CSV (row index omitted); the file name is derived
# from `datafield`.
csv_path = f'data/Fabricated Data - {datafield}.csv'
df.to_csv(csv_path, index=False)

In [None]:
# Build a second dataset: a fresh base frame plus three mutually correlated
# feature columns, exported to its own CSV file.

# Step 1: Base identity data
df = fabricate.fabricate_base_data()

# Step 2: Define stats for the 3 correlated features
# Format: (mean, std, min, max)
# NOTE(review): the antivirus row puts the mean on its upper bound and is
# described as a 0/1 flag, yet the generator is called with precision=2 —
# confirm whether the output is meant to be binary or continuous in [0, 1].
stats_list = [
    (4, 1.5, 0, 10),    # VPN usage in hours
    (1, 0.5, 0, 1),     # Antivirus subscription (0=no, 1=yes)
    (2, 1, 0, 5)        # SIM changes per year
]

# Step 3: Define correlation matrix (row/column order matches stats_list)
correlation_matrix = [
    [1.0, 0.4, 0.5],  # VPN with others
    [0.4, 1.0, 0.3],  # Antivirus with others
    [0.5, 0.3, 1.0]   # SIM changes with others
]

# Fail fast on a hand-edited matrix that is no longer a valid correlation
# matrix shape-wise, instead of producing silently wrong data.
corr = np.asarray(correlation_matrix, dtype=float)
assert corr.shape == (len(stats_list), len(stats_list)), \
    "correlation matrix must be square with one row per feature"
assert np.allclose(corr, corr.T), "correlation matrix must be symmetric"
assert np.allclose(np.diag(corr), 1.0), "correlation matrix must have a unit diagonal"

# Step 4: Generate correlated data (one row per row of df)
correlated_features = fabricate.create_correlated_norm_distribution(
    stats_list=stats_list,
    correlation_matrix=correlation_matrix,
    n=len(df),
    precision=2,
    nan_probability=0.05
)

# Step 5: Add correlated columns to df
feature_columns = ["VPN Usage Hours", "Antivirus Subscribed", "SIM Changes"]
df[feature_columns] = correlated_features

# Step 6: Export under a distinct file name. The single-feature cell earlier
# in this notebook already writes 'data/Fabricated Data - {datafield}.csv';
# reusing that path here would silently overwrite that export on Run All.
df.to_csv(f'data/Fabricated Data - {datafield} (Correlated).csv', index=False)

# Last expression of the cell: rendered as a rich table by the notebook.
df.head()


In [None]:
# Run the project's analysis helper on the frame that now carries the
# "VPN Usage Hours", "Antivirus Subscribed" and "SIM Changes" columns.
analyze.analyze_dataframe(df)