In [1]:
import numpy as np
import pandas as pd

In [3]:
# example data with 1 million rows and 10 columns with various data types
data = {
    "A": np.random.randint(0, 100, size=1000000),
    "B": np.random.rand(1000000),
    "C": np.random.choice(["foo", "bar", "baz"], size=1000000),
    "D": pd.date_range("2020-01-01", periods=1000000, freq="min"),
    "E": np.random.choice([True, False], size=1000000),
    "F": np.random.randn(1000000),
    "G": np.random.randint(1, 10, size=1000000),
    "H": np.random.choice(["apple", "banana", "cherry"], size=1000000),
    "I": np.random.randint(1000, 2000, size=1000000),
    "J": np.random.choice(["x", "y", "z"], size=1000000),
}
data = pd.DataFrame(data)
data.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,18,0.195467,foo,2020-01-01 00:00:00,False,1.916013,8,apple,1523,z
1,13,0.093321,foo,2020-01-01 00:01:00,False,-0.93066,6,apple,1458,x
2,84,0.047368,bar,2020-01-01 00:02:00,False,0.62316,8,apple,1330,y
3,17,0.524406,foo,2020-01-01 00:03:00,False,0.529768,7,cherry,1874,z
4,29,0.260558,foo,2020-01-01 00:04:00,False,-1.584884,3,apple,1146,x


### by loops

In [4]:
# simulate on this data structure, making new columns with random uniform 0-1, each time do summary on the new col and one of the existing columns, collecting summaries in a list
def generate_random_column_and_summarize(df, col_name):
    new_col_name = f"random_{col_name}"
    df[new_col_name] = np.random.rand(len(df))

    summary = {
        "column": new_col_name,
        "mean": df[new_col_name].mean(),
        "std": df[new_col_name].std(),
        "min": df[new_col_name].min(),
        "max": df[new_col_name].max(),
        "count": df[new_col_name].count(),
    }

    return summary

In [5]:
# simulate 100 times
summaries = []
for col in range(100):
    summary = generate_random_column_and_summarize(data, col)
    summaries.append(summary)

# Display the summaries
sdf = pd.DataFrame(summaries, columns=["column", "mean", "std", "min", "max", "count"])

sdf.head()

  df[new_col_name] = np.random.rand(len(df))
  df[new_col_name] = np.random.rand(len(df))
  df[new_col_name] = np.random.rand(len(df))
  df[new_col_name] = np.random.rand(len(df))
  df[new_col_name] = np.random.rand(len(df))


Unnamed: 0,column,mean,std,min,max,count
0,random_0,0.49957,0.288715,6.702492e-08,1.0,1000000
1,random_1,0.500299,0.28879,2.114768e-07,0.999998,1000000
2,random_2,0.500032,0.288729,5.351773e-07,1.0,1000000
3,random_3,0.500158,0.288566,7.073605e-07,0.999999,1000000
4,random_4,0.499813,0.288755,5.068415e-07,1.0,1000000


### vectorized 

In [8]:
# make (len(df), 1000) numpy array with random uniform 0-1, and add it to the dataframe in one go
random_cols = np.random.rand(len(data), 500)
print(f"{random_cols.shape=}")
random_cols[:5, :5]

random_cols.shape=(1000000, 500)


array([[0.70314242, 0.22380397, 0.37443675, 0.47814453, 0.03532072],
       [0.87337275, 0.14464855, 0.34109734, 0.93411117, 0.67949773],
       [0.17674818, 0.75153754, 0.45394043, 0.72780007, 0.09674669],
       [0.87418587, 0.67386422, 0.92692087, 0.58631969, 0.48498155],
       [0.89181568, 0.73104451, 0.82865003, 0.29721189, 0.27377307]])

In [9]:
import sys

_ = sys.getsizeof(random_cols)  # size in bytes of the numpy array
print(f"Size of random_cols array: {_/1024**2:.2f} MB")  # convert to MB

Size of random_cols array: 3814.70 MB


In [10]:
# for each column in the random_cols, calc against one of the cols in data, sumamrize and collect in a list
threshold = data["A"].div(100).values.reshape(-1, 1)
threshold[:5]

array([[0.18],
       [0.13],
       [0.84],
       [0.17],
       [0.29]])

In [11]:
random_cols = (random_cols < threshold).astype(int)

In [12]:
print(f"{random_cols.shape=}")
random_cols[:5, :5]

random_cols.shape=(1000000, 500)


array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1]])

In [15]:
random_cols_df = pd.DataFrame(
    random_cols, columns=[f"random_col_{i}" for i in range(random_cols.shape[1])]
)
random_cols_df.mean()

random_col_0      0.494987
random_col_1      0.494590
random_col_2      0.494498
random_col_3      0.494672
random_col_4      0.495708
                    ...   
random_col_495    0.494885
random_col_496    0.495066
random_col_497    0.495504
random_col_498    0.494877
random_col_499    0.495077
Length: 500, dtype: float64