**Imports**

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import simplejson
import matplotlib.pyplot as plt
from pathlib import Path
from typing import List, Dict, Tuple
from sklearn.linear_model import LinearRegression
import os
import time

**Code**

In [12]:
file_path = "large_users.csv"

# First feasibility check: look at the on-disk size before deciding how to load the file.
size_bytes = os.stat(file_path).st_size
size_mb = size_bytes / (1024 * 1024)  # bytes -> units of 1024 * 1024 bytes

print(f"File size: {size_mb:.2f}mb")

File size: 11.36mb


In [4]:
# Stream the CSV in 50,000-row chunks so the whole file never has to sit in memory.
# The reader is used as a context manager so its file handle is closed even if
# the loop raises (e.g. a missing "income" column).
total_rows = 0
income_sum = 0.0
income_count = 0  # number of non-missing income values seen so far

with pd.read_csv("large_users.csv", chunksize=50_000) as chunks:
    for chunk in chunks:
        incomes = chunk["income"]
        total_rows += len(chunk)
        # Series.sum() skips NaN by default, so the divisor must count only the
        # non-missing values; dividing by len(chunk) would bias the mean downward
        # whenever an income is missing.
        income_sum += incomes.sum()
        income_count += int(incomes.count())

# An empty file (or an all-missing column) has no defined mean: report NaN
# instead of failing with a division by zero.
mean_income = income_sum / income_count if income_count else float("nan")

print(f"Total rows processed: {total_rows}")
print(f"Mean income: {mean_income:,.2f}")

Total rows processed: 200000
Mean income: 24,917.92


In [5]:
# Read only the first 50,000 rows of the CSV (nrows takes rows from the top of
# the file; it is not a random draw).
sample_df = pd.read_csv("large_users.csv", nrows=50_000)

# Show the first five records to confirm the columns and value formats.
print(sample_df.head(5))

# Frequency of each distinct 'city' label among the loaded rows.
# NOTE(review): the recorded output lists both "nyc" and "new york" as separate
# labels - possibly the same city; confirm before aggregating by city.
city_counts = sample_df["city"].value_counts()
print("\nCity distribution (sample of 50,000 rows):", city_counts, sep="\n")

   user_id username  age        income           city signup_date
0        1   user_1   56  28473.787893       new york  2023-12-13
1        2   user_2   45  12524.154525            nyc  2023-08-28
2        3   user_3   27  14453.826563  san francisco  2024-06-27
3        4   user_4   52  13555.088861       new york  2021-04-23
4        5   user_5   65  32232.827981    los angeles  2024-04-09

City distribution (sample of 50,000 rows):
city
nyc              8465
houston          8390
san francisco    8315
new york         8304
los angeles      8271
chicago          8255
Name: count, dtype: int64


In [6]:
# Load the first 50,000 rows into a separate frame used for the timing comparison.
sample = pd.read_csv("large_users.csv", nrows=50_000)

# Vectorized operation (fast): a single division applied to the whole column.
# %timeit repeats the statement many times; each run overwrites the same column.
%timeit sample["income_scaled_vec"] = sample["income"] / 1000.0

# Using .apply() with lambda (slower): the lambda is invoked for each value.
# In the recorded run below this took about 3.44 ms versus 35.2 μs for the
# vectorized form, i.e. roughly 100x slower.
%timeit sample["income_scaled_apply"] = sample["income"].apply(lambda x: x / 1000)

35.2 μs ± 117 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
3.44 ms ± 10.8 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
# Avoid
# Anti-pattern kept on purpose for comparison: iterrows() walks the frame one
# row at a time in Python instead of operating on the whole column.
clean_values = [record["income"] / 1000.0 for _, record in sample_df.iterrows()]

sample_df["income_scaled_loop"] = clean_values

In [7]:
# Recommended
# Whole-column arithmetic: divide every income by 1000 in one operation.
sample_df["income_scaled"] = sample_df["income"].div(1000.0)

# Loop-free conditional flag: the comparison yields a boolean Series, and the
# cast to int turns it into a 0/1 column.
threshold = 1_000_000  # Example threshold
is_high_income = sample_df["income"].gt(threshold)
sample_df["high_flag"] = is_high_income.astype(int)

# Quick check of the source column next to the two derived ones.
preview_columns = ["income", "income_scaled", "high_flag"]
print(sample_df[preview_columns].head())

         income  income_scaled  high_flag
0  28473.787893      28.473788          0
1  12524.154525      12.524155          0
2  14453.826563      14.453827          0
3  13555.088861      13.555089          0
4  32232.827981      32.232828          0


In [11]:
def clean_chunk(chunk):
    """Return a copy of ``chunk``.

    Placeholder for real cleaning logic: at present the only work done is
    calling ``chunk.copy()``, so the object passed in is not modified here.
    """
    duplicate = chunk.copy()
    return duplicate

# Wall-clock timing of a single clean_chunk call on the sample frame.
start = time.perf_counter()
cleaned = clean_chunk(sample_df)
stop = time.perf_counter()
elapsed = stop - start

print(f"Chunk cleaned in {elapsed:.3f} seconds")

Chunk cleaned in 0.022 seconds
