In [2]:
import pandas as pd
import numpy as np
import os

In [5]:
# --- Configuration ---

# This is the big file you downloaded from Kaggle
SOURCE_FILENAME = 'accepted_2007_to_2018Q4.csv' 

# This is the new, smaller file we will create
SAMPLE_FILENAME = 'data_sample.csv' 

# Define your desired sample size
# 150k-200k is a good target for this project
TARGET_SAMPLE_SIZE = 150000 

# Define the chunk size to read the large file
# This loads 100k rows into memory at a time
CHUNK_SIZE = 100000 

# Check if the source file exists
if not os.path.exists(SOURCE_FILENAME):
    print(f"Error: Source file not found at '{SOURCE_FILENAME}'")
    print("Please download it from Kaggle and place it in the same directory.")
else:
    print(f"Found source file: '{SOURCE_FILENAME}'")

Found source file: 'accepted_2007_to_2018Q4.csv'


In [6]:
# --- File Sampling ---
# We will read the file in chunks, take a random sample from each chunk,
# and append it to a list. This is memory-efficient.

print(f"Starting to sample '{SOURCE_FILENAME}'...")

# Calculate the fraction of each chunk to sample
# We need to estimate the total number of rows first.
# A simple way is to get the file size.
# A 1GB file might have ~4M rows. A 2.5GB file ~10M.
# Let's do a quick pass to count rows for a more accurate sample fraction.

print("First pass: Counting total rows for accurate sampling fraction...")
total_rows = 0
with pd.read_csv(SOURCE_FILENAME, chunksize=CHUNK_SIZE, usecols=['id']) as reader:
    for chunk in reader:
        total_rows += len(chunk)
print(f"Total rows found: {total_rows}")

# Calculate the sampling fraction
sample_frac = TARGET_SAMPLE_SIZE / total_rows
print(f"Target sample size: {TARGET_SAMPLE_SIZE}")
print(f"Sampling fraction to apply to each chunk: {sample_frac:.6f}")

# --- Second Pass: Sampling ---
print("Second pass: Reading chunks and sampling...")
sample_chunks = []
with pd.read_csv(SOURCE_FILENAME, chunksize=CHUNK_SIZE) as reader:
    for chunk in reader:
        # Take a random sample from this chunk
        chunk_sample = chunk.sample(frac=sample_frac)
        sample_chunks.append(chunk_sample)

# Concatenate all the small samples into one final DataFrame
final_sample_df = pd.concat(sample_chunks, ignore_index=True)

print("Sampling complete.")
print(f"Final sample shape: {final_sample_df.shape}")

Starting to sample 'accepted_2007_to_2018Q4.csv'...
First pass: Counting total rows for accurate sampling fraction...


  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


Total rows found: 2260701
Target sample size: 150000
Sampling fraction to apply to each chunk: 0.066351
Second pass: Reading chunks and sampling...


  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


Sampling complete.
Final sample shape: (149998, 151)


In [7]:
# --- Save and Verify ---

if final_sample_df.shape[0] > 0:
    # Save the new sample DataFrame to a CSV
    final_sample_df.to_csv(SAMPLE_FILENAME, index=False)
    print(f"Successfully saved sample to '{SAMPLE_FILENAME}'")

    # Verification step:
    print("\nVerifying sample file...")
    try:
        df_check = pd.read_csv(SAMPLE_FILENAME)
        print(f"Successfully read back sample file. Shape: {df_check.shape}")
        print("\nFirst 5 rows of the sample:")
        print(df_check.head())
    except Exception as e:
        print(f"Error reading back sample file: {e}")
else:
    print("Error: The final sample DataFrame is empty. Check your file or logic.")

print("\n--- Phase 1.2 (Data Sampling) is complete. ---")

Successfully saved sample to 'data_sample.csv'

Verifying sample file...


  df_check = pd.read_csv(SAMPLE_FILENAME)


Successfully read back sample file. Shape: (149998, 151)

First 5 rows of the sample:
         id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  67575322        NaN    27000.0      27000.0          26575.0   60 months   
1  62569701        NaN    28000.0      28000.0          28000.0   36 months   
2  62390325        NaN    10500.0      10500.0          10500.0   36 months   
3  66410552        NaN    14400.0      14400.0          14400.0   36 months   
4  64240494        NaN    35000.0      35000.0          34825.0   60 months   

   int_rate  installment grade sub_grade  ... hardship_payoff_balance_amount  \
0     23.99       776.58     F        F3  ...                            NaN   
1     11.53       923.73     B        B5  ...                            NaN   
2      5.32       316.21     A        A1  ...                            NaN   
3     12.59       482.36     C        C2  ...                            NaN   
4     22.99       986.47     F        F