In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings
# NOTE(review): with no category or module filter, this suppresses every
# warning for the rest of the session, including any that pandas raises
# while parsing the CSV below.
warnings.filterwarnings('ignore')

# Load the dataset from a local CSV file.
# The file is not committed to GitHub because it is large (around 9 GB).
df = pd.read_csv('data/2019-Nov.csv')
# Print (rows, columns) as a quick sanity check on the load.
print(f"Dataset shape: {df.shape}")

Dataset shape: (67501979, 9)


In [4]:
import pandas as pd
import numpy as np

# Read the data with optimized memory usage
def read_large_wide_dataset(file_path, dtype=None):
    """Read a CSV into a DataFrame, storing ``col_<i>`` columns as float32.

    The float32 mapping is built only for columns that actually appear in
    the file's header. Generating an entry for every possible ``col_<i>``
    name up front means allocating tens of millions of dict entries before
    a single row is parsed, even when the file has only a handful of
    columns and none of them carries such a name.

    Parameters
    ----------
    file_path : str, path-like or file-like
        CSV source, passed to ``pd.read_csv``. A seekable file-like object
        is rewound to its starting position after the header is inspected;
        for a non-seekable stream pass ``dtype`` explicitly so the header
        peek is skipped.
    dtype : dict or dtype, optional
        Explicit dtype specification forwarded to ``pd.read_csv``. When
        given, the header is not inspected and no mapping is generated.

    Returns
    -------
    pandas.DataFrame
        The parsed file. Columns not covered by the dtype mapping use
        pandas' default type inference.
    """
    prefix = "col_"
    # Upper bound on the generated column index: names run from
    # col_0 up to col_67499999.
    max_generated_cols = 67500000

    def is_generated_name(name):
        # True for "col_<i>" with a canonical (no leading zero) decimal
        # index below the bound, i.e. exactly what f"col_{i}" produces.
        if not isinstance(name, str) or not name.startswith(prefix):
            return False
        suffix = name[len(prefix):]
        return (
            suffix.isascii()
            and suffix.isdigit()
            and str(int(suffix)) == suffix
            and int(suffix) < max_generated_cols
        )

    if dtype is None:
        # Peek at the header row only (nrows=0 parses no data rows) so the
        # mapping is sized by the real column count.
        seekable = hasattr(file_path, "seek") and hasattr(file_path, "tell")
        start_pos = file_path.tell() if seekable else None
        header = pd.read_csv(file_path, nrows=0).columns
        if seekable:
            file_path.seek(start_pos)
        dtype = {name: np.float32 for name in header if is_generated_name(name)}

    # An empty mapping is passed as None so pandas infers every column.
    # low_memory=False parses the file in one pass rather than in internal
    # chunks, which avoids mixed-type inference within a column.
    return pd.read_csv(file_path, dtype=dtype or None, low_memory=False)

# Load the data through read_large_wide_dataset.
# NOTE(review): this reads the same path as the earlier pd.read_csv cell and
# rebinds `df`, so the file is parsed a second time.
df = read_large_wide_dataset('data/2019-Nov.csv')

# Remove columns with any null values
def remove_columns_with_nulls(df, batch_size=10000):
    """Return a copy of ``df`` restricted to columns that contain no nulls.

    Columns are scanned in batches of ``batch_size`` so the temporary
    boolean mask produced by ``notna()`` covers only one batch at a time.
    Surviving columns are tracked by position rather than by label, so
    frames with duplicate column names are handled correctly: selecting by
    label would return every column sharing that label, including ones
    that do contain nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    batch_size : int, default 10000
        Number of columns inspected per iteration. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The null-free columns of ``df`` in their original order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    n_cols = df.shape[1]
    keep_positions = []

    for start_idx in range(0, n_cols, batch_size):
        end_idx = min(start_idx + batch_size, n_cols)
        batch = df.iloc[:, start_idx:end_idx]

        # One boolean per column in this batch: True when it has no nulls.
        complete = batch.notna().all().to_numpy(dtype=bool)
        # Shift batch-local offsets back to positions in the full frame.
        keep_positions.extend((start_idx + np.flatnonzero(complete)).tolist())

        print(f"Processed columns {start_idx} to {end_idx}")

    return df.iloc[:, keep_positions]

# Clean the dataset: keep only the columns that contain no null values.
cleaned_df = remove_columns_with_nulls(df)
# Print both shapes so the number of dropped columns is visible.
print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {cleaned_df.shape}")

Processed columns 0 to 9
Original shape: (67501979, 9)
Cleaned shape: (67501979, 6)
