# 01. Preprocessing - Data Format Benchmark Test

## Dependencies & Constants

In [2]:
import pandas as pd
from pathlib import Path
import time
import pyarrow.feather as feather
import os

from jupyter_init import setup
setup()

from src_code.config import DEFECTORS_DIR

RAW_DATASET_FILE = DEFECTORS_DIR / "line_bug_prediction_splits/random/train.parquet.gzip"

# i need to find the first feather file where the raw dataset is stored
FTH_DATASET_FILE = RAW_DATASET_FILE.with_suffix("").with_suffix(".feather")

DATASET_DIR = RAW_DATASET_FILE.parent
OUTPUT_DIR = DATASET_DIR / "bench_formats"

# load the raw dataset
df = pd.read_parquet(RAW_DATASET_FILE)
print(df.columns)



[CONFIG] Config file directory: C:\Users\fmojt\Code\Software Projects\DiplomaThesis\src_code\config.py
[CONFIG] Project root directory: C:\Users\fmojt\Code\Software Projects\DiplomaThesis
[CONFIG] Data directory: C:\Users\fmojt\Code\Software Projects\DiplomaThesis\data
[CONFIG] Bug inducing commits directory: C:\Users\fmojt\Code\Software Projects\DiplomaThesis\data\bug_inducing_commits
Index(['datetime', 'commit', 'repo', 'filepath', 'content', 'methods',
       'lines'],
      dtype='object')


## Parquet -> Feather

### Why Feather?

In [None]:
def measure_time(label, func):
    """Measure read time."""
    start = time.perf_counter()
    df = func()
    end = time.perf_counter()
    print(f"{label:<25} {end - start:.4f} seconds   (rows={len(df):,})")
    return df

OUTPUT_DIR.mkdir(exist_ok=True)

print("Loading original .parquet.gzip...")
df = measure_time("parquet(gzip) read", lambda: pd.read_parquet(RAW_DATASET_FILE))

# Save alternative formats
print("\nSaving to other formats...")

# 1) Feather
feather_file = OUTPUT_DIR / "train.feather"
feather.write_feather(df, feather_file)

# 2) Parquet uncompressed
parquet_fast = OUTPUT_DIR / "train_uncompressed.parquet"
df.to_parquet(parquet_fast, compression=None)

# 3) Pickle
pickle_file = OUTPUT_DIR / "train.pkl"
df.to_pickle(pickle_file)

# 4) CSV (optional — slowest & largest)
# csv_file = OUTPUT_DIR / "train.csv"
# df.to_csv(csv_file, index=False)


print("\nBenchmarking load speeds:")
measure_time("parquet (gzip)", lambda: pd.read_parquet(RAW_DATASET_FILE))
measure_time("parquet (raw)", lambda: pd.read_parquet(parquet_fast))
measure_time("feather", lambda: feather.read_feather(feather_file))
measure_time("pickle", lambda: pd.read_pickle(pickle_file))
# measure_time("csv", lambda: pd.read_csv(csv_file))

# deleting the bench formats
for file in OUTPUT_DIR.iterdir():
    os.remove(file)
os.rmdir(OUTPUT_DIR)

Loading original .parquet.gzip...
parquet(gzip) read        1.9627 seconds   (rows=185,369)

Saving to other formats...

Benchmarking load speeds:
parquet (gzip)            1.6495 seconds   (rows=185,369)
parquet (raw)             0.8892 seconds   (rows=185,369)
feather                   0.7575 seconds   (rows=185,369)
pickle                    2.1212 seconds   (rows=185,369)


### Saving All data

In [13]:
def to_feather_path(parquet_path: Path) -> Path:
    name = parquet_path.name

    if name.endswith(".parquet.gzip"):
        base = name[:-len(".parquet.gzip")]
    elif name.endswith(".parquet"):
        base = name[:-len(".parquet")]
    else:
        raise ValueError("File does not look like a parquet file: " + name)

    return parquet_path.with_name(base + ".feather")

def convert_parquet_to_feather(parquet_path: Path):
    # feather_path = parquet_path.with_suffix(".feather")
    feather_path = to_feather_path(parquet_path)


    # Avoid overwriting if already converted
    if feather_path.exists():
        print(f"[SKIP] {feather_path.name} already exists.")
        return

    print(f"[LOAD]  {parquet_path}")
    start = time.perf_counter()
    df = pd.read_parquet(parquet_path)
    load_time = time.perf_counter() - start

    print(f"[SAVE]  {feather_path}")
    start = time.perf_counter()
    df.to_feather(feather_path)
    save_time = time.perf_counter() - start

    print(f"→ done: load={load_time:.3f}s, save={save_time:.3f}s\n")


def find_all_parquets(root: Path):
    """Find .parquet or .parquet.gzip recursively."""
    for path in root.rglob("*"):
        if path.suffix in [".parquet", ".gzip"] and "parquet" in path.name:
            yield path

start = time.perf_counter()
print(f"Searching for .parquet/.parquet.gzip in {DEFECTORS_DATA_DIR}")
parquet_files = list(find_all_parquets(DEFECTORS_DATA_DIR))

print(f"Found {len(parquet_files)} parquet files.\n")

for p in parquet_files:
    convert_parquet_to_feather(p)

print("All conversions done.")
end = time.perf_counter()
print(f"Total time: {end - start:.2f} seconds")

Searching for .parquet/.parquet.gzip in ..\..\data\defectors
Found 12 parquet files.

[LOAD]  ..\..\data\defectors\line_bug_prediction_splits\random\test.parquet.gzip
[SAVE]  ..\..\data\defectors\line_bug_prediction_splits\random\test.feather
→ done: load=0.900s, save=0.860s

[LOAD]  ..\..\data\defectors\line_bug_prediction_splits\random\train.parquet.gzip
[SAVE]  ..\..\data\defectors\line_bug_prediction_splits\random\train.feather
→ done: load=22.290s, save=24.863s

[LOAD]  ..\..\data\defectors\line_bug_prediction_splits\random\val.parquet.gzip
[SAVE]  ..\..\data\defectors\line_bug_prediction_splits\random\val.feather
→ done: load=1.158s, save=0.909s

[LOAD]  ..\..\data\defectors\line_bug_prediction_splits\time\test.parquet.gzip
[SAVE]  ..\..\data\defectors\line_bug_prediction_splits\time\test.feather
→ done: load=0.988s, save=1.128s

[LOAD]  ..\..\data\defectors\line_bug_prediction_splits\time\train.parquet.gzip
[SAVE]  ..\..\data\defectors\line_bug_prediction_splits\time\train.feath

### Exploring dataset

In [21]:
def find_feather_files(root: Path):
    """Return all .feather files recursively."""
    return list(root.rglob("*.feather"))


def summarize_feather(path: Path):
    print(f"\n=== {path} ===")
    print(f"File size: {path.stat().st_size / (1024**2):.2f} MB")

    # Load
    df = pd.read_feather(path)

    # Basic shape
    print(f"Rows: {len(df):,}")
    print(f"Columns: {len(df.columns)}")

    # Schema: dtype + non-null count
    print("\nColumn summary:")
    summary = (
        df.dtypes.to_frame("dtype")
        .join(df.notnull().sum().to_frame("non_null"))
        .join(df.isnull().sum().to_frame("nulls"))
    )
    print(summary)

    # Optional: show first 3 rows as a preview
    print("\nPreview (first 3 rows):")
    print(df.head(3))


# ---- RUN ----

feather_files = find_feather_files(DEFECTORS_DATA_DIR)
print(f"Found {len(feather_files)} feather datasets.\n")

for f in feather_files:
    summarize_feather(f)

Found 12 feather datasets.


=== ..\..\data\defectors\line_bug_prediction_splits\random\test.feather ===
File size: 73.22 MB
Rows: 10,000
Columns: 7

Column summary:
                                          dtype  non_null  nulls
datetime  datetime64[us, pytz.FixedOffset(240)]     10000      0
commit                                   object     10000      0
repo                                     object     10000      0
filepath                                 object     10000      0
content                                  object      9769    231
methods                                  object     10000      0
lines                                    object     10000      0

Preview (first 3 rows):
                   datetime                                    commit  \
1 2021-07-13 15:35:10+04:00  000fbe63d390c59b9c1e29216c35fc52b991f2f3   
4 2022-07-11 13:12:55+04:00  038d5338530411bb47283fda1e84dec91137880b   
7 2014-07-16 08:51:12+04:00  0786e84a33155ebc8d8d3502e3a7f3060b86a4ec 

### Opening feather

In [20]:
fth_df = pd.read_feather(FTH_DATASET_FILE)

In [None]:
open_start = time.time()
df = pd.read_pickle(DATASET_DIR / "train.pkl")

open_end = time.time()
print(f"Time to open parquet: {open_end - open_start} seconds")

# print(df['commit'].head())       # show first rows
# Count occurrences of each commit
commit_counts = df['commit'].value_counts()

# Filter commits that occur more than once
duplicate_commits = commit_counts[commit_counts > 1].index

# Get first 5 commits that occur more than once
first_five_duplicates = duplicate_commits[:5]

print(first_five_duplicates)
print(df.info())   

Time to open parquet: 8.17880654335022 seconds
Index(['011ea55a8f1630842c67603ac601d4d7ef6ccef9',
       'bd12c4bfe7da8dcb37db8d6b081a1aa5f2cb517c',
       'e0e57b4beb9809883d5ef0df0d0367385b7c8aa3',
       '257ac9d17581bb67c24e7148e7bab37fc28ec64c',
       '60d4d5e1aaa9fde3cf541ee335e284d05e75679c'],
      dtype='object', name='commit')
<class 'pandas.core.frame.DataFrame'>
Index: 185369 entries, 0 to 43564
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype                                 
---  ------    --------------   -----                                 
 0   datetime  185369 non-null  datetime64[us, pytz.FixedOffset(-120)]
 1   commit    185369 non-null  object                                
 2   repo      185369 non-null  object                                
 3   filepath  185369 non-null  object                                
 4   content   182680 non-null  object                                
 5   methods   185369 non-null  object                     