In [1]:
# --- Final PESV (α + δ + γ') Assembly Script ---
# This script merges the three component CSVs (alpha, delta, gamma-prime)
# into a single, final dataset ready for machine learning.
#
# The merge is a plain column-wise concatenation, so it relies on the three
# files having the same number of rows in the same 'filename' order; main()
# verifies both before writing the final CSV.

# Start-of-run banner; printed at module level, before the imports run.
print("--- Initializing Final PESV v2 Assembly Script ---")

import pandas as pd
import numpy as np
import os

# --- PART 1: Configuration ---

# Project folder on the mounted Google Drive; every input file and the
# output file below are resolved relative to it.
BASE_PATH = "/content/drive/MyDrive/1 Skripsi/"


def _in_base(file_name):
    """Return the path of *file_name* inside BASE_PATH."""
    return os.path.join(BASE_PATH, file_name)


# --- Input Files (Source) ---
# One CSV per component, produced by the component generation scripts.
ALPHA_FILE = _in_base("alpha_component_v2.csv")
DELTA_FILE = _in_base("delta_component_v2.csv")
GAMMA_PRIME_FILE = _in_base("gamma_prime_component_v2.csv")

# --- Output File (Destination) ---
FINAL_OUTPUT_CSV = _in_base("final_PESV_dataset_v2.csv")

# --- Label columns ---
# Non-feature columns. They are taken from the Alpha file only and are
# excluded from the Delta / Gamma-Prime frames so they are not duplicated.
LABEL_COLS = [
    'filename',
    'application',
    'category',
    'binary_type',
]


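# --- PART 2: Main Execution ---
# NOTE: the "PART 1" .. "PART 4" banners printed inside main() number the
# runtime steps (load, check, assemble, save), not the sections of this file.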
def main():
    """Merge the Alpha, Delta and Gamma-Prime component CSVs into one dataset.

    Steps:
      1. Load ALPHA_FILE, DELTA_FILE and GAMMA_PRIME_FILE with pandas.
      2. Sanity-check the frames: equal row counts, presence of the label
         columns that are indexed below, and identical 'filename' values
         row for row.
      3. Concatenate column-wise: LABEL_COLS plus the 'alpha_*' columns from
         the Alpha frame, then every non-label column of the Delta and
         Gamma-Prime frames.
      4. Write the result to FINAL_OUTPUT_CSV without the index.

    Load, validation and save failures are reported with print() and end the
    function early; they are not re-raised.

    Returns:
        None
    """
    print("--- PART 1: Loading Component CSVs ---")

    try:
        df_alpha = pd.read_csv(ALPHA_FILE)
        print(f"Loaded Alpha (α) component: {df_alpha.shape}")

        df_delta = pd.read_csv(DELTA_FILE)
        print(f"Loaded Delta (δ) component: {df_delta.shape}")

        df_gamma_prime = pd.read_csv(GAMMA_PRIME_FILE)
        print(f"Loaded Gamma-Prime (γ') component: {df_gamma_prime.shape}")

    except FileNotFoundError as e:
        print("\nFATAL ERROR: File not found.")
        print(f"Could not find: {e.filename}")
        print("Please ensure all three component generation scripts ran successfully.")
        return
    except Exception as e:
        # Script-level boundary: report any other read/parse failure and stop.
        print(f"\nFATAL ERROR: Could not read files. Error: {e}")
        return

    # --- PART 2: Sanity Check ---
    # This is the most important step. The assembly below is a positional,
    # column-wise concat, so all files must be identical in length and order.
    print("\n--- PART 2: Performing Sanity Checks ---")

    # 1. Check lengths
    if not (len(df_alpha) == len(df_delta) == len(df_gamma_prime)):
        print("\nFATAL ERROR: Mismatched row counts!")
        print(f"  Alpha rows: {len(df_alpha)}")
        print(f"  Delta rows: {len(df_delta)}")
        print(f"  Gamma' rows: {len(df_gamma_prime)}")
        print("This should not happen. Please re-run the component scripts.")
        return
    print(f"Row count check PASSED. All files have {len(df_alpha)} rows.")

    # 2. Check that the label columns indexed below actually exist.
    #    Without this, a malformed component CSV surfaces as a bare KeyError
    #    traceback instead of a readable fatal error.
    required_columns = (
        ("Alpha", df_alpha, LABEL_COLS),        # all labels come from Alpha
        ("Delta", df_delta, ['filename']),      # only used for alignment
        ("Gamma'", df_gamma_prime, ['filename']),
    )
    missing_report = []
    for component_name, frame, needed in required_columns:
        missing = [col for col in needed if col not in frame.columns]
        if missing:
            missing_report.append((component_name, missing))
    if missing_report:
        print("\nFATAL ERROR: Missing required label column(s).")
        for component_name, missing in missing_report:
            print(f"  {component_name} is missing: {missing}")
        print("This should not happen. Please re-run the component scripts.")
        return
    print("Label column check PASSED. All required label columns are present.")

    # 3. Check filename order
    alpha_filenames = df_alpha['filename']
    filenames_aligned = (
        (alpha_filenames == df_delta['filename']).all()
        and (alpha_filenames == df_gamma_prime['filename']).all()
    )
    if not filenames_aligned:
        print("\nFATAL ERROR: Mismatched 'filename' order.")
        print("The files are not sorted in the same way. This will corrupt the data.")
        print("This should not happen. Please re-run the component scripts.")
        return
    print("Filename order check PASSED. All files are aligned.")

    # --- PART 3: Assembling Final Dataset ---
    print("\n--- PART 3: Assembling Final Dataset ---")

    # 1. Get the base DataFrame (labels + alpha features)
    alpha_feature_cols = [col for col in df_alpha.columns if col.startswith('alpha_')]
    df_base = df_alpha[LABEL_COLS + alpha_feature_cols]
    print(f"Found {len(alpha_feature_cols)} Alpha (α) features.")

    # 2. Get the Delta features (all columns *except* the labels)
    delta_feature_cols = [col for col in df_delta.columns if col not in LABEL_COLS]
    df_delta_features = df_delta[delta_feature_cols]
    print(f"Found {len(delta_feature_cols)} Delta (δ) features.")

    # 3. Get the Gamma-Prime features (all columns *except* the labels)
    gamma_prime_feature_cols = [col for col in df_gamma_prime.columns if col not in LABEL_COLS]
    df_gamma_prime_features = df_gamma_prime[gamma_prime_feature_cols]
    print(f"Found {len(gamma_prime_feature_cols)} Gamma-Prime (γ') features.")

    # 4. Concatenate all features horizontally
    df_final = pd.concat(
        [df_base, df_delta_features, df_gamma_prime_features],
        axis=1
    )

    total_features = len(alpha_feature_cols) + len(delta_feature_cols) + len(gamma_prime_feature_cols)
    print("\nAssembly complete.")
    print(f"  Final dataset shape: {df_final.shape}")
    print(f"  Total features: {total_features}")

    # 5. Surface column-name collisions between components. concat(axis=1)
    #    keeps both columns under the same name, which is easy to miss and
    #    makes column selection by name ambiguous later on.
    duplicate_cols = df_final.columns[df_final.columns.duplicated()].unique().tolist()
    if duplicate_cols:
        print("\nWARNING: Duplicate column names found in the final dataset:")
        print(f"  {duplicate_cols}")
        print("Consider renaming these columns in the component scripts.")

    # --- PART 4: Saving Final Dataset ---
    print("\n--- PART 4: Saving Final Dataset ---")

    try:
        df_final.to_csv(FINAL_OUTPUT_CSV, index=False)
        print("Successfully saved final 'Robust Model' dataset to:")
        print(FINAL_OUTPUT_CSV)
    except Exception as e:
        # Script-level boundary: report the failure instead of crashing.
        print(f"Error saving final CSV: {e}")


# Script entry point: run the assembly only when Google Drive is mounted,
# otherwise tell the user how to mount it.
if __name__ == "__main__":
    drive_is_mounted = os.path.exists("/content/drive/MyDrive")
    if drive_is_mounted:
        main()
    else:
        print("Please mount your Google Drive first!")
        print("from google.colab import drive; drive.mount('/content/drive')")

# End-of-run banner; sits outside the guard, so it runs whichever branch
# above was taken.
print("\n--- Final PESV v2 Assembly Script Finished ---")


--- Initializing Final PESV v2 Assembly Script ---
--- PART 1: Loading Component CSVs ---
Loaded Alpha (α) component: (10105, 36)
Loaded Delta (δ) component: (10105, 43)
Loaded Gamma-Prime (γ') component: (10105, 41)

--- PART 2: Performing Sanity Checks ---
Row count check PASSED. All files have 10105 rows.
Filename order check PASSED. All files are aligned.

--- PART 3: Assembling Final Dataset ---
Found 32 Alpha (α) features.
Found 39 Delta (δ) features.
Found 37 Gamma-Prime (γ') features.

Assembly complete.
  Final dataset shape: (10105, 112)
  Total features: 108

--- PART 4: Saving Final Dataset ---
Successfully saved final 'Robust Model' dataset to:
/content/drive/MyDrive/1 Skripsi/final_PESV_dataset_v2.csv

--- Final PESV v2 Assembly Script Finished ---
