# Notebook and Environment Setup 
This initial cell sets the working directory to the project root (`des646_project`) and adds that directory to `sys.path`, so that the rest of the notebook can use project-relative paths and local imports.


In [None]:
import os
import sys

# Name of the directory that marks the top level of the project.
PROJECT_ROOT_NAME = 'des646_project'


def find_project_root(start_dir, root_name=PROJECT_ROOT_NAME):
    """Return the nearest ancestor of ``start_dir`` (inclusive) named ``root_name``.

    Only the path string is inspected; the filesystem is not touched.
    Directory names are compared through ``os.path.normcase`` so the match
    follows the platform's case rules.

    Args:
        start_dir: Directory to start searching from.
        root_name: Exact directory name that identifies the project root.

    Returns:
        The absolute path of the matching directory, or ``None`` when no
        component of ``start_dir`` is named ``root_name``.
    """
    wanted = os.path.normcase(root_name)
    current = os.path.abspath(start_dir)
    while True:
        # Compare the whole directory name, so 'my_des646_project' does not match.
        if os.path.normcase(os.path.basename(current)) == wanted:
            return current
        parent = os.path.dirname(current)
        if parent == current:
            # dirname() of the filesystem root is the root itself: nothing left to check.
            return None
        current = parent


# Directory the kernel is currently running in.
notebook_dir = os.path.abspath(os.getcwd())

# Walk upwards from the CWD: this covers running from the project root,
# from 'testing', or from any deeper sub-folder of the project.
project_root = find_project_root(notebook_dir)
if project_root is None:
    # No ancestor carries the project name; keep the CWD rather than guess.
    print(f"WARNING: '{PROJECT_ROOT_NAME}' not found in {notebook_dir}; using the current directory as project root.")
    project_root = notebook_dir

# Set the Current Working Directory (CWD) to the project root
os.chdir(project_root)

# Add the project root to the Python path for local imports (to find 'influence' folder)
if project_root not in sys.path:
    sys.path.append(project_root)

print("Current Working Directory:", os.getcwd())
print("Notebook path configured.")

Current Working Directory: c:\Users\Shrimi Agrawal\OneDrive\Desktop\DES646\des646_project
Notebook path configured.


# Load and Prepare Data

This cell handles **loading and preprocessing** of the dataset:

- Loads **embeddings** (`.npy`) and **labels** from the `testing/output` folder.  
- Loads **sensitive attributes** from `influence/sensitive_attr.npy`.  
- Handles **file not found errors** gracefully.  
- Converts string/object labels to **integer codes** using `LabelEncoder` for PyTorch compatibility.  
- Prints the shapes of the loaded arrays and the label data type (before and after encoding) for verification.


In [None]:
import numpy as np
import os
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Input locations, all relative to the current working directory
# (expected to be the project root).
base_dir = "testing/output"
embeddings_path = os.path.join(base_dir, "embeddings_resnet18.npy")
labels_path = os.path.join(base_dir, "labels.npy")

# Path to the sensitive attribute file
sensitive_attr_path = os.path.join("influence", "sensitive_attr.npy")

# --- Load Data ---
print(" Loading Embeddings and Labels...")
try:
    embeddings = np.load(embeddings_path)
    labels = np.load(labels_path)

    # NOTE(security): allow_pickle=True lets np.load unpickle objects, which can
    # execute arbitrary code. Only load this file from a trusted source.
    sensitive_attr = np.load(sensitive_attr_path, allow_pickle=True)

    print(f" Data loaded: Embeddings {embeddings.shape}, Labels {labels.shape}")
    print(f" Sensitive Attribute loaded: {sensitive_attr.shape}")
except FileNotFoundError as e:
    print(f" ERROR: Data files not found. Cannot proceed. Error: {e}")
    # If any file is missing, reset all three arrays to empty so the
    # names are always defined after this cell.
    embeddings = np.array([])
    labels = np.array([])
    sensitive_attr = np.array([])

# --- Label Encoding ---
if labels.size > 0:
    print(f"\nLabels data type before encoding: {labels.dtype}")
    if labels.dtype == object or labels.dtype.kind in ('U', 'S'):  # string/object labels
        print(" Converting string labels to integer codes (required for PyTorch Loss)...")
        le = LabelEncoder()
        labels_encoded = le.fit_transform(labels)
        class_names = le.classes_
    else:
        # Non-string labels are used as-is.
        labels_encoded = labels
        class_names = np.unique(labels)

    print(f"Encoded labels shape: {labels_encoded.shape}, Data Type: {labels_encoded.dtype}")
    print(f"Total unique classes: {len(class_names)}")
else:
    labels_encoded = np.array([])
    # Define class_names on the empty path too, so code that reads it
    # does not hit a NameError after a failed load.
    class_names = np.array([])


 Loading Embeddings and Labels...
 Data loaded: Embeddings (200, 512), Labels (200,)
 Sensitive Attribute loaded: (200,)

Labels data type before encoding: <U9
 Converting string labels to integer codes (required for PyTorch Loss)...
Encoded labels shape: (200,), Data Type: int64
Total unique classes: 10
