### Section C - (B)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

In [None]:
# Seed NumPy and TensorFlow with one shared value for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [None]:
# Function to load and process data files
def _read_objects(branches, columns, event_index, count):
    """
    Build one dictionary per object (jet or track) of a single event.

    Args:
        branches: Mapping-like object indexed as branches[name][event][object]
        columns: List of (output key, branch name) pairs to read
        event_index: Index of the event inside the tree
        count: Number of leading objects to read

    Returns:
        List of `count` dictionaries, each holding one value per output key
    """
    # Look up each branch's per-event row once instead of once per object.
    per_event = [(key, branches[name][event_index]) for key, name in columns]
    return [{key: values[j] for key, values in per_event} for j in range(count)]


def load_data(file_pattern, label, max_jets=2, max_tracks=20):
    """
    Load and process data files, assigning a label for each event.

    Every file matching `file_pattern` is opened with uproot and its "events"
    tree is read. Files are processed in sorted order so the resulting event
    order is deterministic. A file that cannot be read (including the case
    where uproot is not installed) is reported on stdout and skipped; it
    contributes no events.

    Args:
        file_pattern: Pattern for glob to find files
        label: Label for this category (0=bb, 1=cc, 2=ss)
        max_jets: Maximum number of leading jets kept per event
        max_tracks: Maximum number of leading tracks kept per event

    Returns:
        List of event dictionaries with keys 'n_jet', 'n_trk', 'jets',
        'tracks' and 'label'. Empty if no file matched or none could be read.
    """
    jet_columns = [("pt", "jet_pt"), ("eta", "jet_eta"), ("phi", "jet_phi"), ("e", "jet_e")]
    optional_jet_columns = [("btag", "jet_btag")]
    track_columns = [("pt", "trk_pt"), ("eta", "trk_eta"), ("phi", "trk_phi")]
    optional_track_columns = [("d0", "trk_d0"), ("z0", "trk_z0")]

    events = []

    for file_path in sorted(glob.glob(file_pattern)):
        print(f"Loading {file_path}...")
        try:
            # Imported lazily so the rest of the notebook works without uproot.
            import uproot

            with uproot.open(file_path) as file:
                tree = file["events"]
                # Get all branches
                branches = tree.arrays()

                # Ask the tree which branches exist. With uproot's default
                # Awkward Array output, `name in branches` searches the stored
                # values rather than the field names, so it cannot be used to
                # detect optional branches.
                available = set(tree.keys())
                jets_to_read = jet_columns + [
                    column for column in optional_jet_columns if column[1] in available
                ]
                tracks_to_read = track_columns + [
                    column for column in optional_track_columns if column[1] in available
                ]

                file_events = []
                for i in range(len(branches["n_jet"])):
                    # Plain ints keep min()/range() and later DataFrame use simple.
                    n_jet = int(branches["n_jet"][i])
                    n_trk = int(branches["n_trk"][i])
                    file_events.append({
                        'n_jet': n_jet,
                        'n_trk': n_trk,
                        'jets': _read_objects(branches, jets_to_read, i, min(n_jet, max_jets)),
                        'tracks': _read_objects(branches, tracks_to_read, i, min(n_trk, max_tracks)),
                        'label': label,
                    })

            # Only keep a file's events once the whole file was read successfully.
            events.extend(file_events)

        except Exception as e:
            # Best-effort loading: report the problem and move on to the next file.
            # This is a placeholder - an alternative loading method is not implemented.
            print(f"Error loading file with uproot: {e}")
            print("Please ensure uproot is installed or provide an alternative loading method.")

    return events

In [None]:
# Placeholder glob patterns for the ROOT files of each sample - replace with actual paths
bb_files = "path/to/bb_files/*.root"  # bb sample; replace with actual path
cc_files = "path/to/cc_files/*.root"  # cc sample; replace with actual path
ss_files = "path/to/ss_files/*.root"  # ss sample; replace with actual path

# For demonstration, placeholder (synthetic) data is created instead of
# reading the files above.
# In an actual solution, load real data using the load_data function above.
print("Creating placeholder data for demonstration...")
# Re-seed NumPy here so the random draws that follow start from a known state.
np.random.seed(42)

In [None]:
# Function to create sample synthetic event data
def create_sample_data(n_events, flavor='b'):
    """
    Create synthetic events for demonstration.

    Random numbers are drawn from NumPy's global generator, so the output
    depends on the current seed state.

    Args:
        n_events: Number of events to generate
        flavor: 'b' or 'c'; any other value uses the 's' settings

    Returns:
        List of event dictionaries with keys 'n_jet', 'n_trk', 'jets',
        'tracks' and 'label' (0 for 'b', 1 for 'c', 2 otherwise)
    """
    # Per-flavor settings:
    # (label, jet pt mean, jet pt std, mean track multiplicity,
    #  mean b-tag score, d0 width, z0 width)
    profiles = {
        'b': (0, 40, 15, 10, 0.8, 0.02, 0.05),
        'c': (1, 35, 10, 8, 0.5, 0.01, 0.03),
    }
    strange_profile = (2, 30, 8, 6, 0.2, 0.005, 0.01)

    chosen = profiles[flavor] if flavor in ('b', 'c') else strange_profile
    label, pt_mean, pt_std, track_mult, btag_mean, d0_width, z0_width = chosen

    def _make_jet():
        # Draw order matters for reproducibility: pt, eta, phi, energy smearing, btag.
        pt = max(5, np.random.normal(pt_mean, pt_std))
        eta = np.random.normal(0, 1)
        phi = np.random.uniform(-np.pi, np.pi)
        energy = pt * np.cosh(eta) * (1 + np.random.normal(0, 0.1))
        btag = np.clip(np.random.normal(btag_mean, 0.2), 0, 1)
        return {'pt': pt, 'eta': eta, 'phi': phi, 'e': energy, 'btag': btag}

    def _make_track():
        # Dict entries are evaluated top to bottom, which fixes the draw order.
        return {
            'pt': max(0.5, np.random.exponential(5)),
            'eta': np.random.normal(0, 1.5),
            'phi': np.random.uniform(-np.pi, np.pi),
            'd0': np.random.normal(0, d0_width),
            'z0': np.random.normal(0, z0_width),
        }

    events = []
    for _ in range(n_events):
        jet_count = np.random.randint(2, 5)
        track_count = np.random.poisson(track_mult)

        # All jets of an event are generated before any of its tracks.
        jets = [_make_jet() for _ in range(jet_count)]
        tracks = [_make_track() for _ in range(track_count)]

        events.append({
            'n_jet': jet_count,
            'n_trk': track_count,
            'jets': jets,
            'tracks': tracks,
            'label': label,
        })

    return events

# Create synthetic data for demonstration
bb_events = create_sample_data(1000, 'b')
cc_events = create_sample_data(1000, 'c')
ss_events = create_sample_data(1000, 's')

# Combine all events
all_events = bb_events + cc_events + ss_events
np.random.shuffle(all_events)

print(f"Created {len(all_events)} synthetic events")
print(f"b-jets: {len(bb_events)}, c-jets: {len(cc_events)}, s-jets: {len(ss_events)}")

# Explore the data structure
sample_event = all_events[0]
# An event may have no jets or no tracks (the track count is a Poisson draw),
# so only index the first element when it exists.
first_jet = sample_event['jets'][0] if sample_event['jets'] else None
first_track = sample_event['tracks'][0] if sample_event['tracks'] else None
print("\nSample event structure:")
print(f"Label: {sample_event['label']} ({['b', 'c', 's'][sample_event['label']]})")
print(f"Number of jets: {sample_event['n_jet']}")
print(f"Number of tracks: {sample_event['n_trk']}")
print(f"First jet properties: {first_jet}")
print(f"First track properties: {first_track}")

# Extract some features for visualization
flavor_labels = [event['label'] for event in all_events]
n_jets = [event['n_jet'] for event in all_events]
n_tracks = [event['n_trk'] for event in all_events]
btag_values = []
leading_jet_pt = []

for event in all_events:
    # Each quantity falls back to NaN on its own, so a jet without a b-tag
    # score still contributes its pT.
    leading_jet = event['jets'][0] if event['jets'] else {}
    btag_values.append(leading_jet.get('btag', float('nan')))
    leading_jet_pt.append(leading_jet.get('pt', float('nan')))

# Create a dataframe for easier visualization
df = pd.DataFrame({
    'flavor': [['b', 'c', 's'][label] for label in flavor_labels],
    'n_jets': n_jets,
    'n_tracks': n_tracks,
    'btag': btag_values,
    'leading_jet_pt': leading_jet_pt
})

# Visualize some distributions: one stacked histogram per column on a 2x2 grid.
# Each entry is (column, binning options, title, x-axis label).
histograms = [
    ('n_jets', {'discrete': True}, 'Number of Jets by Flavor', 'Number of Jets'),
    ('n_tracks', {'bins': 20}, 'Number of Tracks by Flavor', 'Number of Tracks'),
    ('btag', {'bins': 20}, 'b-tag Score by Flavor', 'b-tag Score'),
    ('leading_jet_pt', {'bins': 20}, 'Leading Jet pT by Flavor', 'Leading Jet pT (GeV)'),
]

plt.figure(figsize=(15, 10))

for position, (column, binning, title, xlabel) in enumerate(histograms, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(data=df, x=column, hue='flavor', multiple='stack', **binning)
    plt.title(title)
    plt.xlabel(xlabel)

plt.tight_layout()
plt.savefig('data_exploration.png')
plt.close()

print("\nData exploration completed and visualizations saved.")

In [None]:
# Feature extraction for the neural network
def extract_features(events, max_jets=4, max_tracks=20):
    """
    Pack event dictionaries into fixed-size, zero-padded arrays.

    Only the first `max_jets` jets and `max_tracks` tracks of each event are
    used; unused slots stay zero. Missing 'btag', 'd0' and 'z0' values are
    stored as 0, all other features are required.

    Returns:
        X_jets: Array of shape (n_events, max_jets, 5) with
            pt, eta, phi, e, btag per jet
        X_tracks: Array of shape (n_events, max_tracks, 5) with
            pt, eta, phi, d0, z0 per track
        y: Integer labels, one per event
    """
    jet_width = 5    # pt, eta, phi, e, btag
    track_width = 5  # pt, eta, phi, d0, z0
    total = len(events)

    # Zero-initialised buffers double as the padding for short events.
    X_jets = np.zeros((total, max_jets, jet_width))
    X_tracks = np.zeros((total, max_tracks, track_width))
    y = np.zeros(total, dtype=int)

    for row, event in enumerate(events):
        y[row] = event['label']

        # Write each object's whole feature vector in one assignment.
        for slot, jet in enumerate(event['jets'][:max_jets]):
            X_jets[row, slot, :] = (
                jet['pt'],
                jet['eta'],
                jet['phi'],
                jet['e'],
                jet.get('btag', 0),
            )

        for slot, track in enumerate(event['tracks'][:max_tracks]):
            X_tracks[row, slot, :] = (
                track['pt'],
                track['eta'],
                track['phi'],
                track.get('d0', 0),
                track.get('z0', 0),
            )

    return X_jets, X_tracks, y

In [None]:
# Extract features
X_jets, X_tracks, y = extract_features(all_events)

# Split data into training, validation, and test sets.
# Both splits are stratified on the class label so that every subset keeps the
# same b/c/s proportions as the full sample; otherwise per-class metrics on the
# validation and test sets depend on how the random split happened to fall.
# First hold out 20% of all events as the test set.
X_jets_train, X_jets_test, X_tracks_train, X_tracks_test, y_train, y_test = train_test_split(
    X_jets, X_tracks, y, test_size=0.2, random_state=42, stratify=y
)

# Then take 25% of the remaining 80% as validation, i.e. 20% of all events.
X_jets_train, X_jets_val, X_tracks_train, X_tracks_val, y_train, y_val = train_test_split(
    X_jets_train, X_tracks_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

print("Data split complete:")
print(f"Training: {len(y_train)} samples")
print(f"Validation: {len(y_val)} samples")
print(f"Test: {len(y_test)} samples")

In [None]:

# Normalize the data (this is important for neural networks)
# For simplicity, we'll use a simple approach here
def normalize_features(X_train, X_val, X_test):
    """
    Standardize three arrays feature by feature using training statistics.

    Each array is flattened to 2D with the last axis kept as the feature
    axis, so every row along the leading axes (padding rows included) counts
    as one sample. The scaler is fitted on the training array only and then
    applied to all three.

    Returns:
        Tuple of scaled (train, val, test) arrays, each with the shape of
        its input
    """
    scaler = StandardScaler()
    # Fit on training data only so validation/test statistics do not leak in.
    scaler.fit(X_train.reshape(-1, X_train.shape[-1]))

    def _scale(block):
        # Flatten to (samples, features), scale, then restore the original shape.
        flat = block.reshape(-1, block.shape[-1])
        return scaler.transform(flat).reshape(block.shape)

    return _scale(X_train), _scale(X_val), _scale(X_test)

# Jets and tracks have different features, so each gets its own scaler.
(
    X_jets_train_norm,
    X_jets_val_norm,
    X_jets_test_norm,
) = normalize_features(X_jets_train, X_jets_val, X_jets_test)

(
    X_tracks_train_norm,
    X_tracks_val_norm,
    X_tracks_test_norm,
) = normalize_features(X_tracks_train, X_tracks_val, X_tracks_test)

print("Data normalization complete.")

In [None]:

# Build a model using the Keras functional API
# Jets and tracks are each encoded by Dense layers followed by a bidirectional LSTM
def build_model(max_jets, max_tracks, n_jet_features, n_track_features, n_classes=3):
    """
    Build and compile a two-input neural network for event classification.

    The jet sequence and the track sequence are encoded by separate but
    identically shaped branches, concatenated, and passed through a small
    dense head ending in a softmax over `n_classes`.

    Args:
        max_jets: Number of jet slots per event
        max_tracks: Number of track slots per event
        n_jet_features: Number of features per jet
        n_track_features: Number of features per track
        n_classes: Number of output classes

    Returns:
        Compiled keras.Model taking [jet_input, track_input]
    """
    def _sequence_branch(length, width, input_name):
        # Input -> Dense(64) -> Dense(32) -> bidirectional LSTM(32).
        seq_input = keras.Input(shape=(length, width), name=input_name)
        hidden = seq_input
        for units in (64, 32):
            hidden = layers.Dense(units, activation='relu')(hidden)
        encoded = layers.Bidirectional(layers.LSTM(32))(hidden)
        return seq_input, encoded

    # The jet branch is created first, then the track branch.
    jet_input, jet_lstm = _sequence_branch(max_jets, n_jet_features, 'jet_input')
    track_input, track_lstm = _sequence_branch(max_tracks, n_track_features, 'track_input')

    # Combine jet and track encodings and classify
    head = layers.Concatenate()([jet_lstm, track_lstm])
    for units in (64, 32):
        head = layers.Dense(units, activation='relu')(head)
        head = layers.Dropout(0.2)(head)

    # Softmax output for multi-class classification
    output = layers.Dense(n_classes, activation='softmax', name='output')(head)

    model = keras.Model(inputs=[jet_input, track_input], outputs=output)

    # Labels are integer class indices, hence the sparse loss.
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

In [None]:
# Build the model
# Axis 1 of each training array is the number of object slots per event and
# axis 2 is the number of features per object.
max_jets, n_jet_features = X_jets_train.shape[1:]
max_tracks, n_track_features = X_tracks_train.shape[1:]

model = build_model(
    max_jets=max_jets,
    max_tracks=max_tracks,
    n_jet_features=n_jet_features,
    n_track_features=n_track_features,
)

# Print model summary
model.summary()

In [None]:

# Train the model
# Stop after 5 epochs without improvement and restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
# Halve the learning rate after 3 epochs without improvement.
reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3)

history = model.fit(
    [X_jets_train_norm, X_tracks_train_norm],
    y_train,
    validation_data=([X_jets_val_norm, X_tracks_val_norm], y_val),
    epochs=30,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
)

In [None]:

# Plot training history: accuracy on the left, loss on the right
plt.figure(figsize=(12, 5))

# Each entry is (training key, validation key, panel title, y-axis label).
history_panels = [
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
]

for panel_index, (train_key, val_key, panel_title, y_label) in enumerate(history_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.savefig('training_history.png')
plt.close()

In [None]:

# Evaluate the model on the test set
class_names = ['b-jet', 'c-jet', 's-jet']
test_inputs = [X_jets_test_norm, X_tracks_test_norm]

test_loss, test_acc = model.evaluate(test_inputs, y_test)
print(f"\nTest accuracy: {test_acc:.4f}")

# Class probabilities per event, and the most probable class
y_pred_prob = model.predict(test_inputs)
y_pred = np.argmax(y_pred_prob, axis=1)

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.close()

# Plot one-vs-all ROC curves, one per class
from sklearn.metrics import roc_curve, auc

roc_class_names = ['b-jet', 'c-jet', 's-jet']

plt.figure(figsize=(10, 8))

for i, flavor in enumerate(roc_class_names):
    # Treat class i as positive and the other classes as negative.
    y_test_binary = (y_test == i).astype(int)

    # Score each event by its predicted probability for class i.
    fpr, tpr, _ = roc_curve(y_test_binary, y_pred_prob[:, i])
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=2, label=f'{flavor} (AUC = {roc_auc:.2f})')

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc="lower right")
plt.grid(True)
plt.savefig('roc_curves.png')
plt.close()

# Visualize classifier outputs: one panel per true flavor
output_class_names = ['b-jet', 'c-jet', 's-jet']

plt.figure(figsize=(15, 5))

for i, flavor in enumerate(output_class_names):
    plt.subplot(1, 3, i + 1)

    # Test events whose true label is this flavor
    true_indices = np.flatnonzero(y_test == i)

    # Overlay the predicted probability of every class for those events
    for j, pred_flavor in enumerate(output_class_names):
        plt.hist(
            y_pred_prob[true_indices, j],
            bins=25,
            alpha=0.7,
            label=f'Prob({pred_flavor})',
            range=(0, 1),
        )

    plt.title(f'True {flavor}')
    plt.xlabel('Classifier Output')
    plt.ylabel('Events')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('classifier_outputs.png')
plt.close()