<a href="https://colab.research.google.com/github/FuzzilyDeveloper/ML_Project/blob/master/tsne_exact_s5e2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSVs are read from there.
drive.mount('/content/drive')

In [None]:
# Install Qiskit-Aer with GPU support
!pip install qiskit==1.2 qiskit-machine-learning qiskit-aer-gpu pandas numpy scikit-learn scipy


In [None]:
!pip install qiskit-algorithms

In [None]:
import pandas as pd
import numpy as np
from qiskit import QuantumCircuit
from qiskit.circuit.library import RealAmplitudes, ZFeatureMap
from qiskit_machine_learning.neural_networks import EstimatorQNN
from qiskit_aer import AerSimulator
from qiskit.primitives import StatevectorEstimator
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize
from sklearn.manifold import TSNE
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [None]:


# Load data
# Both CSVs are read from the playground-series-s5e2 folder on the mounted Drive.
train_path = '/content/drive/MyDrive/playground-series-s5e2/train.csv'
test_path = '/content/drive/MyDrive/playground-series-s5e2/test.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:


# Downsample training data
# A seeded random subset keeps later steps tractable. This rebinds df_train,
# so the full training frame is no longer available after this cell.
train_sample_size = 1500
df_train = df_train.sample(n=train_sample_size, random_state=18)

In [None]:
# Display the downsampled training frame.
df_train

In [None]:

# Hold out 20% of the downsampled rows for validation (seeded split).
df_train_split, df_val = train_test_split(df_train, test_size=0.2, random_state=15)

In [None]:
# Display the training portion of the split.
df_train_split

In [None]:
# Display the validation portion of the split.
df_val

In [None]:


# Feature selection and preprocessing definition

# Manually select features
# Note: 'Compartments' is listed with the categorical columns, so it is
# one-hot encoded rather than scaled.
categorical_cols = ['Brand', 'Material','Size', 'Laptop Compartment',
                    'Waterproof', 'Style','Color','Compartments' ]
numerical_cols = ['Weight Capacity (kg)']

# Preprocessing pipeline
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (first level dropped, dense output).
# Numerical columns: fill gaps with the mean, then standardise.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
        ]), categorical_cols),
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_cols)
    ])

In [None]:


# Prepare data
X_train = df_train_split[categorical_cols + numerical_cols]
y_train = df_train_split['Price']
X_val = df_val[categorical_cols + numerical_cols]
y_val = df_val['Price']
X_test = df_test[categorical_cols + numerical_cols]

# Apply preprocessing
# The imputers, encoder and scaler are fitted on the training split only and
# then re-used for validation and test. Re-fitting on each frame would give
# every frame its own category columns and its own scaling, so the matrices
# would not be comparable (and could differ in width), which breaks the later
# np.vstack of train/validation and the k-NN mapping applied to the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

In [None]:
# Peek at the first preprocessed training row.
X_train_processed[0]

In [None]:
# Peek at the first preprocessed validation row.
X_val_processed[0]

In [None]:


# t-SNE dimensionality reduction for training and validation
# Training and validation rows are stacked and embedded together; train_idx
# records where the training rows end so the embedding can be split again.
X_train_val_processed = np.vstack([X_train_processed, X_val_processed])
train_idx = len(X_train_processed)
n_components = 14
# NOTE(review): method='exact' is presumably used because scikit-learn documents
# the Barnes-Hut mode as limited to fewer than 4 output dimensions — confirm.
# tsne = TSNE(n_components=n_components, method='barnes_hut', random_state=42, n_iter=300)
tsne = TSNE(n_components=n_components, method='exact', random_state=42, n_iter=300)
X_train_val_reduced = tsne.fit_transform(X_train_val_processed)

In [None]:
# Peek at the first stacked (pre-t-SNE) row.
X_train_val_processed[0]

In [None]:
# Peek at the first row of the t-SNE embedding.
X_train_val_reduced[0]

In [None]:


# Split back into training and validation
# Training rows were stacked first, so the first train_idx rows of the joint
# embedding are the training rows and the remainder are the validation rows.
X_train_reduced = X_train_val_reduced[:train_idx]
X_val_reduced = X_train_val_reduced[train_idx:]

In [None]:

# Approximate t-SNE for test set using k-NN
# A k-NN regressor learns the mapping from preprocessed features to t-SNE
# coordinates on the training rows, then predicts coordinates for test rows.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_processed, X_train_reduced)
X_test_reduced = knn.predict(X_test_processed)

print(f"t-SNE applied with {n_components} components (training/validation), k-NN for test set")

In [None]:

# Target range used for scaling
# Only the min/max of the training target are recorded here; the scaling
# itself is applied where the targets are consumed.
y_min, y_max = y_train.min(), y_train.max()

In [None]:

# Quantum circuit
# One qubit per t-SNE component: a ZFeatureMap for the inputs followed by a
# RealAmplitudes ansatz holding the trainable weights.
num_qubits = n_components
feature_map = ZFeatureMap(feature_dimension=num_qubits, reps=1)
ansatz = RealAmplitudes(num_qubits=num_qubits, reps=1)
circuit = QuantumCircuit(num_qubits)
# Order matters: the feature map is applied first, then the ansatz.
circuit.compose(feature_map, inplace=True)
circuit.compose(ansatz, inplace=True)

In [None]:

# Simulator and QNN
# NOTE: aer_simulator is created but not passed to the QNN in the cells
# visible here; the QNN is evaluated through StatevectorEstimator.
aer_simulator = AerSimulator(method='statevector', device='CPU')
estimator = StatevectorEstimator()
# Take the input and weight parameters from the circuits that own them rather
# than slicing circuit.parameters: the slice only works while the feature-map
# parameter names happen to sort before the ansatz parameter names.
qnn = EstimatorQNN(
    circuit=circuit,
    estimator=estimator,
    input_params=feature_map.parameters,
    weight_params=ansatz.parameters
)

In [None]:

# Training setup
num_weights = len(qnn.weight_params)
# Draw the starting weights from a seeded generator so that training runs are
# repeatable; values are uniform in [0, 1) as before. A local generator is
# used so the global NumPy random state is left untouched.
initial_weights = np.random.default_rng(42).random(num_weights)
current_weights = initial_weights.copy()
# Early-stopping bookkeeping: best weights/loss seen on the validation set and
# the number of consecutive epochs without improvement.
best_weights = current_weights.copy()
best_val_loss = float('inf')
patience = 2
patience_counter = 0

In [None]:







# Training loop
# Each epoch fits the weights on up to 5 mini-batches of 50 rows with COBYLA,
# then scores the validation set and applies early stopping.
for epoch in range(2):
    print(f"Epoch {epoch + 1}")
    batch_num = 0
    for start_idx in range(0, len(X_train_reduced), 50):
        # Cap the number of mini-batches processed per epoch.
        if batch_num >= 5:
            break
        end_idx = min(start_idx + 50, len(X_train_reduced))
        X_batch = X_train_reduced[start_idx:end_idx]
        if isinstance(y_train, pd.Series):
            y_batch = y_train.iloc[start_idx:end_idx]
        else:
            y_batch = y_train[start_idx:end_idx]

        print(f"Training on batch {batch_num + 1}")
        # Map targets onto [-1, 1] using the training-set price range.
        y_batch_scaled = 2 * (y_batch - y_min) / (y_max - y_min) - 1

        def batch_loss(weights):
            """Mean squared error on the current batch plus an L2 weight penalty."""
            residuals = qnn.forward(X_batch, weights).flatten() - y_batch_scaled
            return np.mean(residuals ** 2) + 0.005 * np.sum(weights ** 2)

        result = minimize(
            batch_loss,
            current_weights,
            method='COBYLA',
            options={'maxiter': 20}
        )
        # Warm-start the next batch from this batch's optimum.
        current_weights = result.x
        batch_num += 1

    # Validation loss is measured in the same scaled target space.
    val_scaled = 2 * (y_val - y_min) / (y_max - y_min) - 1
    val_predictions = qnn.forward(X_val_reduced, current_weights).flatten()
    val_loss = mean_squared_error(val_scaled, val_predictions)
    print(f"Validation Loss: {val_loss:.4f}")

    improved = val_loss < best_val_loss
    if not improved:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break
        continue

    # New best epoch: remember its weights and reset the patience counter.
    best_val_loss = val_loss
    best_weights = current_weights.copy()
    patience_counter = 0

In [None]:


# Training predictions
# Run the best weights over the training embedding in 50-row slices and map
# the network outputs back from [-1, 1] to the price range.
train_predictions = []
train_actual = []
for start_idx in range(0, len(X_train_reduced), 50):
    end_idx = min(start_idx + 50, len(X_train_reduced))
    X_batch = X_train_reduced[start_idx:end_idx]
    if isinstance(y_train, pd.Series):
        y_batch = y_train.iloc[start_idx:end_idx]
    else:
        y_batch = y_train[start_idx:end_idx]
    y_pred = qnn.forward(X_batch, best_weights).flatten()
    # Inverse of the 2 * (y - y_min) / (y_max - y_min) - 1 target scaling.
    y_pred_scaled = y_min + (y_pred + 1) * (y_max - y_min) / 2
    train_predictions.extend(y_pred_scaled)
    train_actual.extend(y_batch)

mse = mean_squared_error(train_actual, train_predictions)
print(f"Training MSE: {mse:.4f}")

# Test predictions in chunks
# Same forward pass and inverse scaling as for the training rows, applied to
# the k-NN-approximated test embedding one chunk at a time.
test_predictions = []
batch_size = 10000  # Process test set in smaller chunks
for start_idx in range(0, len(X_test_reduced), batch_size):
    end_idx = min(start_idx + batch_size, len(X_test_reduced))
    X_batch = X_test_reduced[start_idx:end_idx]
    y_pred = qnn.forward(X_batch, best_weights).flatten()
    y_pred_scaled = y_min + (y_pred + 1) * (y_max - y_min) / 2
    test_predictions.extend(y_pred_scaled)

# Output results
print("\nSample Training Predictions (first 5):")
for actual, predicted in zip(train_actual[:5], train_predictions[:5]):
    print(f"Actual: {actual:.2f}, Predicted: {predicted:.2f}")

# Attach the predictions to the test frame so they can be inspected in place.
df_test['Predicted_Price'] = test_predictions
print("\nSample Test Predictions (first 5 rows):")
print(df_test.head())

# Ensure submission has 200,000 rows
df_test_pred = df_test[['id', 'Predicted_Price']]
df_test_pred.columns = ['id', 'Price']
n_rows = len(df_test_pred)
if n_rows != 200000:
    raise ValueError(f"Submission has {n_rows} rows, expected 200,000")
df_test_pred.to_csv('submission.csv', index=False)
print("\nTest predictions saved to 'submission.csv' with 200,000 rows")

# ***QSVR with quantum kernel***

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from qiskit import QuantumCircuit
from qiskit.circuit.library import ZFeatureMap
from qiskit.quantum_info import Statevector
from qiskit_algorithms.utils import algorithm_globals
import matplotlib.pyplot as plt
import umap
import warnings
import datetime
warnings.filterwarnings('ignore')

# Function to print timestamped messages
def print_progress(message):
    """Print ``message`` prefixed with the current local time in square brackets.

    The timestamp is formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{timestamp}] {message}")

# Set random seed for reproducibility
# Seeds both NumPy's global generator and qiskit_algorithms' global seed.
np.random.seed(42)
algorithm_globals.random_seed = 42

# Load Kaggle dataset
print_progress("Loading datasets...")
train_df = pd.read_csv('/content/drive/MyDrive/playground-series-s5e2/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/playground-series-s5e2/test.csv')

# Remove rows with missing values from training set only
print_progress("Removing missing values from training set...")
train_df = train_df.dropna()

# Define features
# Here 'Compartments' is treated as a numerical column.
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
numerical_cols = ['Weight Capacity (kg)', 'Compartments']
target = 'Price'

# Impute missing values in test set
# NOTE(review): both imputers are fitted on the test set itself, not on the
# training data — confirm this is intended.
print_progress("Imputing missing values in test set...")
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')
test_df[categorical_cols] = cat_imputer.fit_transform(test_df[categorical_cols])
test_df[numerical_cols] = num_imputer.fit_transform(test_df[numerical_cols])

# Preprocessing pipeline with sparse output for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ])

# Select the feature columns and the target
print_progress("Preprocessing training and test data...")
X_train = train_df[categorical_cols + numerical_cols]
y_train = train_df[target]
X_test = test_df[categorical_cols + numerical_cols]

# Apply preprocessing
# Fitted on the training rows only; the test rows re-use the fitted transformer.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert sparse matrix to dense for UMAP
# The hasattr check leaves the matrix untouched if it is already dense.
X_train_preprocessed = X_train_preprocessed.toarray() if hasattr(X_train_preprocessed, 'toarray') else X_train_preprocessed
X_test_preprocessed = X_test_preprocessed.toarray() if hasattr(X_test_preprocessed, 'toarray') else X_test_preprocessed

# Subsample training data for UMAP and quantum kernel
n_samples = 100  # Small sample for fast computation
print_progress(f"Subsampling {n_samples} training samples...")
# Row positions are drawn without replacement; the same positions select the
# matching targets via .iloc.
sample_indices = np.random.choice(X_train_preprocessed.shape[0], n_samples, replace=False)
X_train_sample = X_train_preprocessed[sample_indices]
y_train_sample = y_train.iloc[sample_indices].values

# Apply UMAP for dimensionality reduction
n_components = 2
print_progress("Applying UMAP to training sample...")
umap_model = umap.UMAP(n_components=n_components, random_state=42, n_jobs=1)
X_train_transformed = umap_model.fit_transform(X_train_sample)
print_progress("UMAP transformation for training sample completed.")

# Define simpler quantum feature map
# One qubit per UMAP component.
feature_map = ZFeatureMap(feature_dimension=n_components, reps=1)

# Function to compute approximate quantum kernel matrix
def quantum_kernel(X1, X2, feature_map, sample_pairs=1000):
    """Approximate the fidelity kernel between the rows of ``X1`` and ``X2``.

    At most ``sample_pairs`` entries ``(i, j)`` are drawn at random without
    replacement. For each, the circuit ``feature_map(X1[i])`` followed by the
    inverse of ``feature_map(X2[j])`` is simulated and the probability of the
    all-zero basis state is stored as the kernel value. Every entry that was
    not drawn is filled with the mean of the computed entries.

    Parameters:
        X1: Indexable collection of parameter vectors (kernel rows).
        X2: Indexable collection of parameter vectors (kernel columns).
        feature_map: Parameterised circuit providing ``num_qubits``,
            ``assign_parameters`` and ``inverse``.
        sample_pairs: Upper bound on the number of entries computed exactly.

    Returns:
        A ``(len(X1), len(X2))`` NumPy array of kernel values.
    """
    n_rows, n_cols = len(X1), len(X2)
    total_pairs = n_rows * n_cols
    kernel_matrix = np.zeros((n_rows, n_cols))
    # Track which entries were actually computed. Using the value 0 as the
    # "missing" marker would overwrite genuinely computed zero overlaps.
    computed = np.zeros((n_rows, n_cols), dtype=bool)

    # Randomly sample pairs to approximate kernel
    n_pairs = min(sample_pairs, total_pairs)
    indices = np.random.choice(total_pairs, n_pairs, replace=False)
    for idx in indices:
        # Flat index -> (row, column) of the kernel matrix.
        i, j = divmod(idx, n_cols)
        qc = QuantumCircuit(feature_map.num_qubits)
        qc.compose(feature_map.assign_parameters(X1[i]), inplace=True)
        qc.compose(feature_map.assign_parameters(X2[j]).inverse(), inplace=True)
        state = Statevector.from_instruction(qc)
        # Probability of measuring |0...0> after encode / inverse-encode.
        kernel_matrix[i, j] = np.abs(state.data[0]) ** 2
        computed[i, j] = True

    # Fill remaining entries with average kernel value
    if n_pairs < total_pairs:
        # With no computed entries at all, fall back to 1.0 as before.
        avg_kernel = kernel_matrix[computed].mean() if n_pairs > 0 else 1.0
        kernel_matrix[~computed] = avg_kernel
    return kernel_matrix

# Compute training kernel
# With the 100-row sample the kernel has 10,000 entries; only sample_pairs of
# them are simulated and the rest are filled in by quantum_kernel.
print_progress("Computing training kernel...")
train_kernel = quantum_kernel(X_train_transformed, X_train_transformed, feature_map, sample_pairs=1000)
print_progress("Training kernel computation completed.")

# Train QSVR model
# An SVR with a precomputed kernel consumes the matrix computed above.
print_progress("Training QSVR model...")
qsvr = SVR(kernel='precomputed')
qsvr.fit(train_kernel, y_train_sample)
print_progress("QSVR model training completed.")

# Batch processing for test set
batch_size = 500  # Small for fast processing
y_pred_test = []
# Ceiling division so a final partial batch is counted.
total_batches = (X_test_preprocessed.shape[0] + batch_size - 1) // batch_size
print_progress(f"Starting test set prediction with {total_batches} batches...")
for i in range(0, X_test_preprocessed.shape[0], batch_size):
    batch_num = i // batch_size + 1
    print_progress(f"Processing test batch {batch_num}/{total_batches}...")
    X_test_batch = X_test_preprocessed[i:i + batch_size]
    # Project the batch with the UMAP model fitted on the training sample.
    X_test_batch_transformed = umap_model.transform(X_test_batch)
    # Rows are test points, columns are the training sample points.
    test_kernel_batch = quantum_kernel(X_test_batch_transformed, X_train_transformed, feature_map, sample_pairs=500)
    y_pred_batch = qsvr.predict(test_kernel_batch)
    y_pred_test.extend(y_pred_batch)
print_progress("Test set prediction completed.")

# Convert predictions to numpy array and round to 2 decimal places
y_pred_test = np.round(np.array(y_pred_test), 2)

# Create submission file
print_progress("Creating submission file...")
submission = pd.DataFrame({'id': test_df['id'], 'Price': y_pred_test})
# Verify submission size
# A wrong row count is only reported; the file is still written.
if len(submission) != 200000:
    print_progress(f"Warning: Submission has {len(submission)} rows, expected 200000.")
else:
    print_progress("Submission has correct number of rows (200000).")
submission.to_csv('submission_qsvr_umap_full.csv', index=False)
print_progress("Submission file 'submission_qsvr_umap_full.csv' created successfully!")

# Evaluate on training sample
# This is an in-sample error: the model is scored on the kernel it was fitted on.
print_progress("Evaluating training performance...")
y_pred_train = qsvr.predict(train_kernel)
mse = mean_squared_error(y_train_sample, y_pred_train)
print_progress(f"Training Mean Squared Error: {mse:.4f}")

# Plot predictions vs. true values for sampled training data
print_progress("Generating training prediction plot...")
plt.scatter(y_train_sample, y_pred_train, color='blue', label='Predicted vs. True')
# Diagonal reference line: points on it are predicted exactly.
plt.plot([y_train_sample.min(), y_train_sample.max()], [y_train_sample.min(), y_train_sample.max()], 'r--', label='Ideal')
plt.xlabel('True Price')
plt.ylabel('Predicted Price')
plt.title('QSVR with UMAP: True vs. Predicted Prices (Training Sample)')
plt.legend()
# The figure is written to disk and then closed.
plt.savefig('qsvr_backpack_umap_full_prediction.png')
plt.close()
print_progress("Plot 'qsvr_backpack_umap_full_prediction.png' saved.")

# ***Quantum kernel with t-SNE***

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from qiskit import QuantumCircuit
from qiskit.circuit.library import ZFeatureMap
from qiskit.quantum_info import Statevector
from qiskit_algorithms.utils import algorithm_globals
import matplotlib.pyplot as plt
import warnings
import datetime
warnings.filterwarnings('ignore')

# Function to print timestamped messages
def print_progress(message):
    """Print ``message`` prefixed with the current local time in square brackets.

    The timestamp is formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{timestamp}] {message}")

# Set random seed for reproducibility
# Seeds both NumPy's global generator and qiskit_algorithms' global seed.
np.random.seed(42)
algorithm_globals.random_seed = 42

# Load Kaggle dataset
print_progress("Loading datasets...")
train_df = pd.read_csv('/content/drive/MyDrive/playground-series-s5e2/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/playground-series-s5e2/test.csv')

# Remove rows with missing values from training set only
print_progress("Removing missing values from training set...")
train_df = train_df.dropna()

# Define features
# Here 'Compartments' is treated as a numerical column.
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
numerical_cols = ['Weight Capacity (kg)', 'Compartments']
target = 'Price'

# Impute missing values in test set
# NOTE(review): both imputers are fitted on the test set itself, not on the
# training data — confirm this is intended.
print_progress("Imputing missing values in test set...")
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')
test_df[categorical_cols] = cat_imputer.fit_transform(test_df[categorical_cols])
test_df[numerical_cols] = num_imputer.fit_transform(test_df[numerical_cols])

# Preprocessing pipeline with sparse output for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ])

# Select the feature columns and the target
print_progress("Preprocessing training and test data...")
X_train = train_df[categorical_cols + numerical_cols]
y_train = train_df[target]
X_test = test_df[categorical_cols + numerical_cols]

# Apply preprocessing
# Fitted on the training rows only; the test rows re-use the fitted transformer.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert sparse matrix to dense for t-SNE
# The hasattr check leaves the matrix untouched if it is already dense.
X_train_preprocessed = X_train_preprocessed.toarray() if hasattr(X_train_preprocessed, 'toarray') else X_train_preprocessed
X_test_preprocessed = X_test_preprocessed.toarray() if hasattr(X_test_preprocessed, 'toarray') else X_test_preprocessed

# Subsample training data for t-SNE and quantum kernel
n_samples = 100  # Small sample for fast computation
print_progress(f"Subsampling {n_samples} training samples...")
# Row positions are drawn without replacement; the same positions select the
# matching targets via .iloc.
sample_indices = np.random.choice(X_train_preprocessed.shape[0], n_samples, replace=False)
X_train_sample = X_train_preprocessed[sample_indices]
y_train_sample = y_train.iloc[sample_indices].values

# Apply t-SNE (exact method) for dimensionality reduction
n_components = 2
print_progress("Applying t-SNE (exact method) to training sample...")
tsne_model = TSNE(n_components=n_components, method='exact', random_state=42, n_jobs=1)
X_train_transformed = tsne_model.fit_transform(X_train_sample)
print_progress("t-SNE transformation for training sample completed.")

# Define simpler quantum feature map
# One qubit per t-SNE component.
feature_map = ZFeatureMap(feature_dimension=n_components, reps=1)

# Function to compute approximate quantum kernel matrix
def quantum_kernel(X1, X2, feature_map, sample_pairs=1000):
    """Approximate the fidelity kernel between the rows of ``X1`` and ``X2``.

    At most ``sample_pairs`` entries ``(i, j)`` are drawn at random without
    replacement. For each, the circuit ``feature_map(X1[i])`` followed by the
    inverse of ``feature_map(X2[j])`` is simulated and the probability of the
    all-zero basis state is stored as the kernel value. Every entry that was
    not drawn is filled with the mean of the computed entries.

    Parameters:
        X1: Indexable collection of parameter vectors (kernel rows).
        X2: Indexable collection of parameter vectors (kernel columns).
        feature_map: Parameterised circuit providing ``num_qubits``,
            ``assign_parameters`` and ``inverse``.
        sample_pairs: Upper bound on the number of entries computed exactly.

    Returns:
        A ``(len(X1), len(X2))`` NumPy array of kernel values.
    """
    n_rows, n_cols = len(X1), len(X2)
    total_pairs = n_rows * n_cols
    kernel_matrix = np.zeros((n_rows, n_cols))
    # Track which entries were actually computed. Using the value 0 as the
    # "missing" marker would overwrite genuinely computed zero overlaps.
    computed = np.zeros((n_rows, n_cols), dtype=bool)

    # Randomly sample pairs to approximate kernel
    n_pairs = min(sample_pairs, total_pairs)
    indices = np.random.choice(total_pairs, n_pairs, replace=False)
    for idx in indices:
        # Flat index -> (row, column) of the kernel matrix.
        i, j = divmod(idx, n_cols)
        qc = QuantumCircuit(feature_map.num_qubits)
        qc.compose(feature_map.assign_parameters(X1[i]), inplace=True)
        qc.compose(feature_map.assign_parameters(X2[j]).inverse(), inplace=True)
        state = Statevector.from_instruction(qc)
        # Probability of measuring |0...0> after encode / inverse-encode.
        kernel_matrix[i, j] = np.abs(state.data[0]) ** 2
        computed[i, j] = True

    # Fill remaining entries with average kernel value
    if n_pairs < total_pairs:
        # With no computed entries at all, fall back to 1.0 as before.
        avg_kernel = kernel_matrix[computed].mean() if n_pairs > 0 else 1.0
        kernel_matrix[~computed] = avg_kernel
    return kernel_matrix

# Compute training kernel
# With the 100-row sample the kernel has 10,000 entries; only sample_pairs of
# them are simulated and the rest are filled in by quantum_kernel.
print_progress("Computing training kernel...")
train_kernel = quantum_kernel(X_train_transformed, X_train_transformed, feature_map, sample_pairs=1000)
print_progress("Training kernel computation completed.")

# Train QSVR model
# An SVR with a precomputed kernel consumes the matrix computed above.
print_progress("Training QSVR model...")
qsvr = SVR(kernel='precomputed')
qsvr.fit(train_kernel, y_train_sample)
print_progress("QSVR model training completed.")

# Batch processing for test set
batch_size = 500  # Small for fast processing
y_pred_test = []
# Ceiling division so a final partial batch is counted.
total_batches = (X_test_preprocessed.shape[0] + batch_size - 1) // batch_size

# Out-of-sample embedding for the test rows.
# t-SNE cannot project new points into an existing embedding, and calling
# fit_transform on each test batch would give every batch its own coordinate
# system, unrelated to the training embedding the kernel is evaluated against.
# Instead a k-NN regressor learns preprocessed features -> training t-SNE
# coordinates once and is used to place test rows in that same embedding.
from sklearn.neighbors import KNeighborsRegressor
embedding_knn = KNeighborsRegressor(n_neighbors=min(5, len(X_train_sample)))
embedding_knn.fit(X_train_sample, X_train_transformed)

print_progress(f"Starting test set prediction with {total_batches} batches...")
for i in range(0, X_test_preprocessed.shape[0], batch_size):
    batch_num = i // batch_size + 1
    print_progress(f"Processing test batch {batch_num}/{total_batches}...")
    X_test_batch = X_test_preprocessed[i:i + batch_size]
    X_test_batch_transformed = embedding_knn.predict(X_test_batch)
    # Rows are test points, columns are the training sample points.
    test_kernel_batch = quantum_kernel(X_test_batch_transformed, X_train_transformed, feature_map, sample_pairs=500)
    y_pred_batch = qsvr.predict(test_kernel_batch)
    y_pred_test.extend(y_pred_batch)
print_progress("Test set prediction completed.")

# Convert predictions to numpy array and round to 2 decimal places
y_pred_test = np.round(np.array(y_pred_test), 2)

# Create submission file
print_progress("Creating submission file...")
submission = pd.DataFrame({'id': test_df['id'], 'Price': y_pred_test})
# Verify submission size
# A wrong row count is only reported; the file is still written.
if len(submission) != 200000:
    print_progress(f"Warning: Submission has {len(submission)} rows, expected 200000.")
else:
    print_progress("Submission has correct number of rows (200000).")
submission.to_csv('submission_qsvr_tsne_full.csv', index=False)
print_progress("Submission file 'submission_qsvr_tsne_full.csv' created successfully!")

# Evaluate on training sample
# This is an in-sample error: the model is scored on the kernel it was fitted on.
print_progress("Evaluating training performance...")
y_pred_train = qsvr.predict(train_kernel)
mse = mean_squared_error(y_train_sample, y_pred_train)
print_progress(f"Training Mean Squared Error: {mse:.4f}")

# Plot predictions vs. true values for sampled training data
print_progress("Generating training prediction plot...")
plt.scatter(y_train_sample, y_pred_train, color='blue', label='Predicted vs. True')
# Diagonal reference line: points on it are predicted exactly.
plt.plot([y_train_sample.min(), y_train_sample.max()], [y_train_sample.min(), y_train_sample.max()], 'r--', label='Ideal')
plt.xlabel('True Price')
plt.ylabel('Predicted Price')
plt.title('QSVR with t-SNE: True vs. Predicted Prices (Training Sample)')
plt.legend()
# The figure is written to disk and then closed.
plt.savefig('qsvr_backpack_tsne_full_prediction.png')
plt.close()
print_progress("Plot 'qsvr_backpack_tsne_full_prediction.png' saved.")