In [None]:
# PCA to (at most) 1000 components, followed by logistic regression.
#
# Notes on this cell:
#   * The debug prints use print's multi-argument form; concatenating a str
#     with a type object raises TypeError, and the features variable is the
#     upper-case `X` (a lower-case `x` is never defined in this cell).
#   * The train/test split happens BEFORE PCA, and PCA is fitted on the
#     training rows only, so held-out rows do not influence the projection.
#   * The number of components is capped, because scikit-learn's PCA rejects
#     n_components larger than min(n_samples, n_features).

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load data from Excel file
data = pd.read_excel('/content/drive/MyDrive/_projects/dr-amira/data.xlsx')
print("data:", type(data))

# Separate features (X) and labels (y)
X = data.iloc[:, 1:]  # Assuming features start from the second column
y = data.iloc[:, 0]   # Assuming labels are in the first column
print("x:", type(X))

# Split the raw data into training and testing sets (80/20 split).
# Same random_state as before, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Reduce dimensionality to 1000 components where the data allows it.
n_components = min(1000, X_train_raw.shape[0], X_train_raw.shape[1])
pca = PCA(n_components=n_components)
X_train = pca.fit_transform(X_train_raw)  # fit on training rows only
X_test = pca.transform(X_test_raw)        # reuse the fitted projection

# Train a classifier (logistic regression) on the transformed features.
# max_iter is raised above the default of 100 because the solver may not
# converge that quickly on up to 1000 unscaled components.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# PCA to 2 components: standardize the features, project them onto the first
# two principal components, and draw one scatter series per label value.
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Read the spreadsheet; column 0 is taken as the label, the rest as features.
data = pd.read_excel('/content/drive/MyDrive/_projects/dr-amira/data.xlsx')
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

# Zero-mean / unit-variance scaling before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Two components are enough for a 2-D scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# One scatter call per class: (label value, marker colour, legend entry).
fig, ax = plt.subplots(figsize=(8, 6))
for label_value, colour, legend_text in ((1, 'red', 'Cancer'), (11, 'blue', 'No Cancer')):
    mask = y == label_value
    ax.scatter(X_pca[mask, 0], X_pca[mask, 1], color=colour, label=legend_text)
ax.set_title('PCA for Breast Cancer Dataset')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend()
plt.show()


In [None]:
# XGBoost classifier on the raw features.
#
# Original note: "XGboost crashes on run time".
# One cause addressed here: recent XGBoost releases require class labels to
# be consecutive integers starting at 0, and reject other label values with a
# ValueError. The other cells in this notebook treat the labels as 1 and 11,
# so they are re-encoded before training.
# NOTE(review): if the crash was an out-of-memory kill rather than this
# ValueError, the feature matrix also needs to be reduced first - confirm.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Load data from Excel file
data = pd.read_excel('/content/drive/MyDrive/_projects/dr-amira/data.xlsx')

# Separate features (X) and labels (y)
X = data.iloc[:, 1:]      # Assuming features start from the second column
y_raw = data.iloc[:, 0]   # Assuming labels are in the first column

# Map the raw label values onto 0..n_classes-1 (sorted order of the raw
# values). label_encoder.inverse_transform() recovers the raw values.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define XGBoost model
model = xgb.XGBClassifier()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data (encoded class indices)
y_pred = model.predict(X_test)

# Evaluate the model; both y_test and y_pred are in the encoded label space
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# "CNN" experiment (original note: crashes on run time).
# The network built below consists of Dense layers only, i.e. it is a
# fully-connected feed-forward classifier rather than a convolutional one.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read the spreadsheet and pull out plain NumPy arrays.
df = pd.read_excel("/content/drive/MyDrive/_projects/dr-amira/data.xlsx")
y = df.iloc[:, 0].values   # first column: labels
X = df.iloc[:, 1:].values  # remaining columns: features (gene measurements)

# Scale every feature by the single global maximum of the matrix.
X = X / np.max(X)

# Binary target: label value 11 becomes 1, everything else becomes 0.
# NOTE(review): the original comment here called 11 "cancer" and 1 "normal",
# while the plotting cells in this notebook label 1 as cancer - confirm which
# encoding the dataset actually uses.
y_binary = np.where(y == 11, 1, 0)

# 80% train / 20% test, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=42
)

# Assemble the network layer by layer: 128 -> 64 -> 1 (sigmoid).
model = Sequential()
model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single probability output

# Binary cross-entropy matches the single sigmoid output.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit for 10 epochs; the test split doubles as the validation set here.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

# Report loss and accuracy on the test split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')


In [None]:
# t-SNE visualization of the dataset.
#
# Original note: "TSNE crashes multiple times".
# Mitigation applied here: the scikit-learn TSNE documentation recommends
# reducing very wide data to a moderate number of dimensions (e.g. 50) with
# PCA before running t-SNE, which cuts the cost of the neighbour search.
# NOTE(review): assumes the crashes came from running t-SNE on the full-width
# feature matrix - confirm against the Colab runtime logs.
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Load the dataset from Excel file
data = pd.read_excel('/content/drive/MyDrive/_projects/dr-amira/data.xlsx')

# Separate features and target variable
X = data.iloc[:, 1:]  # Features (excluding the first column)
y = data.iloc[:, 0]   # Target variable (first column)

# Pre-reduce with PCA only when there are more than 50 features; narrower
# data is passed to t-SNE unchanged. The component count is capped by the
# number of rows because PCA cannot return more components than samples.
PRE_TSNE_DIMS = 50
if X.shape[1] > PRE_TSNE_DIMS:
    n_pre = min(PRE_TSNE_DIMS, X.shape[0])
    X_for_tsne = PCA(n_components=n_pre, random_state=42).fit_transform(X)
else:
    X_for_tsne = X

# Initialize and fit t-SNE
tsne = TSNE(n_components=2, random_state=42)
X_embedded = tsne.fit_transform(X_for_tsne)

# Boolean masks as plain NumPy arrays for indexing the embedding
labels = y.to_numpy()
is_label_1 = labels == 1
is_label_11 = labels == 11

# Plot the transformed data
plt.figure(figsize=(8, 6))
plt.scatter(X_embedded[is_label_1, 0], X_embedded[is_label_1, 1], label='Breast Cancer', c='r', alpha=0.5)
plt.scatter(X_embedded[is_label_11, 0], X_embedded[is_label_11, 1], label='No Breast Cancer', c='b', alpha=0.5)
plt.title('t-SNE Visualization of Breast Cancer Dataset')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.legend()
plt.show()


In [None]:
# Autoencoder trained directly on the (max-scaled) feature matrix.
# Original note: crashes multiple times.
#
# numpy is imported here because this cell calls np.max / np.mean / np.abs;
# without the import the cell raises NameError on a fresh kernel.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Load the dataset from an Excel file
data = pd.read_excel("/content/drive/MyDrive/_projects/dr-amira/data.xlsx")

# Split the dataset into features (X) and labels (y)
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values

# Normalize the features by the global maximum of the matrix.
# NOTE(review): the sigmoid output + binary cross-entropy below presume the
# scaled values lie in [0, 1], i.e. that the raw features are non-negative -
# confirm against the data.
X = X / np.max(X)

# Split the dataset into training and testing sets (labels are not needed
# for reconstruction, so the label halves of the split are discarded)
X_train, X_test, _, _ = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the autoencoder architecture: input -> 128-unit bottleneck -> input
input_dim = X_train.shape[1]
encoding_dim = 128

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
decoder = Dense(input_dim, activation='sigmoid')(encoder)

# Define the autoencoder model
autoencoder = Model(inputs=input_layer, outputs=decoder)

# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# Train the autoencoder model to reproduce its own input
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, shuffle=True, validation_data=(X_test, X_test))

# Use the trained autoencoder to reconstruct the input data (all rows)
reconstructed_data = autoencoder.predict(X)

# Evaluate the reconstruction error (mean absolute error over all entries)
reconstruction_error = np.mean(np.abs(X - reconstructed_data))

print("Reconstruction error:", reconstruction_error)


In [2]:
# Mount Google Drive at /content/drive (google.colab is the Colab helper package).
# The data-loading cells in this notebook read data.xlsx from under
# /content/drive/MyDrive/, so in a fresh session this cell has to be run
# before any of them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
