<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Kaggle" data-toc-modified-id="Kaggle-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Kaggle</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Initializing" data-toc-modified-id="Initializing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Initializing</a></span></li><li><span><a href="#Organizing-Data" data-toc-modified-id="Organizing-Data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Organizing Data</a></span></li><li><span><a href="#Important-Functions" data-toc-modified-id="Important-Functions-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Important Functions</a></span><ul class="toc-item"><li><span><a href="#Pipeline" data-toc-modified-id="Pipeline-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Pipeline</a></span></li><li><span><a href="#Test-Accuracy,-Silhouette_Score-and-Adjusted_Rand_Score" data-toc-modified-id="Test-Accuracy,-Silhouette_Score-and-Adjusted_Rand_Score-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Test Accuracy, Silhouette_Score and Adjusted_Rand_Score</a></span></li><li><span><a href="#Set-Parameters" data-toc-modified-id="Set-Parameters-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Set Parameters</a></span></li><li><span><a href="#Validation-method" data-toc-modified-id="Validation-method-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Validation method</a></span></li></ul></li><li><span><a href="#PCA-vs-LDA-in-Classification" data-toc-modified-id="PCA-vs-LDA-in-Classification-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>PCA vs LDA in Classification</a></span></li></ul></div>

# Kaggle

In [1]:
%%bash
pip install kaggle
mkdir ~/.kaggle
cp kaggle.json ~/.kaggle/
chmod 600 ~/.kaggle/kaggle.json
kaggle datasets download -d brunogrisci/breast-cancer-gene-expression-cumida
apt install unzip
mkdir data
unzip -q breast-cancer-gene-expression-cumida.zip -d data/

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading breast-cancer-gene-expression-cumida.zip to /content

Reading package lists...
Building dependency tree...
Reading state information...
unzip is already the newest version (6.0-21ubuntu1.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded.


  0%|          | 0.00/61.5M [00:00<?, ?B/s] 15%|█▍        | 9.00M/61.5M [00:00<00:00, 72.3MB/s] 34%|███▍      | 21.0M/61.5M [00:00<00:00, 99.7MB/s] 65%|██████▌   | 40.0M/61.5M [00:00<00:00, 142MB/s]  88%|████████▊ | 54.0M/61.5M [00:00<00:00, 131MB/s]100%|██████████| 61.5M/61.5M [00:00<00:00, 125MB/s]




# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split 
from sklearn.svm import SVC
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_validate
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


# Initializing

In [2]:
mms = MinMaxScaler()
LE = LabelEncoder()
kmeans = KMeans(n_clusters=6, random_state=42)
loo = LeaveOneOut()
pca = PCA()
lda = LinearDiscriminantAnalysis()
svc = SVC()
ss = StandardScaler()

# Organizing Data

In [3]:
path = 'data/Breast_GSE45827.csv'
BC = pd.read_csv(path)
data = BC.drop(columns=['samples'])
data.isna().sum().sum()

0

In [4]:
def func(data, num):
  ds = np.asarray(data)
  ln = data.shape[0]
  new_ds = ds
  for i in range(num):
    rnd = np.random.randint(0, ln, ln)
    new_ds = new_ds[rnd]
  return new_ds

In [5]:
num = 7
new_ds = func(data, num)

In [6]:
features = np.delete(new_ds, 0, 1)
labels = new_ds[:,0]


LE_labels = LE.fit_transform(labels)


labels_dict = dict()

for i,label in enumerate(LE.classes_):
  labels_dict[label] = i

print(labels_dict)


{'HER': 0, 'basal': 1, 'cell_line': 2, 'luminal_A': 3, 'luminal_B': 4, 'normal': 5}


In [25]:
X_train, x_test, Y_train, y_test = train_test_split(features, LE_labels, test_size=30)

# Important Functions

## Pipeline

In [17]:
def initialize(Use_PCA, ss_transform=True):
  mms = MinMaxScaler()
  kmeans = KMeans(n_clusters=6, random_state=42)
  lda = LinearDiscriminantAnalysis()
  svc = SVC()
  ss = StandardScaler()
  pca = PCA(5)
  basic = Pipeline(
      [
          ('Scaler', ss)
      ]
  )
  if Use_PCA:
    preprocess = Pipeline(
        [
          ('DR', pca),
          ('K-Means',kmeans),        
        ]
    )
  else:
    preprocess = Pipeline(
        [
          ('DR', lda),
          ('K-Means',kmeans),        
        ]
    )

  
  piper = Pipeline(
      [
        ('Basic', basic),
        ('Preprocess', preprocess),
        ('Classify', svc)          
      ]
  )

  if ss_transform:
    X_train_ss = piper['Basic'].fit_transform(X_train)
    x_test_ss = piper['Basic'].transform(x_test)
  else:
    X_train_ss = X_train
    x_test_ss = x_test


  piper['Preprocess'].fit(X_train_ss, Y_train)
  X_train_DR = piper['Preprocess']['DR'].transform(X_train_ss)
  DR_t = piper['Preprocess']['DR'].transform(x_test_ss)
  return piper, X_train_DR, DR_t

### Tunning PCA

In [9]:
def fixed_transform_PCA(n):
  pipe = initialize()
  parameters = ['PCA__n_components', 'PCA__svd_solver']
  params = dict()
  params[parameters[0]] = n
  params[parameters[1]] = 'randomized'
  pipe['Preprocess'].set_params(**params)

  pipe['Preprocess']['PCA'].n_components = n


  pca_features_train = pipe['Preprocess']['PCA'].fit_transform(Scaled_features)
  

  pipe['Preprocess']['K-Means'].fit(pca_features_train)
  pred_labels_train = pipe['Preprocess']['K-Means'].labels_
  
  return pca_features_train, pred_labels_train







## Test Accuracy, Silhouette_Score and Adjusted_Rand_Score

In [34]:
def acc(pract_test, theor_test, train=False, pract_train=None, theor_train=None, prin=False, val='Test'):

  if train:
    length_train = theor_train.shape[0]
    temp = len([i for i,j in zip(pract_train,theor_train) if i==j])
    train_acc = round(temp / length_train,3)
    if prin:
      print('{}Accuarcy = {}\t\tCorrect Samples = {}'.format('train ', round(temp / length_train,3), temp))
  else:
    train_acc = -1
  
    length_test = theor_test.shape[0]
    if length_test == 1:
      if theor_test[0] == pract_test[0]:
        test_acc = 1
      else:
        test_acc = 0
      return train_acc, test_acc

  if prin:
    temp2 = len([i for i,j in zip(pract_test,theor_test) if i==j])
    if val == 'Validation':
      print('{}Accuarcy\t=\t{}\t\t# Correct Predictions\t=\t{}'.format(val + ' ', round(temp2 / length_test,3), temp2))  
    else:
      print('{}Accuarcy\t\t=\t{}\t\t# Correct Predictions\t=\t{}'.format(val + ' ', round(temp2 / length_test,3), temp2))

  return round(temp2 / length_test,3)

In [11]:
def tunning(PCA_features, pred_labels, y_true):

  score = silhouette_score(PCA_features, pred_labels)
  ar = adjusted_rand_score(y_true, pred_labels)
  
  print('\nsilhouette_score\t=\t{}\t\tadjusted_rand_score\t=\t{}'.format(round(score,3), round(ar,3)))

## Set Parameters

In [12]:
def SVCsetparams(weights, piper):
  par = dict()
  for m,k in enumerate(weights):
    par[m] = k
  params = {'class_weight': par}
  piper['Classify'].set_params(**params)

## Validation method

In [35]:
def Val(X_train_LDA, piper):
  ls = []
  for train_index, val_index in loo.split(X_train_LDA):
    x_train, y_train = X_train_LDA[train_index], Y_train[train_index]
    x_val, y_val = X_train_LDA[val_index], Y_train[val_index]

    piper['Classify'].fit(x_train, y_train)

    pred_val = piper['Classify'].predict(x_val)

    
    _ , single_val_acc = acc(pred_val, y_val)
    ls.append(single_val_acc)
  val_acc = round(np.sum(ls) / len(ls), 3)
  print('Validation Accuracy\t=\t{}\t\t# Correct Predictions\t=\t{}'.format(val_acc, np.sum(ls)))

  return pipe['Classify'].class_weight_, val_acc



# PCA vs LDA in Classification

In [36]:
print('total # of train Samples = {}\t\ttotal # of test Samples = {}'.format(X_train.shape[0],x_test.shape[0]))
print('\n')

method = ['using PCA', 'Using LDA']

averaging_PCA = []
averaging_LDA = []

averaging = [averaging_PCA, averaging_LDA]

ss_transform = [True, False]
for j,k in enumerate(ss_transform):

  print(method[j])
  print('\n\n')

  for i in range(6):
    X_train, x_test, Y_train, y_test = train_test_split(features, LE_labels, test_size=30)



    pipe, X_train_DR, DR_t = initialize(k)
    final_weights, val_acc = Val(X_train_DR, pipe)
    pipe['Classify'].class_weight = final_weights
    pred = pipe['Classify'].predict(DR_t)
    test_acc = acc(pred, y_test, prin=True)
    print('\n')
    averaging[j].append((val_acc,test_acc))
    # pipe, X_train_DR, DR_t = initialize(k)
    # pipe['Classify'].class_weight='balanced'
    # print("\t\tSVC(class_weight='balanced')")
    # print('\n')
    # final_weights_b = Val(X_train_DR, pipe)
    # pipe['Classify'].class_weight = final_weights_b
    # pred = pipe['Classify'].predict(DR_t)
    # print('\n')
    # acc(pred, y_test, prin=True)
    # print('\n\n')

    # pipe, X_train_DR, DR_t = initialize(k)
    # pipe['Classify'].probability=True
    # pipe['Classify'].fit(X_train_DR,Y_train)
    # pract_test = pipe['Classify'].predict_proba(X_train_DR)
    # pract_test = np.argmax(pract_test, axis=1)
    # print('\t\tSVC(probability=True)')
    # print('\n')
    # acc(pract_test, Y_train, prin=True, val='Validation')

    # pred = pipe['Classify'].predict_proba(DR_t)
    # pred = np.argmax(pred, axis=1)
    # acc(pred, y_test, prin=True)



total # of train Samples = 121		total # of test Samples = 30


using PCA



Validation Accuracy	=	0.959		# Correct Predictions	=	116
Test Accuarcy		=	0.867		# Correct Predictions	=	26


Validation Accuracy	=	0.967		# Correct Predictions	=	117
Test Accuarcy		=	0.967		# Correct Predictions	=	29


Validation Accuracy	=	0.95		# Correct Predictions	=	115
Test Accuarcy		=	0.967		# Correct Predictions	=	29


Validation Accuracy	=	0.95		# Correct Predictions	=	115
Test Accuarcy		=	0.967		# Correct Predictions	=	29


Validation Accuracy	=	0.959		# Correct Predictions	=	116
Test Accuarcy		=	0.967		# Correct Predictions	=	29


Validation Accuracy	=	0.917		# Correct Predictions	=	111
Test Accuarcy		=	0.933		# Correct Predictions	=	28


Using LDA



Validation Accuracy	=	0.876		# Correct Predictions	=	106
Test Accuarcy		=	0.9		# Correct Predictions	=	27


Validation Accuracy	=	0.893		# Correct Predictions	=	108
Test Accuarcy		=	0.9		# Correct Predictions	=	27


Validation Accuracy	=	0.893		# Correc

In [40]:
LDA_avg = np.array(averaging_LDA)
PCA_avg = np.array(averaging_PCA)

print('LDA Average Scores across {} epochs:\t\tValidation\t=\t{}\t\tTest\t=\t{}'.format(LDA_avg.shape[0], round(np.average(LDA_avg[:,0]),3), round(np.average(LDA_avg[:,1]),3)))
print('\n')
print('PCA Average Scores across {} epochs:\t\tValidation\t=\t{}\t\tTest\t=\t{}'.format(PCA_avg.shape[0], round(np.average(PCA_avg[:,0]),3), round(np.average(PCA_avg[:,1]),3)))

LDA Average Scores across 6 epochs:		Validation	=	0.886		Test	=	0.883


PCA Average Scores across 6 epochs:		Validation	=	0.95		Test	=	0.945
