# Kaggle

In [1]:
%%bash
pip install kaggle
mkdir ~/.kaggle
cp kaggle.json ~/.kaggle/
chmod 600 ~/.kaggle/kaggle.json
kaggle datasets download -d brunogrisci/breast-cancer-gene-expression-cumida
apt install unzip
mkdir data
unzip -q breast-cancer-gene-expression-cumida.zip -d data/

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading breast-cancer-gene-expression-cumida.zip to /content

Reading package lists...
Building dependency tree...
Reading state information...
unzip is already the newest version (6.0-21ubuntu1.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded.


  0%|          | 0.00/61.5M [00:00<?, ?B/s] 15%|█▍        | 9.00M/61.5M [00:00<00:00, 61.7MB/s] 55%|█████▌    | 34.0M/61.5M [00:00<00:00, 158MB/s]  93%|█████████▎| 57.0M/61.5M [00:00<00:00, 189MB/s]100%|██████████| 61.5M/61.5M [00:00<00:00, 173MB/s]




# Imports

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split 
from sklearn.svm import SVC
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_validate
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


# Initializing

In [3]:
mms = MinMaxScaler()
LE = LabelEncoder()
kmeans = KMeans(n_clusters=6, random_state=42)
loo = LeaveOneOut()
pca = PCA()
lda = LinearDiscriminantAnalysis()
svc = SVC()
ss = StandardScaler()

# Organizing Data

In [4]:
path = 'data/Breast_GSE45827.csv'
BC = pd.read_csv(path)
data = BC.drop(columns=['samples'])
data.isna().sum().sum()

0

In [5]:
def func(data, num):
  ds = np.asarray(data)
  ln = data.shape[0]
  new_ds = ds
  for i in range(num):
    rnd = np.random.randint(0, ln, ln)
    new_ds = new_ds[rnd]
  return new_ds

In [6]:
num = 7
new_ds = func(data, num)

In [7]:
features = np.delete(new_ds, 0, 1)
labels = new_ds[:,0]


LE_labels = LE.fit_transform(labels)


labels_dict = dict()

for i,label in enumerate(LE.classes_):
  labels_dict[label] = i

print(labels_dict)


{'HER': 0, 'basal': 1, 'cell_line': 2, 'luminal_A': 3, 'luminal_B': 4, 'normal': 5}


In [8]:
X_train, x_test, Y_train, y_test = train_test_split(features, LE_labels, test_size=30)

# Important Functions

## Pipeline

In [9]:
def initialize(ss_transform=True):
  mms = MinMaxScaler()
  kmeans = KMeans(n_clusters=6, random_state=42)
  lda = LinearDiscriminantAnalysis()
  svc = SVC()
  ss = StandardScaler()
  # with_std=False
  basic = Pipeline(
      [
          ('Scaler', ss)
      ]
  )
  preprocess = Pipeline(
      [
        ('LDA', lda),
        ('K-Means',kmeans),        
      ]
  )

  piper = Pipeline(
      [
        ('Basic', basic),
        ('Preprocess', preprocess),
        ('Classify', svc)          
      ]
  )

  if ss_transform:
    X_train_ss = piper['Basic'].fit_transform(X_train)
    x_test_ss = piper['Basic'].transform(x_test)
  else:
    X_train_ss = X_train
    x_test_ss = x_test


  piper['Preprocess']['LDA'].fit(X_train_ss, Y_train)
  X_train_LDA = piper['Preprocess']['LDA'].transform(X_train_ss)
  lda_t = piper['Preprocess']['LDA'].transform(x_test_ss)
  return piper, X_train_LDA, lda_t

## Incase PCA was used

In [10]:
def fixed_transform_PCA(n):
  pipe = initialize()
  parameters = ['PCA__n_components', 'PCA__svd_solver']
  params = dict()
  params[parameters[0]] = n
  params[parameters[1]] = 'randomized'
  pipe['Preprocess'].set_params(**params)

  pipe['Preprocess']['PCA'].n_components = n


  pca_features_train = pipe['Preprocess']['PCA'].fit_transform(Scaled_features)
  

  pipe['Preprocess']['K-Means'].fit(pca_features_train)
  pred_labels_train = pipe['Preprocess']['K-Means'].labels_
  
  return pca_features_train, pred_labels_train







## Test Accuracy

In [11]:
def acc(pract_test, theor_test, train=False, pract_train=None, theor_train=None, prin=False, val='Test'):

  if train:
    length_train = theor_train.shape[0]
    temp = len([i for i,j in zip(pract_train,theor_train) if i==j])
    train_acc = round(temp / length_train,3)
    if prin:
      print('{}Accuarcy = {}\t\tCorrect Samples = {}'.format('train ', round(temp / length_train,3), temp))
  else:
    train_acc = -1
  
    length_test = theor_test.shape[0]
    if length_test == 1:
      if theor_test[0] == pract_test[0]:
        test_acc = 1
      else:
        test_acc = 0
      return train_acc, test_acc

  if prin:
    temp2 = len([i for i,j in zip(pract_test,theor_test) if i==j])
    if val == 'Validation':
      print('{}Accuarcy\t=\t{}\t\t# Correct Predictions\t=\t{}'.format(val + ' ', round(temp2 / length_test,3), temp2))  
    else:
      print('{}Accuarcy\t\t=\t{}\t\t# Correct Predictions\t=\t{}'.format(val + ' ', round(temp2 / length_test,3), temp2))

  

## Fine Tunning

#### Tunning PCA

In [12]:
# def tunning():
#   SilScore = []
#   ARScore = []
#   first_step = 3
#   last_step = x_test.shape[0]

#   for n in range(first_step, last_step):
#     PCA_features, pred_labels = fixed_transform(n)

#     score = silhouette_score(PCA_features, pred_labels)
#     ar = adjusted_rand_score(Y_train, pred_labels)

#     SilScore.append(score)
#     ARScore.append(ar)
    
#   return np.argmax(ARScore) + 3

# def silar(silhouette_scores, adjusted_rand_scores, n, pred_labels_train, predictions):
#   feat = '{} Features'
  
#   print(feat.format(n))
#   print('silhouette_score = {}'.format(round(silhouette_scores,3)) )
#   print('adjusted_rand_score = {}'.format(round(adjusted_rand_scores,3)))
#   acc((pred_labels_train, predictions))




## Set Parameters

In [15]:
def SVCsetparams(weights, piper):
  par = dict()
  for m,k in enumerate(weights):
    par[m] = k
  params = {'class_weight': par}
  piper['Classify'].set_params(**params)

## Validation method

In [16]:
def Val(X_train_LDA, piper):
  ls = []
  for train_index, val_index in loo.split(X_train_LDA):
    x_train, y_train = X_train_LDA[train_index], Y_train[train_index]
    x_val, y_val = X_train_LDA[val_index], Y_train[val_index]

    piper['Classify'].fit(x_train, y_train)

    pred_val = piper['Classify'].predict(x_val)

    
    _ , single_val_acc = acc(pred_val, y_val)
    ls.append(single_val_acc)
  val_acc = round(np.sum(ls) / len(ls), 3)
  print('Validation Accuracy\t=\t{}\t\t# Correct Predictions\t=\t{}'.format(val_acc, np.sum(ls)))

  return pipe['Classify'].class_weight_



# Classification

In [17]:
print('total # of train Samples = {}\t\ttotal # of test Samples = {}'.format(X_train.shape[0],x_test.shape[0]))
print('\n')

ss_transform = [True, False]

for k in ss_transform:

  print('Standard Scaler: {}'.format(str(k)))
  print('\n\n\n')

  pipe, X_train_LDA, lda_t = initialize(k)
  print("\t\tSVC(class_weight='None')")
  print('\n')
  final_weights = Val(X_train_LDA, pipe)
  print('\n')
  pipe['Classify'].class_weight = final_weights
  pred = pipe['Classify'].predict(lda_t)
  acc(pred, y_test, prin=True)
  print('\n\n')

  pipe, X_train_LDA, lda_t = initialize(k)
  pipe['Classify'].class_weight='balanced'
  print("\t\tSVC(class_weight='balanced')")
  print('\n')
  final_weights_b = Val(X_train_LDA, pipe)
  pipe['Classify'].class_weight = final_weights_b
  pred = pipe['Classify'].predict(lda_t)
  print('\n')
  acc(pred, y_test, prin=True)
  print('\n\n')

  pipe, X_train_LDA, lda_t = initialize(k)
  pipe['Classify'].probability=True
  pipe['Classify'].fit(X_train_LDA,Y_train)
  pract_test = pipe['Classify'].predict_proba(X_train_LDA)
  pract_test = np.argmax(pract_test, axis=1)
  print('\t\tSVC(probability=True)')
  print('\n')
  acc(pract_test, Y_train, prin=True, val='Validation')

  pred = pipe['Classify'].predict_proba(lda_t)
  pred = np.argmax(pred, axis=1)
  acc(pred, y_test, prin=True)

  print('\n\n\n')

total # of train Samples = 121		total # of test Samples = 30


Standard Scaler: True




		SVC(class_weight='None')


Validation Accuracy	=	0.917		# Correct Predictions	=	111


Test Accuarcy		=	0.933		# Correct Predictions	=	28



		SVC(class_weight='balanced')


Validation Accuracy	=	0.934		# Correct Predictions	=	113


Test Accuarcy		=	0.933		# Correct Predictions	=	28



		SVC(probability=True)


Validation Accuarcy	=	0.983		# Correct Predictions	=	119
Test Accuarcy		=	1.0		# Correct Predictions	=	30




Standard Scaler: False




		SVC(class_weight='None')


Validation Accuracy	=	0.917		# Correct Predictions	=	111


Test Accuarcy		=	0.933		# Correct Predictions	=	28



		SVC(class_weight='balanced')


Validation Accuracy	=	0.934		# Correct Predictions	=	113


Test Accuarcy		=	0.933		# Correct Predictions	=	28



		SVC(probability=True)


Validation Accuarcy	=	0.975		# Correct Predictions	=	118
Test Accuarcy		=	0.933		# Correct Predictions	=	28




