# BDMH PROJECT
## An Empirical Study of Machine Learning Algorithms for Cancer Identification

#### Note
----
    Step 1) Place the 'GSE62054_series_matrix.txt',
    'GSE98406_series_matrix.txt' and 'scaling.py' in the same folder.
    Step 2) Change the current directory to this new directory using the cell below.

In [38]:
#cd C:\Users\shubh\Desktop\IIITD\Sem2\BDMH\project\Dataset\matrices

In [1]:
import scaling
import pandas as pd
import numpy as np 
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix ,accuracy_score


### Load data

In [2]:
# Read the whole series-matrix text file; the context manager guarantees the
# file handle is closed once the contents are in memory.
with open("GSE62054_series_matrix.txt", encoding='ISO-8859-1') as series_file:
    file_Thyroid = series_file.read()

# The file is split on blank lines and the third section (index 2) is kept
# as the data table, then broken into individual lines.
data = file_Thyroid.split('\n\n')[2]
data = data.split('\n')


In [4]:
# Feature matrix with 25 rows and 1146 columns, filled from the table lines.
X = np.zeros((25, 1146))

# The first and last entries of `data` are skipped; every remaining line
# supplies one column of X.
for col, line in enumerate(data[1:-1]):
    fields = line.split('\t')
    # The leading tab-separated field is skipped; the rest run down the rows.
    for row, value in enumerate(fields[1:]):
        X[row, col] = value

# Labels: the first 8 entries are class 0, the remaining 17 are class 1.
Y = np.zeros(25)
Y[8:] = 1
        

In [5]:
# Dump the feature matrix and the label vector, each followed by its shape.
# The extra newlines in `end` reproduce the blank lines between the two dumps.
print("X\n", X, ", shape = ", X.shape, end="\n\n\n")
print("Y\n", Y, ", shape = ", Y.shape)

X
 [[5.26128859 4.07453793 3.21929761 ... 4.99484292 5.04675288 4.33849959]
 [5.09657112 5.54199961 3.14619765 ... 4.06077457 4.88185151 4.57594889]
 [5.37146322 5.54199961 2.85800437 ... 4.88354725 4.88291309 4.54149244]
 ...
 [5.10813134 3.64184092 5.38428995 ... 4.16405064 5.08635804 4.88238879]
 [5.09586229 5.4187635  4.96860273 ... 5.26612907 4.83300648 4.76127475]
 [4.99992076 5.29166851 5.09225501 ... 5.76627377 4.91686286 5.13710666]] , shape =  (25, 1146)


Y
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1.] , shape =  (25,)


In [6]:
# Hold out 30% of the samples for testing. The split is stratified on Y and
# seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
    stratify=Y,
)

## Data Analysis

In [44]:
# Class balance check: count how many samples carry each label.
YY = pd.Series(list(Y), dtype='int32')
class_counts = YY.value_counts()
print(class_counts)
# g = sns.countplot(YY)

1    17
0     8
dtype: int64


In [45]:
# Positions of NaN entries in X; an empty result means no value is missing.
nan_positions = np.argwhere(np.isnan(X))
print("Checking null values in X : ", nan_positions)

Checking null values in X :  []


## Preprocessing Data

In [7]:
# SZ = Scaled Z Score
# SMM = Scale MinMax

# The scaling is fitted on the training split and the values it returns are
# then passed in when scaling the test split.
# NOTE(review): m and s are presumably the per-feature mean and standard
# deviation here - confirm in scaling.py.
X_train_SZ, m, s  = scaling.ZScoreScalingTrain(X_train)
X_test_SZ         = scaling.ZScoreScalingTest(X_test,m,s)

# Same pattern for min-max scaling; m and s are overwritten by this call.
# NOTE(review): for min-max they presumably hold min/max-style statistics
# rather than mean/std - confirm in scaling.py.
X_train_SMM, m, s = scaling.MinMaxScalingTrain(X_train)
X_test_SMM        = scaling.MinMaxScalingTest(X_test,m,s)

## Models

### 1. Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.linear_model import LogisticRegression  
from sklearn.metrics import classification_report, confusion_matrix ,accuracy_score

def LR_grid(X_train,y_train,X_test,y_test):
    """Grid-search the logistic-regression solver and report test metrics.

    Fits a GridSearchCV over five solver choices on the training data, prints
    the best parameters found, then prints the classification report,
    confusion matrix and accuracy computed on the test data.

    Returns:
        The fitted GridSearchCV object.
    """
    print("\n--------Logistic Regression-----------")

    solver_choices = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    base_model = LogisticRegression(max_iter = 3000)
    search = GridSearchCV(base_model, {'solver': solver_choices}, refit = True)
    search.fit(X_train, y_train)
    print("\n<Best Params> :", search.best_params_)

    y_pred = search.predict(X_test)

    # Each entry pairs the printed heading with the metric shown under it.
    reports = (
        ("\n<Classification Report>\n", classification_report),
        ("\n<Confusion Matrix>\n", confusion_matrix),
        ("\n<Accuracy> : ", accuracy_score),
    )
    for heading, metric in reports:
        print(heading, metric(y_test, y_pred))

    return search


# Run the logistic-regression search on the z-score-scaled features.
LR_SZ = LR_grid(X_train_SZ,y_train,X_test_SZ,y_test)

# Run the same search on the min-max-scaled features.
LR_SMM = LR_grid(X_train_SMM,y_train,X_test_SMM,y_test)


### 2. SVM

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix ,accuracy_score

def SVM_grid(X_train,y_train,X_test,y_test):
    """Grid-search an SVC over C, gamma and kernel and report test metrics.

    The search is fitted on the training data. The best parameters are
    printed, followed by the classification report, confusion matrix and
    accuracy computed on the test data.

    Returns:
        The fitted GridSearchCV object.
    """
    print("\n--------SVM-----------")

    search_space = dict(
        C=[0.1, 1, 10, 100, 1000],
        gamma=[1, 0.1, 0.01, 0.001, 0.0001],
        kernel=['linear', 'rbf', 'sigmoid'],
    )
    svm_search = GridSearchCV(SVC(), search_space, refit = True)
    svm_search.fit(X_train, y_train)
    print("\n<Best Params> :", svm_search.best_params_)

    # Predict once, then derive all three summaries from the same predictions.
    y_hat = svm_search.predict(X_test)
    report = classification_report(y_test, y_hat)
    matrix = confusion_matrix(y_test, y_hat)
    accuracy = accuracy_score(y_test, y_hat)

    print("\n<Classification Report>\n", report)
    print("\n<Confusion Matrix>\n", matrix)
    print("\n<Accuracy> : ", accuracy)
    return svm_search


# Run the SVM search on the z-score-scaled features.
SVM_SZ = SVM_grid(X_train_SZ,y_train,X_test_SZ,y_test)

# Run the same search on the min-max-scaled features.
SVM_SMM = SVM_grid(X_train_SMM,y_train,X_test_SMM,y_test)


### 3. XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# data_dmatrix = xgb.DMatrix(data=X,label=Y)

def XGB_classifier(X_train,y_train,X_test,y_test):
    """Grid-search an XGBoost classifier and report test metrics.

    The search is fitted on the training data. The best parameters are
    printed, followed by the classification report, confusion matrix and
    accuracy computed on the test data.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for the report.

    Returns:
        The fitted GridSearchCV object. Earlier versions returned nothing, so
        the variables assigned from this call were always None.
    """
    print("\n--------XGBoost-----------")
    parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05],
              'max_depth': [5,6,7,8,9,10],
              'min_child_weight': [11],
              # NOTE(review): newer xgboost releases replace 'silent' with
              # 'verbosity' - confirm against the installed version.
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5,1000],
              }
    xg_clf = GridSearchCV(XGBClassifier(), parameters, n_jobs=5, verbose=2, refit=True)
    xg_clf.fit(X_train,y_train)
    print("\n<Best Params> :",xg_clf.best_params_)

    pred = xg_clf.predict(X_test)

    # print classification report
    print("\n<Classification Report>\n",classification_report(y_test, pred))
    # print confusion matrix
    print("\n<Confusion Matrix>\n",confusion_matrix(y_test, pred))
    # print accuracy
    print("\n<Accuracy> : ",accuracy_score(y_test, pred))
    return xg_clf
    
# Run the XGBoost search on the z-score-scaled features.
XGB_SZ = XGB_classifier(X_train_SZ,y_train,X_test_SZ,y_test)

# Run the same search on the min-max-scaled features.
XGB_SMM = XGB_classifier(X_train_SMM,y_train,X_test_SMM,y_test)
   


In [22]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# Baseline random forest with fixed settings, used as a reference point
# before the hyperparameter search.
model = RandomForestClassifier(n_estimators=100,
                               bootstrap = True,
                               max_features = 'sqrt')
# Fit on training data
model.fit(X_train, y_train)
# Class predictions for the held-out samples
rf_predictions = model.predict(X_test)

print("\n<Accuracy> : ",accuracy_score(y_test, rf_predictions))


<Accfdsfuracy> :  0.75


In [24]:
# Candidate hyperparameter values for the random-forest search
# ----------------------
# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 'auto' was dropped: for classifiers it was only an alias of 'sqrt', and
# newer scikit-learn releases reject it.
max_features = ['sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [*range(10, 111, 10), None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]
# ------------------------------
def RF_grid(X_train,y_train,X_test,y_test):
    """Randomized hyperparameter search for a random forest, with test metrics.

    Draws 100 candidates from the module-level hyperparameter lists, scores
    each with 3-fold cross-validation on the training data, prints the best
    parameters, then prints the classification report, confusion matrix and
    accuracy computed on the test data.

    Returns:
        The fitted RandomizedSearchCV object.
    """
    print("\n--------RF-----------")

    # The search space is assembled from the module-level candidate lists.
    random_grid = dict(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
    )
    forest_search = RandomizedSearchCV(
        estimator=RandomForestClassifier(),
        param_distributions=random_grid,
        n_iter=100,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )

    forest_search.fit(X_train, y_train)
    print("\n<Best Params> :", forest_search.best_params_)

    y_hat = forest_search.predict(X_test)
    # Each entry pairs the printed heading with the metric shown under it.
    for heading, metric in (
        ("\n<Classification Report>\n", classification_report),
        ("\n<Confusion Matrix>\n", confusion_matrix),
        ("\n<Accuracy> : ", accuracy_score),
    ):
        print(heading, metric(y_test, y_hat))
    return forest_search
# Run the random-forest search on the z-score-scaled features.
RF_SZ = RF_grid(X_train_SZ,y_train,X_test_SZ,y_test)

# Run the same search on the min-max-scaled features.
RF_SMM = RF_grid(X_train_SMM,y_train,X_test_SMM,y_test)



--------RF-----------
Fitting 3 folds for each of 100 candidates, totalling 300 fits

<Best Params> : {'n_estimators': 1200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': False}

<Classification Report>
               precision    recall  f1-score   support

         0.0       1.00      0.33      0.50         3
         1.0       0.71      1.00      0.83         5

    accuracy                           0.75         8
   macro avg       0.86      0.67      0.67         8
weighted avg       0.82      0.75      0.71         8


<Confusion Matrix>
 [[1 2]
 [0 5]]

<Accuracy> :  0.75

--------RF-----------
Fitting 3 folds for each of 100 candidates, totalling 300 fits

<Best Params> : {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}

<Classification Report>
               precision    recall  f1-score   support

         0.0       1.00      0.33      0.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  5.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.5min finished


In [27]:
from sklearn.model_selection import StratifiedKFold
#implementing k fold for cross validation
def cross_validation(splits,func):
    """Run `func` on every fold of a stratified k-fold split of the global X, Y.

    For each fold the training part is scaled twice, once with z-score and
    once with min-max scaling. The scaling is fitted on the training part and
    the returned values are applied to the test part. `func` is then called
    once per scaling.

    Args:
        splits: number of folds passed to StratifiedKFold.
        func: callable taking (X_train, y_train, X_test, y_test), for example
            one of the model-search functions in this file.

    Returns:
        A list with one (z_score_result, min_max_result) tuple per fold,
        holding whatever `func` returned for each scaling.
    """
    fold_results = []
    skf = StratifiedKFold(n_splits=splits)
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # Z-score scaling, fitted on this fold's training part.
        X_train_SZ, z_stat_a, z_stat_b = scaling.ZScoreScalingTrain(X_train)
        X_test_SZ = scaling.ZScoreScalingTest(X_test, z_stat_a, z_stat_b)

        # Min-max scaling, fitted on this fold's training part.
        X_train_SMM, mm_stat_a, mm_stat_b = scaling.MinMaxScalingTrain(X_train)
        X_test_SMM = scaling.MinMaxScalingTest(X_test, mm_stat_a, mm_stat_b)

        result_SZ = func(X_train_SZ, y_train, X_test_SZ, y_test)
        result_SMM = func(X_train_SMM, y_train, X_test_SMM, y_test)
        # Keep both results so callers can aggregate across folds.
        fold_results.append((result_SZ, result_SMM))
    return fold_results

In [None]:
# 5-fold stratified cross-validation, running the random-forest search on every fold.
cross_validation(5,RF_grid)



--------RF-----------
Fitting 3 folds for each of 100 candidates, totalling 300 fits

<Best Params> : {'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}

<Classification Report>
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         2
         1.0       1.00      1.00      1.00         3

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5


<Confusion Matrix>
 [[2 0]
 [0 3]]

<Accuracy> :  1.0

--------RF-----------
Fitting 3 folds for each of 100 candidates, totalling 300 fits

<Best Params> : {'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}

<Classification Report>
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00  

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   28.0s
