<a href="https://colab.research.google.com/github/AnoushkaVijay/Leukemia_GAN/blob/main/Featurization_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

In [None]:
# Set the training and validation paths of the respective CSVs.
# Both values are empty placeholders: fill them in before running the
# notebook, because pd.read_csv is called on them when the data is loaded.
TRAINING_PATH = ""
VALIDATION_PATH = ""

In [None]:
import pandas as pd
# Load the training CSV and then the validation CSV into DataFrames.
training_data, validation_data = (
    pd.read_csv(csv_path) for csv_path in (TRAINING_PATH, VALIDATION_PATH)
)

In [None]:
# Preview the first rows of the training data.
training_data.head()

In [None]:
# Preview the first rows of the validation data.
validation_data.head()

In [None]:
# List the distinct class names of each split in sorted order so the two sets
# can be compared. `labels` is reused later as the confusion-matrix tick labels.
def _sorted_classes(frame):
  """Return the unique values of frame['label'], sorted in place."""
  classes = frame['label'].unique()
  classes.sort()
  return classes

labels = _sorted_classes(training_data)
print(labels)

valid_labels = _sorted_classes(validation_data)
print(valid_labels)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of training samples per class.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=training_data, x="label", ax=ax)
ax.set_title("Label Distribution for Training Data")
ax.set_xlabel("Category")
ax.set_ylabel("Label count")
plt.show();

In [None]:
# Bar chart of the number of validation samples per class.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=validation_data, x="label", ax=ax)
ax.set_title("Label Distribution for Validation Data")
ax.set_xlabel("Category")
ax.set_ylabel("Label count")
plt.show();

In [None]:
# Replace the training class names with integer codes. sort=True numbers the
# classes in sorted order, so code i corresponds to the i-th sorted class name.
train_codes, train_classes = pd.factorize(training_data['label'], sort=True)
training_data['label'] = train_codes
training_data.head()

In [None]:
# Encode the validation labels with the class order taken from the training
# data (`labels`: the sorted unique raw training labels) instead of factorizing
# the validation frame on its own. Independent factorization assigns a
# different integer to the same class whenever the validation split is missing
# one of the training classes, which silently corrupts every downstream metric.
#
# An integer column combined with non-integer class names means the codes were
# already written by an earlier run of this cell, so a re-run is a no-op.
if not pd.api.types.is_integer_dtype(validation_data['label']) or pd.api.types.is_integer_dtype(labels):
  label_to_code = {name: code for code, name in enumerate(labels)}
  valid_codes = validation_data['label'].map(label_to_code)
  if valid_codes.isna().any():
    # A class the model never saw cannot be encoded consistently; fail loudly.
    unseen = validation_data.loc[valid_codes.isna(), 'label'].unique()
    raise ValueError("Validation labels not present in the training data: {}".format(list(unseen)))
  validation_data['label'] = valid_codes.astype(int)
validation_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Split each frame into a feature matrix (every column except the last) and a
# target vector (the last column).
# NOTE(review): assumes 'label' is the last column of both CSVs — confirm.
x_train, y_train = training_data.iloc[:, :-1].values, training_data.iloc[:, -1].values
x_valid, y_valid = validation_data.iloc[:, :-1].values, validation_data.iloc[:, -1].values

# Report shapes in the order: x_train, x_valid, y_train, y_valid.
for split in (x_train, x_valid, y_train, y_valid):
  print(split.shape)

In [None]:
# Earlier version of model_evaluations (confusion matrix annotated with raw
# counts), disabled by wrapping it in a string literal so none of it executes.
# The cell below defines the version that is actually used.
'''
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def model_evaluations(y_true, y_pred):
  import matplotlib.pyplot as plt
  import seaborn as sns

  acc_score = accuracy_score(y_true, y_pred)
  print("Accuracy score: {}\n".format(acc_score))

  print("Classification Report: {}".format(classification_report(y_true, y_pred)))

  plt.figure(figsize = (10,10))
  sns.heatmap(confusion_matrix(y_true, y_pred),  annot = True, fmt="g", cmap = "Blues", xticklabels = labels, yticklabels = labels)
  plt.title("Consfuion Matrix")
  plt.show()
'''

Confusion Matrix in Percentage & Classification Report

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

def model_evaluations(y_true, y_pred, class_names=None):
  """Print accuracy and a classification report, then plot a confusion matrix in percent.

  Args:
    y_true: Ground-truth class codes. Assumed to be the integer codes
      0..k-1 produced by the label-encoding cells of this notebook.
    y_pred: Predicted class codes, same length as y_true.
    class_names: Display names indexed by class code, used as tick labels.
      Defaults to the notebook-level `labels` array.

  The heatmap is normalized over the true class: the cell in row i, column j
  is the percentage of samples of true class i that were predicted as class j.
  """
  import matplotlib.pyplot as plt
  import seaborn as sns

  if class_names is None:
    class_names = labels

  acc_score = accuracy_score(y_true, y_pred)
  print("Accuracy score: {}\n".format(acc_score))

  print("Classification Report: {}".format(classification_report(y_true, y_pred)))

  # Pin the matrix to every known class code. Without `labels=`, a class that
  # is absent from both y_true and y_pred is dropped from the matrix and the
  # tick labels no longer line up with its rows and columns.
  class_codes = np.arange(len(class_names))
  cm = confusion_matrix(y_true, y_pred, labels=class_codes, normalize='true')

  # Cell annotations: row-normalized values as percentages with a '%' sign.
  annotations = np.array([[f'{val:.2f}%' for val in row] for row in cm * 100])

  plt.figure(figsize = (10,10))
  sns.heatmap(cm, annot = annotations, fmt="", cmap = "Blues", xticklabels = class_names, yticklabels = class_names)
  plt.title("Confusion Matrix")
  # Rows are true classes and columns are predictions (confusion_matrix layout).
  plt.xlabel("Predicted label")
  plt.ylabel("True label")
  plt.show()


### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Sweep the neighbour count k over 2..14 and record the validation accuracy
# obtained for each setting.
K = list(range(2, 15))
accuracies = []

for k_value in K:
  # fit() returns the estimator itself, so construction and fitting can chain.
  main_model = KNeighborsClassifier(n_neighbors=k_value, n_jobs=-1).fit(x_train, y_train)
  y_pred = main_model.predict(x_valid)
  accuracies.append(accuracy_score(y_valid, y_pred))
  print("Neighbour {} experiment done".format(k_value))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Validation accuracy as a function of the number of neighbours.
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(K, accuracies, color="red")
ax.set_xlabel("Number of Neighbours")
ax.set_ylabel("Accuracy Value")
ax.set_title("Accuracy vs Number of Neighbours")
ax.set_xticks(list(range(16)))
ax.grid(True)
plt.show()

In [None]:
# Refit KNN with k=5 and run the full evaluation on the validation split.
# NOTE(review): k=5 is hard-coded, presumably read off the sweep plot — confirm.
best_model_one = KNeighborsClassifier(n_neighbors=5, n_jobs=-1).fit(x_train, y_train)
best_ypred = best_model_one.predict(x_valid)
model_evaluations(y_valid, best_ypred)

### RandomForest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Grid over tree depth (1..7) and forest size (10..100 in steps of 10),
# recording validation accuracy. all_acc[i][j] pairs max_depth[i] with n_trees[j].
max_depth = list(range(1, 8))
n_trees = list(range(10, 110, 10))

all_acc = []

for depth in max_depth:
  acc = []
  for tree in n_trees:
    # fit() returns the estimator itself, so construction and fitting can chain.
    model1 = RandomForestClassifier(n_estimators=tree, max_depth=depth, n_jobs=-1).fit(x_train, y_train)
    y_pred = model1.predict(x_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    acc.append(accuracy)
    print("Depth: {} and Tree: {} done".format(depth, tree))
  all_acc.append(acc)

In [None]:
import matplotlib.pyplot as plt
# One accuracy curve per max_depth setting. The legend reads each depth from
# max_depth itself instead of assuming the list is exactly 1, 2, 3, ..., and
# the x ticks follow n_trees so the plot stays correct if either grid changes.
plt.figure(figsize = (15,8))
for depth_value, depth_acc in zip(max_depth, all_acc):
  plt.plot(n_trees, depth_acc, "*-", label = f"max_depth: {depth_value}")
plt.xlabel("Number of trees")
plt.ylabel("Accuracy Value")
plt.legend(loc = "best")
plt.xticks(n_trees)
plt.grid(True)
plt.title("Accuracy values respective to max_depth and number of trees")
# Render explicitly, as the other plotting cells do, instead of echoing the
# Text object returned by plt.title.
plt.show()

In [None]:
# Fit a random forest with scikit-learn's default hyper-parameters and run the
# full evaluation on the validation split.
# NOTE(review): the sweep results (all_acc) are not used to pick these
# settings — confirm the defaults are the intended "best" configuration.
best_model_two = RandomForestClassifier().fit(x_train, y_train)
best_ypred = best_model_two.predict(x_valid)
model_evaluations(y_valid, best_ypred)

### MLP

In [None]:
%%time
from sklearn.neural_network import MLPClassifier

# Grid over initial learning rate and max_iter (named `epochs` here),
# recording validation accuracy. all_acc_mlp[i][j] pairs learning_rate[i]
# with epochs[j].
learning_rate = [0.01, 0.05, 0.001, 0.0001, 0.00001]
epochs = list(range(10, 110, 10))

all_acc_mlp = []

for lr in learning_rate:
  acc_mlp = []
  for epo in epochs:
    model1 = MLPClassifier(learning_rate_init=lr, max_iter=epo)
    model1.fit(x_train,y_train)
    y_pred = model1.predict(x_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    acc_mlp.append(accuracy)
    # Progress line names the values it actually prints (the original message
    # was copied from the random forest sweep and called them Depth / Learning Rate).
    print("Learning Rate: {} and Epochs: {} done".format(lr, epo))
  all_acc_mlp.append(acc_mlp)

In [None]:
import matplotlib.pyplot as plt
# One accuracy curve per learning rate, plotted against the epoch budget.
plt.figure(figsize=(15, 8))
for rate, rate_acc in zip(learning_rate, all_acc_mlp):
  plt.plot(epochs, rate_acc, "*-", label=f"learning_rate: {rate}")
plt.xlabel("Number of epochs")
plt.ylabel("Accuracy Value")
plt.legend(loc="best")
plt.xticks(list(range(10, 110, 10)))
plt.grid(True)
plt.title("Accuracy values respective to Learning rate and number of epochs")

In [None]:
# Train the MLP with the selected hyper-parameters (learning rate 0.001,
# max_iter 60) and run the full evaluation on the validation split.
best_model_three = MLPClassifier(learning_rate_init=0.001, max_iter=60).fit(x_train, y_train)
best_ypred = best_model_three.predict(x_valid)
model_evaluations(y_valid, best_ypred)

### Run the best algorithm with the best hyperparameters
- Compulsory

In [None]:
# Train the final model (the one that gets saved afterwards) with the selected
# algorithm and hyper-parameters, then evaluate it on the validation split.
final_best = MLPClassifier(learning_rate_init=0.001, max_iter=60).fit(x_train, y_train)
best_ypred = final_best.predict(x_valid)
model_evaluations(y_valid, best_ypred)

### Save the best model

In [None]:
import pickle

# Destination file for the pickled model. Empty placeholder: set it to a
# writable file path before running this cell.
MODEL_SAVE_PATH = ""

# The context manager closes the file even if pickle.dump raises, which the
# manual open()/close() pair did not guarantee.
with open(MODEL_SAVE_PATH, "wb") as fh:
  pickle.dump(final_best, fh)