In [None]:
#These are Python libraries being imported for use in the code.
import os
import shutil
from shutil import copyfile

import cv2
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

In [None]:
# Load the training metadata from "train.csv", keep only the image identifier
# and its cancer label, and preview the first rows.
label_columns = ['image_id', 'cancer']
df = pd.read_csv('./train.csv').loc[:, label_columns]
df.head()

In [None]:
# Show the (rows, columns) size of the label table.
df.shape

In [None]:
# Group image file names by label.
#   class_dict[0] -> '<image_id>.png' for every row whose 'cancer' value is 0
#   class_dict[1] -> '<image_id>.png' for every other row
# The columns are processed vectorised instead of with df.iterrows():
# iterrows() builds each row as a single-dtype Series, so an integer image_id
# would be rendered as e.g. '123.0' whenever another column is float.
_is_negative = df['cancer'] == 0
_file_names = df['image_id'].astype(str) + '.png'
class_dict = {
    0: _file_names[_is_negative].tolist(),
    1: _file_names[~_is_negative].tolist(),
}

In [None]:
# Create the flat working directory that will hold every extracted PNG.
# exist_ok=True keeps the cell re-runnable instead of raising FileExistsError.
os.makedirs('./working', exist_ok=True)

In [None]:
# Flatten the image tree: walk './train_images' recursively and copy every
# '.png' file into './working' under its original file name.
root_dir = './train_images'
dest_dir = './working'
for current_dir, _subdirs, file_names in os.walk(root_dir):
    png_names = [name for name in file_names if name.endswith('.png')]
    for png_name in png_names:
        shutil.copy(
            os.path.join(current_dir, png_name),
            os.path.join(dest_dir, png_name),
        )

In [None]:
# Create one directory per prediction class ('no' = negative, 'yes' = positive).
# exist_ok=True keeps the cell re-runnable instead of raising FileExistsError.
for class_dir in ('./no', './yes'):
    os.makedirs(class_dir, exist_ok=True)

In [None]:
# Directory paths used when sorting images by label:
#   no_cancer -> destination for class-0 images
#   cancer    -> destination for class-1 images
#   source    -> flat folder the images are copied from
no_cancer, cancer, source = './no', './yes', './working'

In [None]:
# Copy every file listed in class_dict from the source folder into its class
# folder: key 0 goes to no_cancer, any other key goes to cancer.
for label, file_names in class_dict.items():
    target_dir = no_cancer if label == 0 else cancer
    for file_name in file_names:
        shutil.copy(
            os.path.join(source, file_name),
            os.path.join(target_dir, file_name),
        )

In [None]:
# Create the directory that stores the augmented images.
# exist_ok=True keeps the cell re-runnable instead of raising FileExistsError.
os.makedirs('./final', exist_ok=True)

In [None]:
# Augment the positive class.
# The positive class is much smaller than the negative class, so every image
# in the positive folder is expanded into AUGMENTATIONS_PER_IMAGE randomly
# transformed copies (rotation, shift, horizontal flip, zoom). The copies are
# written to the output directory by ImageDataGenerator.flow(save_to_dir=...).

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import img_to_array, array_to_img
import os
from tensorflow.keras.preprocessing.image import load_img

# Number of augmented variants generated and saved per input image.
AUGMENTATIONS_PER_IMAGE = 45

# Define the ImageDataGenerator with the required transformations
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=False,
    zoom_range=0.1,
    fill_mode='nearest')

# Directory containing the positive-class images. A relative path is used so
# it resolves to the same './yes' folder the images were copied into,
# whatever the notebook's working directory is.
directory = './yes'

# Output directory where the augmented images are saved. flow() needs it to
# exist, so create it if it is missing.
output_dir = './final'
os.makedirs(output_dir, exist_ok=True)

# Generate augmented images and save them to disk
for filename in os.listdir(directory):
    img_path = os.path.join(directory, filename)

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable or non-image file by returning None.
        print(f"Skipping unreadable image: {img_path}")
        continue

    # OpenCV loads channels as BGR, while the Keras writer treats the array as
    # RGB; convert so the saved copies keep the original channel order.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # flow() expects a batch axis: (1, height, width, channels).
    img = np.expand_dims(img, axis=0)

    augmented_batches = datagen.flow(
        img,
        batch_size=1,
        save_to_dir=output_dir,
        save_prefix=filename,
        save_format='png',
    )
    # Each batch drawn from the iterator writes one augmented image to disk.
    for _ in range(AUGMENTATIONS_PER_IMAGE):
        next(augmented_batches)


In [None]:
# Optional reset of the augmentation output.
# shutil.rmtree deletes './final' and everything inside it. The positive-class
# split further down reads the augmented images from that folder, so deleting
# it unconditionally during a top-to-bottom run leaves nothing to split.
# The reset is therefore opt-in: set the flag to True only to discard a
# previous augmentation run before generating the images again.
RESET_AUGMENTED_OUTPUT = False

if RESET_AUGMENTED_OUTPUT:
    shutil.rmtree('./final', ignore_errors=True)
    # Recreate the empty folder so the augmentation cell can write into it.
    os.makedirs('./final', exist_ok=True)

In [None]:
# Create the train / test / val folders, each with one sub-folder per class
# ('0' = negative, '1' = positive). os.makedirs creates the parent folder as
# needed, and exist_ok=True keeps the cell re-runnable.
for split_name in ('train', 'test', 'val'):
    for class_name in ('0', '1'):
        os.makedirs(os.path.join('.', split_name, class_name), exist_ok=True)

In [None]:
# Split the negative class into train / test / validation sets.
# 20% of the files are held out for testing, then 20% of the remainder for
# validation. Each file is moved (not copied) into '<split>/0'.
from sklearn.model_selection import train_test_split

# Relative path so it resolves to the same './no' folder the negative images
# were copied into, whatever the notebook's working directory is.
data_dir = './no'
train_dir = './train'
test_dir = './test'
val_dir = './val'

# Fixed seed plus a sorted listing make the split reproducible: os.listdir
# returns entries in arbitrary order.
SPLIT_SEED = 42
data_files = sorted(os.listdir(data_dir))
train_files, test_files = train_test_split(data_files, test_size=0.2, random_state=SPLIT_SEED)
train_files, val_files = train_test_split(train_files, test_size=0.2, random_state=SPLIT_SEED)

# Move the files into their respective class-0 directories
for split_dir, split_files in (
    (train_dir, train_files),
    (test_dir, test_files),
    (val_dir, val_files),
):
    for filename in split_files:
        src_path = os.path.join(data_dir, filename)
        dst_path = os.path.join(split_dir, '0', filename)
        shutil.move(src_path, dst_path)


In [None]:
# Split the (augmented) positive class into train / test / validation sets.
# 20% is held out for testing, then 20% of the remainder for validation.
# Each file is moved (not copied) into '<split>/1'.
#
# The split is done per SOURCE image rather than per file: the folder holds
# many augmented variants of each original, and splitting the files
# individually would put variants of one image into several splits, leaking
# test data into training.

# Relative path so it resolves to the same './final' folder the augmented
# images were written to, whatever the notebook's working directory is.
data_dir = './final'
train_dir = './train'
test_dir = './test'
val_dir = './val'

# Fixed seed plus sorted listings make the split reproducible: os.listdir
# returns entries in arbitrary order.
SPLIT_SEED = 42
data_files = sorted(os.listdir(data_dir))

# Group the augmented files by the image they were generated from.
# NOTE(review): assumes the '<save_prefix>_<index>_<hash>.png' naming used by
# ImageDataGenerator.flow(save_to_dir=...) -- confirm against the files on disk.
files_by_source = {}
for filename in data_files:
    source_id = filename.rsplit('_', 2)[0]
    files_by_source.setdefault(source_id, []).append(filename)

source_ids = sorted(files_by_source)
train_ids, test_ids = train_test_split(source_ids, test_size=0.2, random_state=SPLIT_SEED)
train_ids, val_ids = train_test_split(train_ids, test_size=0.2, random_state=SPLIT_SEED)

train_files = [name for source_id in train_ids for name in files_by_source[source_id]]
test_files = [name for source_id in test_ids for name in files_by_source[source_id]]
val_files = [name for source_id in val_ids for name in files_by_source[source_id]]

# Move the files into their respective class-1 directories
for split_dir, split_files in (
    (train_dir, train_files),
    (test_dir, test_files),
    (val_dir, val_files),
):
    for filename in split_files:
        src_path = os.path.join(data_dir, filename)
        dst_path = os.path.join(split_dir, '1', filename)
        shutil.move(src_path, dst_path)


In [None]:
# Data preprocessing: build one directory iterator per split.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# The only transformation is scaling pixel values by 1/255.
datagenerator = ImageDataGenerator(rescale=1.0 / 255)

# Settings shared by all three iterators: 128x128 inputs, one-hot
# ("categorical") labels, batches of 32.
_flow_kwargs = {
    "target_size": (128, 128),
    "class_mode": "categorical",
    "batch_size": 32,
}

# Train data
train_generator = datagenerator.flow_from_directory(directory='./train', **_flow_kwargs)

# Validation data
val_generator = datagenerator.flow_from_directory(directory='./val', **_flow_kwargs)

# Test data
test_generator = datagenerator.flow_from_directory(directory='./test', **_flow_kwargs)

In [None]:
import tensorflow
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPool2D,Dropout,MaxPooling2D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19


In [None]:
# Transfer-learning classifier on top of VGG16.
# The convolutional base is loaded with ImageNet weights and without its
# original classification head (include_top=False), for 128x128 RGB inputs.
vgg16_model = VGG16(weights="imagenet", include_top=False, input_shape=(128, 128, 3))

# Freeze the pretrained base so only the new dense head is trained.
for layer in vgg16_model.layers:
    layer.trainable = False

# Stack the frozen base, a flatten step, a 256-unit hidden layer and a
# 2-way softmax output (one unit per class).
model2 = Sequential()
model2.add(vgg16_model)
model2.add(Flatten())
model2.add(Dense(256, activation='relu'))
model2.add(Dense(2, activation="softmax"))

# `metrics` is passed as a list, the form documented by Keras; a bare string
# is rejected by newer Keras releases.
model2.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Train for 10 epochs on the training iterator, reporting validation metrics
# after each epoch; the returned History object is kept for plotting.
model2_history = model2.fit(
    x=train_generator,
    validation_data=val_generator,
    epochs=10,
    verbose=1,
)

In [None]:
# Persist the trained model to 'vgg16.h5' in the working directory.
model2.save(filepath='vgg16.h5')

In [None]:
# Plot the training curves: one figure for accuracy, one for loss, each
# comparing the training series with the validation series per epoch.

import matplotlib.pyplot as plt

for history_key, axis_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(model2_history.history[history_key])
    plt.plot(model2_history.history['val_' + history_key])
    plt.title('Model ' + axis_label)
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
# Draw one batch (images, one-hot labels) from the test iterator.
# The builtin next() is used because it works on every Keras version, whereas
# the iterator's .next() method is not available on newer releases.
X, y1 = next(test_generator)

In [None]:
# Test loss and accuracy, computed over the whole test set rather than a
# single 32-image batch so the reported figures are representative.
# result2 is [loss, accuracy], following the metrics given to compile().
result2 = model2.evaluate(test_generator)

In [None]:
# Report the test accuracy (second entry of the evaluate() result) as a percentage.
test_accuracy_pct = result2[1] * 100
print(test_accuracy_pct)

In [None]:
# Predict a class for every test image and collect the matching true labels.
# Labels and predictions are gathered batch by batch from the same iterator,
# so they stay aligned even when the iterator shuffles, and the number of
# samples is taken from the data instead of a hard-coded count.
y_true_batches = []
y_pred_batches = []
for batch_index in range(len(test_generator)):
    X_batch, y_batch = test_generator[batch_index]
    batch_probabilities = model2.predict_on_batch(X_batch)
    # argmax turns softmax probabilities / one-hot labels into class indices.
    y_pred_batches.append(np.argmax(batch_probabilities, axis=1))
    y_true_batches.append(np.argmax(y_batch, axis=1))

y_pred1 = np.concatenate(y_pred_batches)
y_true = np.concatenate(y_true_batches)

In [None]:
# Per-class precision / recall / F1 for the test predictions.
# NOTE(review): y_true must hold the integer ground-truth labels aligned with
# y_pred1; it is not defined in this cell.
print(classification_report(y_true, y_pred1, labels=None))

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns:
# predicted class).
# NOTE(review): y_true must hold the integer ground-truth labels aligned with
# y_pred1; it is not defined in this cell.
cm2 = confusion_matrix(y_true, y_pred1)
print(cm2)