In [None]:

import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score , auc
from sklearn.model_selection import train_test_split
from skimage import io
from skimage.color import rgb2gray
from skimage.transform import resize
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from keras.layers import Input, Dense,Conv2D , MaxPooling2D, Flatten,BatchNormalization,Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image_dataset_from_directory
import cv2
from PIL import Image 


In [None]:
# Load the training metadata CSV into `train_df`; later cells read its
# `image_id`, `label`, `image_width`, `image_height` and `is_tma` columns.
train_df= pd.read_csv('/kaggle/input/UBC-OCEAN/train.csv')
# Preview the first rows.
train_df.head()

In [None]:
# Print the column dtypes and non-null counts of the training metadata.
train_df.info()

In [None]:
# Number of distinct values in each column.
train_df.nunique()

In [None]:
# Number of rows per value of the `label` column (class balance).
train_df['label'].value_counts()

In [None]:
# Summary statistics of `train_df` (describe() with its default settings).
train_df.describe()

In [None]:
# Pie chart of how the rows are distributed over the `label` values.
# `colors_list` / `explode_list` keep their names because they are notebook globals.
colors_list = ['yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen', 'pink']
# One offset per wedge; value_counts() orders descending, so the two non-zero
# offsets pull out the two least frequent labels.
explode_list = [0, 0, 0, 0.1, 0.1]

label_counts = train_df['label'].value_counts()
pie_ax = label_counts.plot.pie(
    autopct="%.1f%%",
    shadow=True,
    colors=colors_list,
    explode=explode_list,
)
pie_ax.set_title("Ovarian Cancer Types Distributions")
pie_ax.legend(loc='upper left', fontsize=7)
plt.show()


## Detecting Outliers from the data

In [None]:
# Box plot of the plottable columns of `train_df`, used to eyeball outliers.
box_ax = train_df.plot.box(figsize=(8, 6))
box_ax.set_title('Defining the outliers')
plt.show()

In [None]:
# Directory listings of the train and test image folders, plus their sizes.
train_image_path = "/kaggle/input/UBC-OCEAN/train_images"
test_image_path = "/kaggle/input/UBC-OCEAN/test_images"
train_image, test_image = (
    os.listdir(folder) for folder in (train_image_path, test_image_path)
)

# Train count first, then test count, one per line.
for listing in (train_image, test_image):
    print(len(listing))

In [None]:
# Directory listings of the train and test thumbnail folders, plus their sizes.
train_thumb_path = '/kaggle/input/UBC-OCEAN/train_thumbnails'
test_thumb_path = '/kaggle/input/UBC-OCEAN/test_thumbnails'
train_thumb_image, test_thumb_image = map(
    os.listdir, (train_thumb_path, test_thumb_path)
)

# Train count on the first line, test count on the second.
print(len(train_thumb_image), len(test_thumb_image), sep='\n')

In [None]:
# Count the rows per `is_tma` value; reset_index() turns the resulting
# Series into a DataFrame for display.
train_df['is_tma'].value_counts().reset_index()

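## Splitting the data by `is_tma`: `dec_cancer` holds the rows where `is_tma` is True, `not_dec_cancer` the rows where it is False

Note: `is_tma` marks tissue-microarray (TMA) images; it does not indicate whether cancer was detected.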

In [None]:
# Rows of `train_df` whose `is_tma` flag is True.
# .copy() makes this an independent frame: a later cell adds a column to it,
# and doing that on a boolean-mask selection triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
dec_cancer = train_df[train_df['is_tma'] == True].copy()
dec_cancer

In [None]:
# Rows of `train_df` whose `is_tma` flag is False.
# .copy() makes this an independent frame: a later cell adds a column to it,
# and doing that on a boolean-mask selection triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
not_dec_cancer = train_df[train_df['is_tma'] == False].copy()
not_dec_cancer

In [None]:
# Derive each row's thumbnail file name ("<image_id>_thumbnail.png") and store
# it in a new `image_id_path` column.
thumbnail_name = "{}_thumbnail.png".format
not_dec_cancer['image_id_path'] = not_dec_cancer['image_id'].map(thumbnail_name)
not_dec_cancer

In [None]:
# Build a "<image_id>_thumbnail.png" file name for every row and store it in a
# new `image_id_path` column.
# NOTE(review): the later plotting cell reads these rows from train_images/,
# not from the thumbnail folder - confirm thumbnails exist for is_tma == True rows.
dec_cancer['image_id_path'] = [
    f"{image_id}_thumbnail.png" for image_id in dec_cancer['image_id']
]
dec_cancer


## Image Data Processing

### Thumbnail Data Images

In [None]:
# Second name for the train_images directory listing (same list object, not a copy).
train_images_ = train_image
# Preview the first 10 file names.
train_images_[:10]

In [None]:
# Show up to five full-size `dec_cancer` (is_tma == True) images per label,
# one figure (a single 1x5 row) per label.
labels = ['HGSC', 'LGSC', 'EC', 'CC', 'MC']
max_per_row = 5  # number of columns in each figure's single row

for lab in labels:
    # `dec_cancer` already contains only is_tma == True rows, so filtering by
    # label is sufficient here.
    imgs_with_label = dec_cancer[dec_cancer.label == lab]
    img_ids = list(imgs_with_label.image_id)
    figure = plt.figure(figsize=(22, 6))
    i = 0  # number of images successfully drawn in this figure
    for img_id in img_ids:
        if i >= max_per_row:
            # The row is full; add_subplot would raise for a sixth position.
            break
        img_path = f'/kaggle/input/UBC-OCEAN/train_images/{img_id}.png'
        try:
            img = io.imread(img_path)
        except Exception as err:
            # Best effort: report the image that could not be loaded and move
            # on, without swallowing KeyboardInterrupt/SystemExit.
            print(f'Image #{img_id} could not be loaded: {err}')
            continue
        # Create the axes only after a successful load so a failure does not
        # leave an empty subplot that the next image would overlap.
        ax = figure.add_subplot(1, max_per_row, i + 1)
        ax.imshow(img)
        ax.set_title(f'Image Id:{img_id}, label:{lab}', fontsize=14)
        ax.tick_params(labelbottom=False, labelleft=False, labelright=False, labeltop=False,
                       bottom=False, left=False, right=False, top=False)
        i += 1
    figure.tight_layout()
plt.show()

In [None]:
# Second name for the train_thumbnails directory listing (same list object, not a copy).
train_thumbnails_ = train_thumb_image
# Preview the first 5 file names.
train_thumbnails_[:5]

In [None]:
# Preview the thumbnails of the first 20 `not_dec_cancer` rows on a 6x4 grid,
# each titled with its label.
thumb_path = "/kaggle/input/UBC-OCEAN/train_thumbnails"
n_preview = 20

fig = plt.figure(figsize=(15, 20))
preview_files = not_dec_cancer['image_id_path'][:n_preview]
preview_labels = not_dec_cancer['label'][:n_preview]
for j, (img, lb) in enumerate(zip(preview_files, preview_labels), start=1):
    ax = fig.add_subplot(6, 4, j)
    # Build the path from `thumb_path` instead of repeating the directory literal.
    path = os.path.join(thumb_path, img)
    image = plt.imread(path)
    ax.imshow(image)
    ax.set_title(f"Label:{lb}")
plt.show()


In [None]:
# Pairwise scatter/density plots of the image dimensions and ids, coloured by `is_tma`.
pair_columns = ['image_width', 'image_height', 'image_id']
sns.pairplot(data=train_df, vars=pair_columns, hue='is_tma')
plt.show()

In [None]:
# Sorted integer ids parsed from the file names in the two training folders.
# os.listdir() yields bare file names, so the id is the text before the first
# '.' (images) or the first '_' (thumbnails).
train_images = sorted(
    int(name.partition('.')[0])
    for name in os.listdir('/kaggle/input/UBC-OCEAN/train_images')
)
train_thumbnails = sorted(
    int(name.partition('_')[0])
    for name in os.listdir('/kaggle/input/UBC-OCEAN/train_thumbnails')
)

print('Number of train images:', len(train_images))
print('Number of train thumbnails:', len(train_thumbnails))

In [None]:
# Load one test thumbnail with OpenCV and display it.
test_thumbnail_file = '/kaggle/input/UBC-OCEAN/test_thumbnails/41_thumbnail.png'
test_thumbnail = cv2.imread(test_thumbnail_file)
# cv2.imread signals a missing/unreadable file by returning None instead of raising.
if test_thumbnail is None:
    raise FileNotFoundError(f'Could not read image: {test_thumbnail_file}')
# OpenCV stores channels as BGR while matplotlib expects RGB; convert for
# display only, so `test_thumbnail` itself stays exactly as cv2.imread returned it.
plt.imshow(cv2.cvtColor(test_thumbnail, cv2.COLOR_BGR2RGB))

In [None]:
# Print min, max and mean (in that order) of the image width, then of the image height.
for dimension in ('image_width', 'image_height'):
    values = train_df[dimension]
    for statistic in (values.min, values.max, values.mean):
        print(statistic())

In [None]:
# Load the test-set metadata CSV into `test_data`.
test_data = pd.read_csv("/kaggle/input/UBC-OCEAN/test.csv")
# Preview the first rows.
test_data.head()