DIMENSIONALITY REDUCTION

In [5]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# Seed NumPy's global random number generator so that any step drawing from it
# gives the same result on every run of this notebook.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes: large axis labels, smaller tick labels on both axes.
mpl.rcParams.update({
    'axes.labelsize': 40,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
})

# Directory that receives every figure written by save_fig()
PROJECT_ROOT_DIR = "."


def save_fig(fig_id, tight_layout=True):
    """Write the current matplotlib figure to ``<PROJECT_ROOT_DIR>/<fig_id>.png``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file; the ".png" extension is appended.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    """
    filename = fig_id + ".png"
    destination = os.path.join(PROJECT_ROOT_DIR, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    # Saved as PNG at 300 dpi
    plt.savefig(destination, format='png', dpi=300)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

import pandas as pd

def plot_img(data):
    """Draw ``data`` as a grayscale image on the current axes, with the axes hidden.

    ``data`` is passed straight to ``plt.imshow``; nearest-neighbour
    interpolation is requested so pixels are not smoothed.
    """
    imshow_options = {"cmap": "gray", "interpolation": "nearest"}
    plt.imshow(data, **imshow_options)
    plt.axis("off")


In [6]:
# Load the mammogram dataset from a pickle file.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# Other input files, left disabled:
#data = pd.read_pickle('MAMMOGRAMS_DATA')
#data = pd.read_pickle('images_no_preprocessed')
data = pd.read_pickle('MAMMOGRAMS_augmented_2')

# Optional preprocessing, currently disabled:
#data = data.drop(['x_pos','y_pos'], axis=1)
#data['images'] = data['images'].apply(np.ravel)
n_rows, n_cols = data.shape
print("Number of rows in data =", n_rows)
print("Number of columns in data =", n_cols)
print("\n**Sample data:**")

data.head()

Number of rows in data = 615
Number of columns in data = 5

**Sample data:**


Unnamed: 0,tissue,abnormality,severity,radius_of_anormality,images
0,2.0,3.0,0.0,197.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2.0,3.0,0.0,69.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0.0,5.0,2.0,-99999.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,0.0,5.0,2.0,-99999.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1.0,3.0,0.0,30.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
# For each severity code, count how many rows match it (True) and how many do not (False).
# The one-letter labels presumably stand for benign / malignant / normal — TODO confirm.
for label, severity_code in (('b', 0), ('m', 1), ('n', 2)):
    print(label, (data['severity'] == severity_code).value_counts())

b False    477
True     138
Name: severity, dtype: int64
m False    345
True     270
Name: severity, dtype: int64
n False    408
True     207
Name: severity, dtype: int64


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to preserve that share of the variance (95% here).
pca = PCA(n_components=0.95)



In [9]:
# Reduce every image on its own: reshape the flat pixel vector to 796 x 1360 and
# refit the shared `pca` object on that single image (rows act as samples,
# columns as features). Because PCA is refit per image, the number of retained
# components can differ from one image to the next.
len_data = data.shape[0]
Images_red_pca = []
for flat_pixels in data['images']:
    print("*", end="")  # one progress mark per image
    image_2d = flat_pixels.reshape(796, 1360)
    Images_red_pca.append(pca.fit_transform(image_2d))

***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

In [10]:
# Attach the per-image PCA projections as a new column (one array per row)
data['images_red_pca'] = list(Images_red_pca)
data.head()

Unnamed: 0,tissue,abnormality,severity,radius_of_anormality,images,images_red_pca
0,2.0,3.0,0.0,197.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-1052.054918681725, -152.8478493246391, 183...."
1,2.0,3.0,0.0,69.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-586.3384354328837, 1347.253044739814, 704.5..."
2,0.0,5.0,2.0,-99999.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[1498.6148471320926, -116.21343187406835, -18..."
3,0.0,5.0,2.0,-99999.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[1744.6962758039979, -200.42765474770198, 286..."
4,1.0,3.0,0.0,30.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[1184.3088659993653, 306.47244271551443, -282..."


In [11]:
# Number of principal components kept for each image, i.e. the size of the
# second axis of its projection; show the first 50.
dimensions_red = []
for projection in data['images_red_pca']:
    dimensions_red.append(projection.shape[1])
print(dimensions_red[:50])

[8, 9, 9, 9, 16, 16, 18, 16, 17, 13, 10, 12, 14, 10, 11, 9, 10, 9, 11, 14, 18, 13, 10, 12, 12, 21, 19, 25, 22, 9, 9, 13, 13, 11, 14, 9, 10, 8, 10, 8, 7, 12, 13, 9, 10, 15, 18, 13, 16, 12]


In [12]:
# Fraction of variance discarded by PCA.
# Caveat: `pca` was refit on every image in the reduction loop, so this value
# describes only the last image that was fitted, not the dataset as a whole.
1 - pca.explained_variance_ratio_.sum()

0.04730826470757965

In [13]:
# scikit-learn's TSNE, left here disabled for reference:
#from sklearn.manifold import TSNE

#tsne = TSNE(n_components=3, random_state=42)

# MulticoreTSNE is used instead, imported under the same TSNE name.
from MulticoreTSNE import MulticoreTSNE as TSNE

# Two-dimensional embedding with a fixed random_state.
# NOTE(review): n_jobs=-1 presumably requests all available cores — confirm for
# the installed MulticoreTSNE version.
tsne = TSNE(n_jobs=-1,n_components=2,random_state=42)

from tqdm import tqdm


In [14]:
# Run t-SNE separately on each image's PCA projection (one fit per image);
# tqdm reports progress over the images.
tsne_dat = []
for pca_projection in tqdm(data['images_red_pca']):
    tsne_dat.append(tsne.fit_transform(pca_projection))

100%|██████████| 615/615 [39:01<00:00,  3.87s/it]


In [15]:
# Attach the per-image t-SNE embeddings as a new column (one array per row)
data['tsne_imgs'] = list(tsne_dat)

In [16]:
# Preview the first rows, now including the 'images_red_pca' and 'tsne_imgs' columns
data.head()

Unnamed: 0,tissue,abnormality,severity,radius_of_anormality,images,images_red_pca,tsne_imgs
0,2.0,3.0,0.0,197.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-1052.054918681725, -152.8478493246391, 183....","[[-12.874163712320122, 24.7249853050932], [-12..."
1,2.0,3.0,0.0,69.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-586.3384354328837, 1347.253044739814, 704.5...","[[-10.991412401623322, 6.168735868557621], [-1..."
2,0.0,5.0,2.0,-99999.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[1498.6148471320926, -116.21343187406835, -18...","[[5.216777947474753, 18.41380630238956], [5.25..."
3,0.0,5.0,2.0,-99999.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[1744.6962758039979, -200.42765474770198, 286...","[[-13.08081876335438, -2.7253877893140874], [-..."
4,1.0,3.0,0.0,30.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[1184.3088659993653, 306.47244271551443, -282...","[[-17.387718596989156, 18.421740759324877], [-..."


In [17]:
# Persist the frame, including the derived PCA and t-SNE columns
output_pickle = 'final_data_augmented_2'
data.to_pickle(output_pickle)