In [32]:
# Load IPython's autoreload extension so edited modules can be re-imported
# without restarting the kernel.
%load_ext autoreload
# Mode 1: only modules registered through %aimport are reloaded before each
# cell execution.
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Register the `utils` helper module with autoreload; the notebook later calls
# utils.clear_text and utils.bag_of_words from it.
%aimport utils

In [51]:
import sys

# Put the parent directory on the import path (presumably so the `lab1`
# package imported further down can be found -- confirm the project layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate ".." entry each time.
if ".." not in sys.path:
    sys.path.append("..")

# Books

In [217]:
import pandas as pd
import numpy as np
import plotly.express as px

from lab1.lab1 import one_hot_encoder

## Preprocessing

In [254]:
# Read the Google Books CSV into a DataFrame.
books_csv = '../datasets/google_books_1299.csv'
df = pd.read_csv(books_csv)

In [255]:
# Narrow the frame to the three columns used by the rest of the notebook.
wanted_columns = ["title", "description", "generes"]
df = df[wanted_columns]

In [256]:
# Keep only books with a known genre and no missing field: the boolean mask
# removes rows whose genre is the literal 'none', and dropna() removes rows
# with a missing value in any column.
df = df[df["generes"] != 'none'].dropna()

# Reduce the ', '-separated genre list to its second entry.
# NOTE: this raises IndexError for a row that lists fewer than two genres, and
# the cell is not re-runnable as-is (a second run would split the
# already-reduced value).
df["generes"] = df["generes"].apply(lambda genres: genres.split(', ')[1])

# Clean each description with utils.clear_text (defined outside this notebook).
df["description"] = df["description"].apply(utils.clear_text)

In [257]:
# Build the vocabulary: concatenate every description, split on single spaces
# and keep the sorted array of distinct tokens.
all_text = ' '.join(df['description'])
head = np.unique(all_text.split(' '))

In [259]:
# Replace each description with the result of utils.bag_of_words(text, head),
# i.e. its encoding against the vocabulary built in `head`.
df["description"] = df["description"].apply(utils.bag_of_words, args=(head,))

In [260]:
# Sanity check: summary statistics (count / unique / top / freq) of the
# processed frame.
df.describe()

Unnamed: 0,title,description,generes
count,772,772,772
unique,241,772,100
top,The Queen of Nothing (The Folk of the Air #3),"[0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",Economics
freq,12,1,111


In [261]:
# Persist the processed frame as JSON (presumably so the preprocessing does
# not have to be repeated in later sessions).
df.to_json("../datasets/formated_books.json")

In [264]:
# Reload the processed frame from the JSON file; from here on `df` is the
# on-disk version rather than the in-memory result of the preprocessing.
df = pd.read_json('../datasets/formated_books.json')

In [265]:
# Display the reloaded frame (title, encoded description vector, genre).
df

Unnamed: 0,title,description,generes
1,Antiques Roadkill: A Trash 'n' Treasures Mystery,"[0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Mystery &amp
2,The Art of Super Mario Odyssey,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Activities
4,"The Painted Man (The Demon Cycle, Book 1)","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Fantasy
6,God of War: The Official Novelization,"[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Media Tie-In
7,Edgedancer: From the Stormlight Archive,"[0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Fantasy
...,...,...,...
1288,The Essentials of Finance and Accounting for N...,"[0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Economics
1289,The Magic of Thinking Big,"[0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Personal Growth
1294,Twas The Nightshift Before Christmas: Festive ...,"[0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Health Care Delivery
1295,Why We Sleep: The New Science of Sleep and Dreams,"[0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Cognitive Psychology &amp


## PCA

In [266]:
from sklearn.decomposition import PCA

### 2 dimensions

In [267]:
# PCA reducer for a 2-D projection.
# random_state pins the randomized SVD solver that scikit-learn may select for
# a matrix of this size, so the embedding is reproducible across runs.
pca = PCA(n_components=2, random_state=42)

In [268]:
# Fit PCA on the encoded descriptions (one vector per book) and project them.
transformed = pca.fit_transform(df["description"].tolist())
# One 1-D coordinate array per component. Unpacking the transpose works for
# any number of books instead of relying on a hard-coded row count.
x, y = transformed.T
np.shape(x), np.shape(y)

((772,), (772,))

In [269]:
# 2-D PCA embedding, coloured by genre; hovering shows the book title.
# Title and axis labels let the figure be told apart from the other
# embeddings plotted in this notebook.
px.scatter(x=x,
           y=y,
           color=df["generes"],
           hover_name=df["title"],
           title="Books: PCA of description vectors (2 components)",
           labels={"x": "PC 1", "y": "PC 2"})

### 3 dimensions

In [270]:
# PCA reducer for a 3-D projection.
# random_state pins the randomized SVD solver that scikit-learn may select for
# a matrix of this size, so the embedding is reproducible across runs.
pca = PCA(n_components=3, random_state=42)

In [271]:
# Fit PCA on the encoded descriptions (one vector per book) and project them.
transformed = pca.fit_transform(df["description"].tolist())
# One 1-D coordinate array per component. Unpacking the transpose works for
# any number of books instead of relying on a hard-coded row count.
x, y, z = transformed.T
np.shape(x), np.shape(y)

((772,), (772,))

In [272]:
# 3-D PCA embedding, coloured by genre; hovering shows the book title.
# Title and axis labels let the figure be told apart from the other
# embeddings plotted in this notebook.
px.scatter_3d(x=x,
              y=y,
              z=z,
              color=df["generes"],
              hover_name=df["title"],
              title="Books: PCA of description vectors (3 components)",
              labels={"x": "PC 1", "y": "PC 2", "z": "PC 3"})

## UMAP

In [222]:
import umap

### 2 dimensions 

In [278]:
# Embed the encoded descriptions in 2-D with UMAP.
# n_neighbors is tied to the number of distinct genres in the frame.
# random_state fixes the otherwise stochastic layout so the figure is
# reproducible (per the umap-learn docs, a fixed seed disables parallelism).
n_genres = len(np.unique(df["generes"]))
transformed = umap.UMAP(n_neighbors=n_genres,
                        n_components=2,
                        min_dist=0.3,
                        metric='correlation',
                        random_state=42).fit_transform(
                                            df["description"].tolist()
                                            )

In [279]:
# Separate the embedding into one 1-D coordinate array per axis. Unpacking the
# transpose works for any number of rows instead of a hard-coded count.
x, y = transformed.T
np.shape(x), np.shape(y)

((772,), (772,))

In [280]:
# 2-D UMAP embedding, coloured by genre; hovering shows the book title.
# Title and axis labels let the figure be told apart from the other
# embeddings plotted in this notebook.
px.scatter(x=x,
           y=y,
           color=df["generes"],
           hover_name=df["title"],
           title="Books: UMAP of description vectors (2 components)",
           labels={"x": "UMAP 1", "y": "UMAP 2"})

### 3 dimensions 

In [281]:
# Embed the encoded descriptions in 3-D with UMAP.
# n_neighbors is tied to the number of distinct genres in the frame.
# random_state fixes the otherwise stochastic layout so the figure is
# reproducible (per the umap-learn docs, a fixed seed disables parallelism).
n_genres = len(np.unique(df["generes"]))
transformed = umap.UMAP(n_neighbors=n_genres,
                        n_components=3,
                        min_dist=0.3,
                        metric='correlation',
                        random_state=42).fit_transform(
                                            df["description"].tolist()
                                            )

In [282]:
# Separate the embedding into one 1-D coordinate array per axis. Unpacking the
# transpose works for any number of rows instead of a hard-coded count.
x, y, z = transformed.T
np.shape(x), np.shape(y), np.shape(z)

((772,), (772,), (772,))

In [283]:
# 3-D UMAP embedding, coloured by genre; hovering shows the book title.
# Title and axis labels let the figure be told apart from the other
# embeddings plotted in this notebook.
px.scatter_3d(x=x,
              y=y,
              z=z,
              color=df["generes"],
              hover_name=df["title"],
              title="Books: UMAP of description vectors (3 components)",
              labels={"x": "UMAP 1", "y": "UMAP 2", "z": "UMAP 3"})

## t-SNE

In [285]:
from sklearn.manifold import TSNE

### 2 dimensions 

In [291]:
# Embed the encoded descriptions in 2-D with t-SNE.
# perplexity is tied to the number of distinct genres; it must stay below the
# number of books.
# random_state makes the stochastic optimisation reproducible.
# The input is converted to an ndarray up front: recent scikit-learn releases
# read X.shape before validating the input, which a list of lists lacks.
n_genres = len(np.unique(df["generes"]))
features = np.asarray(df["description"].tolist())
transformed = TSNE(
    n_components=2,
    perplexity=n_genres,
    random_state=42,
).fit_transform(features)

In [292]:
# Separate the embedding into one 1-D coordinate array per axis. Unpacking the
# transpose works for any number of rows instead of a hard-coded count.
x, y = transformed.T
np.shape(x), np.shape(y)

((772,), (772,))

In [293]:
# 2-D t-SNE embedding, coloured by genre; hovering shows the book title.
# Title and axis labels let the figure be told apart from the other
# embeddings plotted in this notebook.
px.scatter(x=x,
           y=y,
           color=df["generes"],
           hover_name=df["title"],
           title="Books: t-SNE of description vectors (2 components)",
           labels={"x": "t-SNE 1", "y": "t-SNE 2"})

### 3 dimensions 

In [294]:
# Embed the encoded descriptions in 3-D with t-SNE.
# perplexity is tied to the number of distinct genres; it must stay below the
# number of books.
# random_state makes the stochastic optimisation reproducible.
# The input is converted to an ndarray up front: recent scikit-learn releases
# read X.shape before validating the input, which a list of lists lacks.
n_genres = len(np.unique(df["generes"]))
features = np.asarray(df["description"].tolist())
transformed = TSNE(
    n_components=3,
    perplexity=n_genres,
    random_state=42,
).fit_transform(features)

In [295]:
# Separate the embedding into one 1-D coordinate array per axis. Unpacking the
# transpose works for any number of rows instead of a hard-coded count.
x, y, z = transformed.T
np.shape(x), np.shape(y), np.shape(z)

((772,), (772,), (772,))

In [296]:
# 3-D t-SNE embedding, coloured by genre; hovering shows the book title.
# Title and axis labels let the figure be told apart from the other
# embeddings plotted in this notebook.
px.scatter_3d(x=x,
              y=y,
              z=z,
              color=df["generes"],
              hover_name=df["title"],
              title="Books: t-SNE of description vectors (3 components)",
              labels={"x": "t-SNE 1", "y": "t-SNE 2", "z": "t-SNE 3"})