<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#20-news-groups-load" data-toc-modified-id="20-news-groups-load-1">20 news groups load</a></span></li><li><span><a href="#Spectral-Clustering" data-toc-modified-id="Spectral-Clustering-2">Spectral Clustering</a></span></li><li><span><a href="#PCA" data-toc-modified-id="PCA-3">PCA</a></span></li><li><span><a href="#SPECTRAL" data-toc-modified-id="SPECTRAL-4">SPECTRAL</a></span></li><li><span><a href="#SPECTRAL-V2" data-toc-modified-id="SPECTRAL-V2-5">SPECTRAL V2</a></span></li><li><span><a href="#3D" data-toc-modified-id="3D-6">3D</a></span></li></ul></div>

# 20 news groups load

In [1]:
import spectral
import scipy
from scipy import sparse
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
sns.set(rc={"figure.figsize": (15, 6)})
sns.set_palette(sns.color_palette("Set2", 10))

from sklearn import preprocessing, decomposition, model_selection

In [2]:
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
# Fetch the 20 Newsgroups corpus; subset='all' requests every available
# document rather than only the 'train' or 'test' portion.
all_news = fetch_20newsgroups(subset='all')

In [4]:
# TF-IDF vectorizer configuration: English stop words removed, terms that occur
# in more than half of the documents dropped (max_df=0.5), sublinear term
# frequency scaling enabled, vocabulary capped at 500 features.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, sublinear_tf=True, max_features=500)

In [5]:
# Fit the vectorizer on the raw posts and keep the sparse TF-IDF matrix together
# with the integer targets and their human-readable names.
news_data = vectorizer.fit_transform(all_news.data)
news_target = all_news.target
news_target_names = all_news.target_names

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the replacement when it exists and fall back for older installs;
# either way `feature_names` ends up as a plain list of strings.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [6]:
# Work on a fixed-size random sample so the dense matrices used later stay
# manageable. The sample is drawn from a seeded generator so that
# "Restart & Run All" reproduces the same subset (and therefore the same plots).
SUBSET_SIZE = 4000
SUBSET_SEED = 0
subset_mask = np.random.RandomState(SUBSET_SEED).permutation(len(news_target))[:SUBSET_SIZE]

# Dense copy of the sampled TF-IDF rows and the matching fine-grained targets.
X = news_data[subset_mask].todense()
y_brute = news_target[subset_mask]

In [7]:
# Display the newsgroup names; a name's position in this list is its target id.
news_target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [8]:
# One-column DataFrame (column 0) of newsgroup names; the row index is the target id.
df_targets = pd.DataFrame(news_target_names)

In [9]:
# Substrings identifying each coarse "parent" topic within the newsgroup names.
parent_cat = ['comp.', 'rec.', 'religion', '.politics.', 'sci.', 'misc.forsale']

# For every parent topic, collect the target ids whose name contains its substring.
parent_cat_ind = []
for c in parent_cat:
    name_matches = df_targets[0].map(lambda x: c in x)
    ind = df_targets.index[name_matches.values].values
    # Target 0 is additionally filed under the 'religion' group even though
    # its name does not contain that substring.
    if c == 'religion':
        ind = np.append(ind, 0)
    parent_cat_ind.append(ind)

In [10]:
def is_in(s):
    """Build an element-wise membership test against the container `s`.

    Parameters
    ----------
    s : container
        Anything supporting the `in` operator (list, set, ndarray, ...).

    Returns
    -------
    numpy.vectorize
        A callable that maps an array-like to a boolean array of the same
        shape, where each entry is `element in s`.
    """
    # Declaring the output type up front lets the vectorized function accept
    # empty inputs; without `otypes`, numpy has to call the function on the
    # first element to infer the dtype and raises on size-0 arrays.
    return np.vectorize(lambda x: x in s, otypes=[bool])

In [11]:
# Coarse labels: start every sample at the sentinel 999, then overwrite it with
# the position of the parent topic that contains the sample's fine-grained target.
y = np.full_like(y_brute, 999)

for i, s in enumerate(parent_cat_ind):
    # Boolean mask of the samples whose original target belongs to group `s`.
    m = is_in(s)(y_brute)
    print(i, s, m)
    y[m] = i

0 [1 2 3 4 5] [False  True False ..., False False False]
1 [ 7  8  9 10] [False False False ..., False False False]
2 [15 19  0] [False False  True ..., False False False]
3 [16 17 18] [False False False ...,  True False False]
4 [11 12 13 14] [ True False False ..., False  True  True]
5 [6] [False False False ..., False False False]


In [29]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

In [30]:
# Initialise plotly's offline notebook mode so that iplot() can render figures inline.
init_notebook_mode(connected=True)

In [31]:
def plot_new_point(message, n_neighbors=100, n_eigenvectors=10):
    """Embed new text together with the sampled corpus and plot it in 3-D.

    The message(s) are vectorized with the notebook-level `vectorizer`,
    appended to the global sample `X` / `y`, and the combined set is embedded
    using eigenvectors of the symmetric normalized graph Laplacian. The
    eigenvectors at positions 1, 2 and 3 of the `eigsh` result are used as
    the x, y and z coordinates of a plotly 3-D scatter, one trace per label.

    Parameters
    ----------
    message : str or iterable of str
        One document or several documents to place in the embedding. A bare
        string is treated as a single document.
    n_neighbors : int, default 100
        Value forwarded to `spectral.filter_neighbors` (presumably the number
        of neighbours kept per node — confirm against the `spectral` module).
    n_eigenvectors : int, default 10
        Number of eigenpairs requested from `eigsh`; must be at least 4
        because the plot reads eigenvector columns 1..3.

    Returns
    -------
    plotly.graph_objs.Figure
        Figure with one `Scatter3d` trace per label; the new point(s) form the
        trace named 'noname'.

    Raises
    ------
    ValueError
        If `n_eigenvectors` is smaller than 4.
    """
    # Import explicitly: `from scipy import sparse` does not by itself
    # guarantee that the `sparse.linalg` submodule has been loaded.
    from scipy.sparse.linalg import eigsh

    if n_eigenvectors < 4:
        raise ValueError('n_eigenvectors must be at least 4 to build a 3-D plot')

    # The vectorizer expects an iterable of documents, not a bare string.
    if isinstance(message, str):
        message = [message]

    data = vectorizer.transform(message)
    X_2 = scipy.sparse.csr_matrix.todense(data)
    y_2 = 10  # label reserved for the new point(s), outside the parent-topic ids
    new_X = np.append(X, X_2, axis=0)
    # One label per appended row so that label masks stay aligned with new_X
    # even when several messages are passed.
    new_y = np.append(y, np.full(X_2.shape[0], y_2, dtype=y.dtype))
    distances = spectral.features_to_dist_matrix(new_X, metric='cosine')

    if np.count_nonzero(np.isnan(distances)) > 0:
        print('there are some nan')
        distances = np.nan_to_num(distances, copy=False)

    all_weights = spectral.dist_to_adj_matrix(distances, 'gaussian')
    weights = spectral.filter_neighbors(all_weights, n_neighbors)

    degrees = np.sum(weights, axis=0)

    D = np.diag(degrees)
    W = weights
    L = D - W

    # D^(-1/2): D is diagonal, so the element-wise square root of its diagonal suffices.
    inv_sqrt_D = np.diag(1 / np.diag(D**(0.5)))

    normalized_laplacian = inv_sqrt_D @ L @ inv_sqrt_D
    # which='SA' gives us similar results
    eigenvalues, eigenvectors = eigsh(normalized_laplacian, k=n_eigenvectors, which='SM')

    labels = sorted(set(new_y))
    # Only labels that are valid positions in `parent_cat` get a topic name;
    # anything else (e.g. the 999 sentinel) falls back to its numeric value below.
    labels_to_name = {
        label: parent_cat[label]
        for label in labels
        if 0 <= label < len(parent_cat)
    }
    labels_to_name[y_2] = 'noname'

    traces = []
    for label in labels:
        label_mask = new_y == label
        # Column 0 is skipped; columns 1..3 provide the three plot axes.
        axis_x = eigenvectors[:, 1][label_mask]
        axis_y = eigenvectors[:, 2][label_mask]
        axis_z = eigenvectors[:, 3][label_mask]

        trace = go.Scatter3d(
            x=axis_x,
            y=axis_y,
            z=axis_z,
            name=labels_to_name.get(label, str(label)),
            mode='markers',
            marker=dict(
                size=12,
                color=label,
                line=dict(
                    width=2,
                    color='black'
                )
            )
        )

        traces.append(trace)

    layout = go.Layout(
        hovermode='closest',
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )

    fig = go.Figure(data=traces, layout=layout)
    return fig

In [22]:
# A single previously unseen document, wrapped in a list because the
# vectorizer works on collections of documents.
new_message = ["Jesus is our lord in the sky and on earth"]

In [24]:
# Build the 3-D scatter figure that contains the sampled corpus plus the new message.
fig = plot_new_point(new_message)

In [32]:
# Render the figure inline using plotly's offline mode.
iplot(fig)