<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#20-news-groups-load" data-toc-modified-id="20-news-groups-load-1">20 news groups load</a></span></li><li><span><a href="#Spectral-Clustering" data-toc-modified-id="Spectral-Clustering-2">Spectral Clustering</a></span></li><li><span><a href="#PCA" data-toc-modified-id="PCA-3">PCA</a></span></li><li><span><a href="#SPECTRAL" data-toc-modified-id="SPECTRAL-4">SPECTRAL</a></span></li><li><span><a href="#SPECTRAL-V2" data-toc-modified-id="SPECTRAL-V2-5">SPECTRAL V2</a></span></li><li><span><a href="#3D" data-toc-modified-id="3D-6">3D</a></span></li></ul></div>

# 20 news groups load

In [1]:
import spectral
import scipy
from scipy import sparse
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
from matplotlib import pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default size (15 x 6) for every figure created below.
sns.set(rc={"figure.figsize": (15, 6)})
# Default colour cycle: a 10-colour palette built from seaborn's "Set2".
sns.set_palette(sns.color_palette("Set2", 10))

from sklearn import preprocessing, decomposition, model_selection

In [2]:
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the complete 20 Newsgroups corpus (subset='all' rather than only the
# train or test split).
all_news = fetch_20newsgroups(subset='all')

In [4]:
# TF-IDF vectoriser: English stop words removed, terms present in more than
# half of the documents dropped (max_df=0.5), sublinear term-frequency scaling,
# and the vocabulary capped at 500 terms (max_features=500).
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, sublinear_tf=True, max_features=500)

In [190]:
# Vectorise every post into a sparse TF-IDF matrix (documents x terms).
news_data = vectorizer.fit_transform(all_news.data)
# Integer class label of each post, and the newsgroup name behind each label.
news_target = all_news.target
news_target_names = all_news.target_names

# Vocabulary term behind each TF-IDF column.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

In [191]:
# Draw a reproducible random subset of posts. A dedicated, seeded generator
# makes "Restart & Run All" select the same documents every time without
# touching NumPy's global random state.
SUBSET_SIZE = 4000
SUBSET_SEED = 0
# Despite its name, `subset_mask` holds integer row indices, not booleans.
subset_mask = np.random.RandomState(SUBSET_SEED).permutation(len(news_target))[:SUBSET_SIZE]

# Dense feature matrix as a plain ndarray. `.todense()` would return the legacy
# `np.matrix` type, which recent scikit-learn estimators refuse to accept.
X = news_data[subset_mask].toarray()
y_brute = news_target[subset_mask]

In [192]:
# Sanity check: shapes of the dense feature matrix and of its label vector.
X.shape, y_brute.shape

((4000, 500), (4000,))

In [193]:
# Show the newsgroup names; a name's position in the list is its integer label.
news_target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [194]:
# One-column DataFrame (column 0) of newsgroup names; the row index is the
# integer target label.
df_targets = pd.DataFrame(news_target_names)

In [None]:
# Substrings identifying the parent categories to keep
# ('sci.' and 'misc.forsale' are deliberately left out).
parent_cat = ['comp.', 'rec.', 'religion', '.politics.']

# For each parent category, the target labels of the newsgroups whose name
# contains the category substring.
parent_cat_ind = []
for c in parent_cat:
    name_matches = np.array([c in group for group in df_targets[0]], dtype=bool)
    ind = df_targets.index[name_matches].values
    if c == 'religion':
        # 'alt.atheism' (label 0) does not contain "religion" but is grouped
        # with the religion newsgroups.
        ind = np.append(ind, 0)
    parent_cat_ind.append(ind)

In [None]:
# Inspect the target labels collected for each parent category.
parent_cat_ind

In [None]:
def is_in(s):
    """Build an element-wise membership test against ``s``.

    Parameters
    ----------
    s : container
        Any object supporting the ``in`` operator (set, list, ndarray, ...).

    Returns
    -------
    callable
        A vectorised function that maps an array-like to a boolean array of
        the same shape, ``True`` wherever the element is contained in ``s``.
    """
    # Declaring the output type lets the vectorised function accept empty
    # input: without `otypes`, np.vectorize raises ValueError on size-0 arrays
    # because it has no first element to probe the result type from.
    return np.vectorize(lambda x: x in s, otypes=[bool])

In [None]:
# Parent-category label of every sampled post. Start from the sentinel 999
# ("matched no parent category") and overwrite it category by category.
y = np.full_like(y_brute, 999)

for i, s in enumerate(parent_cat_ind):
    # Boolean mask of the posts whose original label belongs to category i.
    m = is_in(s)(y_brute)
    print(i, s, m)
    y[m] = i

In [None]:
# Distinct parent-category labels present in the subset
# (999 marks posts that matched no parent category).
set(y)

In [None]:
# Shape of the parent-category label vector.
y.shape

# Visit every newsgroup label in random order, printing its name, recording
# label -> name, and accumulating a mask of the posts carrying a visited label.
# Because all labels are visited, the mask ends up selecting every post.
targets = np.random.permutation(len(news_target_names))

label_to_name = {}
mask = np.zeros_like(news_target, dtype=bool)

for t in targets:
    name = news_target_names[t]
    print(name)
    label_to_name[t] = name
    mask |= news_target == t

# Spectral Clustering

In [None]:
# Count NaN entries among the parent-category labels.
# NaN never compares equal to anything (not even itself), so `y == np.nan` is
# False everywhere and would always report 0; np.isnan is the correct test.
print(np.count_nonzero(np.isnan(y)))

In [None]:
# Pairwise distances between posts, computed by the project-local `spectral`
# helper with metric='cosine' (presumably an n x n matrix; confirm in
# spectral.features_to_dist_matrix).
distances = spectral.features_to_dist_matrix(X, metric='cosine')

# Replace NaN entries in place (nan_to_num maps NaN to 0) and say so when any
# were found.
has_nan = np.isnan(distances).any()
if has_nan:
    print('there are some nan')
    distances = np.nan_to_num(distances, copy=False)

In [None]:
# Histogram (100 bins) of all distance entries; NaNs are replaced on a copy so
# `distances` itself is left untouched.
plt.hist(np.nan_to_num(distances, copy=True).flatten(), bins=100);

In [None]:
# Report how many entries of the distance matrix are exactly 0.
print('{} distances equal exactly zero.'.format(np.sum(distances == 0)))

In [None]:
# Convert distances into edge weights using the 'gaussian' option of the
# project-local helper (presumably a Gaussian kernel; confirm in
# spectral.dist_to_adj_matrix).
all_weights = spectral.dist_to_adj_matrix(distances, 'gaussian')

In [None]:
fix, axes = plt.subplots(2, 2, figsize=(17, 8))


def plot(weights, axes):
    """Visualise a weight matrix on a pair of axes.

    ``axes[0]`` receives the sparsity pattern of ``weights``; ``axes[1]``
    receives a 50-bin histogram of its strictly positive entries.
    """
    pattern_ax = axes[0]
    histogram_ax = axes[1]
    pattern_ax.spy(weights)
    positive_weights = weights[weights > 0]
    histogram_ax.hist(positive_weights.reshape(-1), bins=50)


# Filter the weight matrix with the project-local helper, passing NEIGHBORS
# (presumably the number of neighbours kept per node; confirm in
# spectral.filter_neighbors).
NEIGHBORS = 100
weights = spectral.filter_neighbors(all_weights, NEIGHBORS)

# Left column: full weight matrix. Right column: filtered weight matrix.
for column, matrix in enumerate((all_weights, weights)):
    plot(matrix, axes[:, column])

In [None]:
# Weighted degree of every node: the column sums of the filtered weight matrix.
degrees = weights.sum(axis=0)

# Distribution of the weighted degrees (50 bins, logarithmic count axis).
plt.hist(degrees, bins=50, log=True);

In [None]:
# Combinatorial Laplacian L = D - W, with D the diagonal degree matrix.
W = weights
D = np.diag(degrees)
L = D - W

# D^(-1/2): take the square roots of the degrees off the diagonal of D ** 0.5,
# then invert them element-wise.
sqrt_degrees = np.diag(D ** 0.5)
inv_sqrt_D = np.diag(1 / sqrt_degrees)

# Symmetric normalized Laplacian D^(-1/2) L D^(-1/2).
normalized_laplacian = inv_sqrt_D @ L @ inv_sqrt_D

In [None]:
# Sparsity pattern of the normalized Laplacian.
plt.spy(normalized_laplacian);

In [None]:
# The 20 eigenpairs of the normalized Laplacian whose eigenvalues are smallest
# in magnitude (which='SM').
eigenvalues, eigenvectors = sparse.linalg.eigsh(normalized_laplacian, k=20, which='SM') # which='SA' gives us similar results

In [None]:
# Plot the 20 computed eigenvalues in the order returned by eigsh.
plt.plot(eigenvalues, '.-', markersize=20);

In [None]:
# Build a graph from the non-zero pattern of the Laplacian. Off the diagonal
# this is the pattern of W (L = D - W with D diagonal), so connectivity matches
# the weight graph; the diagonal only contributes self-loops.
# `nx.from_numpy_matrix` was removed in NetworkX 3.0; `from_numpy_array` is
# the supported equivalent.
G = nx.from_numpy_array(np.asarray(L))
# Largest connected component. `nx.connected_component_subgraphs` was removed
# in NetworkX 2.4, so pick the largest node set and materialise it as an
# independent subgraph.
gc = G.subgraph(max(nx.connected_components(G), key=len)).copy()

In [None]:
# The graph must be connected: its largest component has to span every node.
assert len(gc.nodes()) == weights.shape[0], "GCC should contain all nodes"

# PCA

In [None]:
# Project the TF-IDF features onto their first 3 principal components.
features_pca = decomposition.PCA(n_components=3).fit_transform(X)

In [None]:
# Scatter of the first two principal components, coloured by parent-category
# label.
plt.scatter(features_pca[:, 0], features_pca[:, 1], c=y, cmap='Accent', alpha=0.5);

# SPECTRAL

In [None]:
# Plot the eigenvectors against the parent-category labels with the
# project-local helper (the exact rendering is defined in
# spectral.compare_plot_labels).
spectral.compare_plot_labels(eigenvectors, y)

# SPECTRAL V2

In [None]:
# Unsupervised two-way split: True where a post's component in the eigenvector
# at index 1 exceeds its component in the eigenvector at index 2.
unsupervised_labels = (eigenvectors[:, 1] - eigenvectors[:, 2]) > 0

In [None]:
# Score the unsupervised split against the parent-category labels with the
# project-local helper.
# NOTE(review): the helper is named `compute_err` but its result is reported
# as an accuracy; confirm which quantity it returns.
accuracy = spectral.compute_err(y, unsupervised_labels)

# Express as a percentage truncated (not rounded) to two decimal places.
accuracy_percent = int(accuracy * 10000) / 100
print('accuracy of {}%'.format(accuracy_percent))

In [None]:
# Side-by-side view of the spectral embedding: the left panel is coloured by
# the unsupervised split, the right panel by the parent-category labels.
f, axes = plt.subplots(nrows=1, ncols=2)

panels = [(axes[0], unsupervised_labels), (axes[1], y)]
for ax, lab in panels:
    spectral.plot_labels(eigenvectors, lab, ax=ax)

# 3D

In [None]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

In [None]:
# Initialise plotly's offline notebook mode.
init_notebook_mode(connected=True)
# Show the installed plotly version.
print(__version__)

In [None]:
traces = []
labels = sorted(set(y))

# Legend name of every label. Names are looked up by label value rather than
# by position in `labels`: if a parent category were absent from the sample,
# slicing the first len(parent_cat_ind) labels would pull in the 999 sentinel
# and index past the end of `parent_cat`.
labels_to_name = {
    label: parent_cat[label] for label in labels if label < len(parent_cat)
}
labels_to_name[999] = 999  # posts that matched no parent category

# 3-D coordinates of every post: the eigenvectors at indices 1 to 3.
# To plot the PCA projection instead, use `coords = features_pca[:, :3]`.
coords = eigenvectors[:, 1:4]

# One Scatter3d trace per label, so each label gets its own legend entry.
for label in labels:
    label_mask = y == label
    axis_x = coords[label_mask, 0]
    axis_y = coords[label_mask, 1]
    axis_z = coords[label_mask, 2]
    trace = go.Scatter3d(
        x=axis_x,
        y=axis_y,
        z=axis_z,
        name=labels_to_name[label],
        mode='markers',
        marker=dict(
            size=12,
            color=label,
            line=dict(
                width=2,
                color='black'
            )
        )
    )

    traces.append(trace)

In [None]:
# Figure layout: hover selects the closest point, and all four margins are
# zero.
zero_margin = dict(l=0, r=0, b=0, t=0)
layout = go.Layout(hovermode='closest', margin=zero_margin)

In [None]:
# Assemble the per-label traces and the layout into a figure and render it
# inline through plotly's offline mode.
data = traces

fig = go.Figure(data=data, layout=layout)
iplot(fig)