## Lab 4.3

#### Setup your imports

In [1]:
import pandas as pd 
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt

#### 1. Pull the training set from the newsgroup data

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [3]:
y = newsgroups_train.target
x = newsgroups_train.data

#### 2. Create the vectorizer 

In [4]:
vectorizer = CountVectorizer(max_features = 1000,
                             ngram_range=(1, 2),
                             stop_words='english',
                             binary=True)

#### 3. Create the Truncated Singular Value Decomposition

In [5]:
svd = TruncatedSVD(n_components=50, random_state=42)

#### 4. Setup your k-means clustering

In [6]:
print newsgroups_train.target

[7 4 4 ..., 3 1 8]


In [7]:
k = 10
km = KMeans(n_clusters=k)

#### 5. Fit the vectorizer and SVD

In [8]:
X = vectorizer.fit_transform(x)

In [9]:
X2 = svd.fit_transform(X)

#### 7. Fit the kmeans

In [10]:
km.fit(X2)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=10, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [11]:
labels = km.labels_
centroids = km.cluster_centers_

In [12]:
print labels
print centroids

[9 9 1 ..., 6 3 9]
[[  4.32792035e+00  -1.15070242e-01   8.74496838e-01  -6.07072417e-01
   -1.29970811e-02  -1.41248157e-01   1.50315767e-01   1.60373792e-01
   -1.82131829e-01   7.20658892e-02   1.19618613e-01  -3.25913911e-02
   -1.86246051e-02   1.03941156e-01   2.55800888e-02  -1.69496201e-01
   -7.31721936e-03   1.89012616e-02  -7.63338569e-02   2.15183286e-02
   -1.09688129e-02  -2.27228378e-02  -1.10801904e-02   2.28741642e-02
    7.05062605e-03   2.76678207e-02  -4.49504539e-02   5.02603803e-02
   -3.50048101e-02   3.25842497e-02  -4.34251115e-02  -4.88250726e-02
   -7.95995012e-02  -1.29430237e-02   4.24683491e-02  -4.42085038e-03
   -4.90827656e-02   3.09577191e-02   3.43393340e-03  -1.74960392e-02
    1.78162968e-02   1.03319842e-04  -7.79935266e-03   6.37039262e-02
   -2.50635420e-02  -4.40351349e-03  -7.39779274e-03   2.95970359e-02
   -1.23696646e-02   2.31215832e-02]
 [  3.41459062e+00   1.33022932e+00  -5.68027119e-01  -3.25852263e-01
    5.83463470e-02  -1.34259948e-0

#### 8. Check the performance of our kmeans test

In [13]:
metrics.accuracy_score(y, labels)

0.043044016263036944

#### Classification Report

In [14]:
print metrics.classification_report(y, labels)

             precision    recall  f1-score   support

          0       0.06      0.09      0.07       480
          1       0.01      0.03      0.02       584
          2       0.00      0.00      0.00       591
          3       0.03      0.09      0.05       590
          4       0.04      0.13      0.06       578
          5       0.09      0.09      0.09       593
          6       0.09      0.22      0.13       585
          7       0.03      0.00      0.01       594
          8       0.03      0.07      0.04       598
          9       0.04      0.12      0.06       597
         10       0.00      0.00      0.00       600
         11       0.00      0.00      0.00       595
         12       0.00      0.00      0.00       591
         13       0.00      0.00      0.00       594
         14       0.00      0.00      0.00       593
         15       0.00      0.00      0.00       599
         16       0.00      0.00      0.00       546
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


#### Confusion Matrix

In [15]:
print(metrics.confusion_matrix(y, labels))

[[ 43  67  30 113 123  37  14   4  19  30   0   0   0   0   0   0   0   0
    0   0]
 [ 21  15   2  47  72  56 101   4 119 147   0   0   0   0   0   0   0   0
    0   0]
 [ 18  28   1  59  89  34  88   2 150 122   0   0   0   0   0   0   0   0
    0   0]
 [ 25  24   4  53  76  26 125   2  99 156   0   0   0   0   0   0   0   0
    0   0]
 [ 20  23   1  79  74  41  83   1 102 154   0   0   0   0   0   0   0   0
    0   0]
 [ 17  16   0  54  58  56 123  12  92 165   0   0   0   0   0   0   0   0
    0   0]
 [ 10   9   0  18  16  21 127   0 171 213   0   0   0   0   0   0   0   0
    0   0]
 [ 45  62   6 103 117  30  64   2  71  94   0   0   0   0   0   0   0   0
    0   0]
 [ 27  41   2 138 147  32  62   1  42 106   0   0   0   0   0   0   0   0
    0   0]
 [ 42  49  12 125 122  26  40   0 107  74   0   0   0   0   0   0   0   0
    0   0]
 [ 46  42  13  89 133  20  39   1 111 106   0   0   0   0   0   0   0   0
    0   0]
 [ 78  85  18  93 106  20 102  10  32  51   0   0   0   0   0   0

#### Note: Repeat the kmeans test with varying values of "k" to determine the best performance. Use the techniques that we learned about in the *Tuning Clusters* lesson to further tune the clusters