In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import load_files
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import homogeneity_score

In [2]:
# Folder holding the corpus that load_files reads. A plain relative name is
# used instead of a hard-coded "\\" separator, which only resolves on Windows.
DATA_DIR = "bbc"
# Read every document as UTF-8 text (undecodable bytes are replaced rather
# than raising); random_state fixes the order in which documents are returned.
data = load_files(DATA_DIR, encoding="utf-8", decode_error="replace", random_state=1000)
# One row per document: the raw text and its integer category label.
df = pd.DataFrame(list(zip(data['data'], data['target'])), columns=['text', 'label'])


In [3]:
# Preview the first rows (up to 30) of the text/label table.
df.head(30)

Unnamed: 0,text,label
0,News Corp eyes video games market\n\nNews Corp...,0
1,Wasps 31-37 Leicester\n\nLeicester withstood a...,3
2,Looks and music to drive mobiles\n\nMobile pho...,4
3,'Errors' doomed first Dome sale\n\nThe initial...,2
4,Sculthorpe wants Lions captaincy\n\nPaul Scult...,3
5,Mobile gaming takes off in India\n\nGaming on ...,4
6,Watchdog probes e-mail deletions\n\nThe inform...,2
7,Edwards tips Idowu for Euro gold\n\nWorld outd...,3
8,Parmar ruled out of Davis Cup tie\n\nA knee in...,3
9,World tour for top video gamers\n\nTwo UK game...,4


In [4]:
# Show only the documents carrying label 2 (first 30 at most) to get a feel
# for what that category contains.
df.loc[df['label'] == 2].head(30)

Unnamed: 0,text,label
3,'Errors' doomed first Dome sale\n\nThe initial...,2
6,Watchdog probes e-mail deletions\n\nThe inform...,2
10,Strike threat over pension plans\n\nMillions o...,2
17,Labour MP praises Tory campaign\n\nThe Conserv...,2
19,UK pledges £1bn to vaccine effort\n\nUK Chance...,2
25,Boateng to step down at election\n\nPaul Boate...,2
33,Muslims discuss election concerns\n\nIssues th...,2
34,"Nat Insurance to rise, say Tories\n\nNational ...",2
36,Parties' plans for council tax\n\nAnger at cou...,2
37,Hospital suspends 'no Welsh' plan\n\nAn Englis...,2


In [5]:
# Number of documents per label, to check how balanced the categories are.
df.label.value_counts()

3    511
0    510
2    417
4    401
1    386
Name: label, dtype: int64

In [6]:
# Turn the raw texts into a TF-IDF matrix, ignoring English stop words.
# fit_transform learns the vocabulary and builds the matrix in a single pass
# over the corpus instead of tokenising every document twice; `vec` stays
# fitted so later cells can still call vec.transform() on new text.
vec = TfidfVectorizer(stop_words="english")
features = vec.fit_transform(df.text.values)

In [7]:
# Dense view of the TF-IDF matrix; as the output shows, it is almost all zeros.
# NOTE(review): toarray() materialises every entry of the matrix — on a large
# corpus prefer inspecting a slice (e.g. features[:5].toarray()) instead.
features.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.02280016, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.02459015, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [8]:
# (number of documents, vocabulary size). Reading .shape avoids building two
# full dense copies of the matrix just to count its rows and columns.
features.shape

(2225, 29126)

In [10]:
# Cluster the TF-IDF vectors into 5 groups, matching the five distinct labels
# in the dataset. n_init is passed explicitly because scikit-learn changed the
# implicit default (3 -> 'auto') in version 1.4; pinning it keeps results
# stable across versions and avoids the FutureWarning on intermediate releases.
#
# NOTE(review): the saved output of this cell is
#   AttributeError: 'NoneType' object has no attribute 'split'
# That error is usually environmental (an outdated threadpoolctl next to a
# newer NumPy/BLAS build) rather than a problem with this code. Upgrading
# threadpoolctl (`%pip install -U threadpoolctl`) and restarting the kernel
# typically resolves it — confirm in your environment.
cls = MiniBatchKMeans(n_clusters=5, n_init=3, random_state=1000)
cls.fit(features)

AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
# Predict a cluster label for every row of a feature matrix. predict() also
# works on unseen documents; here it is applied to the same `features` the
# model was fitted on.
cls.predict(features)

In [None]:
# Cluster labels for the dataset used while training the model (this is
# the way to obtain labels for models that do not support prediction on
# a new dataset). Only the first 20 labels are shown.
cls.labels_[:20]

In [None]:
# Cluster centres learned by the fitted model, one row per cluster.
cls.cluster_centers_

In [None]:
# (number of clusters, number of features per centre).
cls.cluster_centers_.shape

In [None]:
# Reduce the TF-IDF features to 2D so the documents can be plotted.
# The sparse matrix is converted to a dense array before being given to PCA;
# random_state makes the projection repeatable.
# NOTE(review): at the shape reported earlier (2225 x 29126) the dense copy is
# large — consider TruncatedSVD on the sparse matrix if memory becomes a problem.
pca = PCA(n_components=2, random_state=100)
reduced_features = pca.fit_transform(features.toarray())
reduced_features

In [None]:
# (number of documents, number of principal components kept).
reduced_features.shape

In [None]:
# Reduce the cluster centres to 2D with the PCA already fitted on the
# document features, so that documents and centres share the same axes.
reduced_cluster_centers = pca.transform(cls.cluster_centers_)
reduced_cluster_centers

In [None]:
# Plot every document in the 2D PCA space, coloured by its predicted cluster,
# and overlay the projected cluster centres as blue crosses. An explicit
# figure/axes pair is used so the plot can be titled and labelled; the
# trailing semicolon keeps the artist repr out of the cell output.
fig, ax = plt.subplots()
ax.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cls.predict(features))
ax.scatter(reduced_cluster_centers[:, 0], reduced_cluster_centers[:, 1], marker='x', s=150, c='b')
ax.set(
    title="Documents in PCA space, coloured by cluster",
    xlabel="Principal component 1",
    ylabel="Principal component 2",
);


In [None]:
# Compare the predicted clusters with the true category labels.
# Homogeneity ranges from 0 to 1; it reaches 1 when every cluster contains
# documents from a single category only.
homogeneity_score(df.label, cls.predict(features))


In [None]:
# Which cluster does the one-word document "president" fall into?
cls.predict(vec.transform(['president']))

In [None]:
# Which cluster does the one-word document "music" fall into?
cls.predict(vec.transform(['music']))

In [None]:
# Which cluster does the one-word document "game" fall into?
cls.predict(vec.transform(['game']))

In [None]:
# Which cluster does the one-word document "festival" fall into?
cls.predict(vec.transform(['festival']))

In [None]:
# Which cluster does the one-word document "talks" fall into?
cls.predict(vec.transform(['talks']))