In [330]:
import glob
import os
from nltk.corpus import stopwords
import json
import string
import math
import numpy as np
from array import array
from nltk.tokenize import RegexpTokenizer
from scipy.spatial import distance
from nltk.stem import PorterStemmer
import random
import tkinter as tk


# Shared Porter stemmer instance; KMeanClusteringClass.preprocessData() reads this
# module-level name when stemming each token.
porter_stemmer=PorterStemmer()
# English stop-word list from NLTK, held as a set so the per-token membership
# test in preprocessData() is a hash lookup.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded already — confirm.
stop_words = set(stopwords.words('english'))

## Function to get the files from the dataset directory

In [331]:
def getdatafromfile(root='bbcsport'):
    """Collect the document paths of every sport category under ``root``.

    Parameters
    ----------
    root : str, optional
        Directory that contains one sub-directory per category. Defaults to
        ``'bbcsport'`` (relative to the working directory).

    Returns
    -------
    tuple
        ``(paths, classes)`` where ``classes`` is the fixed list of category
        names and ``paths[i]`` is the sorted list of entries found in
        ``root/classes[i]``. A missing category directory yields an empty list.
    """
    classes = ['athletics', 'cricket', 'football', 'rugby', 'tennis']
    # Escape the root so glob metacharacters in the directory name are taken literally.
    safeRoot = glob.escape(root)
    # glob returns entries in arbitrary order; sorting makes the document order
    # (and everything derived from it) reproducible across runs and platforms.
    pathsPerClass = [
        sorted(glob.glob('{}/{}/*'.format(safeRoot, name)))
        for name in classes
    ]
    return pathsPerClass, classes

## K-Means clustering class

In [332]:
class KMeanClusteringClass:
    """K-means clustering of text documents using TF-IDF vectors and cosine distance.

    Typical call order: ``preprocessData()`` -> ``featureExtractionDf()`` ->
    ``calculatetfidfAndFormVectors()`` -> ``KmeanClustering(k)``.

    Attributes set along the way:
      * ``list`` / ``classes`` / ``totalfiles`` - constructor inputs and per-class file counts.
      * ``alldocDict`` - ``{(classIndex, docNumber): [token, ...]}``.
      * ``UniqueWord_corpus`` - set of every token seen.
      * ``corpus`` / ``idf`` - retained vocabulary and its IDF weights.
      * ``docVect`` - ``{(classIndex, docNumber): [tfidf, ...]}`` aligned with ``corpus``.
      * ``k`` / ``clusters`` / ``centroids`` - result of the last ``KmeanClustering`` run.
    """

    def __init__(self, list, classes):
        """Store the per-class path lists and print how many files each class has.

        ``list`` is a sequence of path lists (one per class); ``classes`` holds the
        matching class names. The parameter name shadows the builtin inside this
        method only and is kept for backward compatibility.
        """
        self.list = list
        self.classes = classes
        self.totalfiles = [len(paths) for paths in list]
        print("Total files in each class: ", self.totalfiles)

    def preprocessData(self):
        """Read, tokenise, stem and filter every file listed in ``self.list``.

        Populates ``self.alldocDict`` keyed by ``(classIndex, docNumber)`` where
        ``docNumber`` is the integer before the first '.' of the file name, and
        ``self.UniqueWord_corpus`` with the set of all retained tokens.

        Raises
        ------
        ValueError
            If a file name does not start with an integer (from ``int()``).
        """
        self.alldocDict = {}
        vocabulary = set()
        # One tokenizer is enough; it holds no per-document state here.
        tokenizer = RegexpTokenizer(r'\w+')

        for classIndex, paths in enumerate(self.list):
            for path in paths:
                # 'with' guarantees the handle is closed even if reading fails.
                with open(path, 'r') as handle:
                    rawText = handle.read()
                # lower-case, then split on runs of word characters
                words = tokenizer.tokenize(rawText.lower())
                stemmed = [porter_stemmer.stem(word) for word in words]
                # file name without directory and extension, e.g. '001'
                docNumber = os.path.basename(path).split('.')[0]
                # NOTE(review): stop words are filtered AFTER stemming, so a stop
                # word whose stem differs from its listed form would slip through.
                # Left unchanged to keep the vocabulary identical.
                tokens = [
                    word for word in stemmed
                    if word not in stop_words and word.isalpha()
                ]
                self.alldocDict[classIndex, int(docNumber)] = tokens
                vocabulary.update(tokens)

        self.UniqueWord_corpus = vocabulary

    def getAllDocs(self):
        """Return the ``{(classIndex, docNumber): tokens}`` mapping built by ``preprocessData``."""
        return self.alldocDict

    def featureExtractionDf(self, min_df=3):
        """Select the vocabulary by document frequency and compute IDF weights.

        A word is kept when it occurs in at least ``min_df`` documents (default 3,
        i.e. the original ``df > 2`` rule). ``idf[word]`` is
        ``round(log(N / df), 5)`` with ``N`` the number of documents in
        ``self.alldocDict``. Prints the size of the full vocabulary and of the
        retained vocabulary.
        """
        print(len(self.UniqueWord_corpus))
        totalDocs = len(self.alldocDict)

        # One pass over the documents instead of scanning every document per word.
        docFreq = {}
        for tokens in self.alldocDict.values():
            for word in set(tokens):
                docFreq[word] = docFreq.get(word, 0) + 1

        threshold = max(1, min_df)  # df == 0 would divide by zero below
        self.corpus = []
        self.idf = {}
        for word in self.UniqueWord_corpus:
            df = docFreq.get(word, 0)
            if df >= threshold:
                self.corpus.append(word)
                self.idf[word] = round(math.log(totalDocs / df), 5)

        print(len(self.corpus))

    def calculatetfidfAndFormVectors(self):
        """Compute TF-IDF weights and build one vector per document.

        Writes ``str()`` of the ``{word: {docId: weight}}`` table to
        ``TFIDF_Clustering_allDocs.txt`` and of ``self.docVect`` to
        ``DocVectors_Clustering.txt`` in the working directory. The weight is
        ``rawCount * idf[word]``, or the integer 0 when the word is absent.
        """
        # Term counts per document, computed once rather than via list.count per word.
        termCounts = {}
        for docId, tokens in self.alldocDict.items():
            counts = {}
            for word in tokens:
                counts[word] = counts.get(word, 0) + 1
            termCounts[docId] = counts

        tfidf = {}
        for word, idfValue in self.idf.items():
            row = {}
            for docId, counts in termCounts.items():
                occurrences = counts.get(word, 0)
                row[docId] = occurrences * idfValue if occurrences else 0
            tfidf[word] = row

        with open("TFIDF_Clustering_allDocs.txt", "w") as out:
            out.write(str(tfidf))
        print("tfidf calculated")

        self.docVect = {
            docId: [tfidf[word][docId] for word in self.corpus]
            for docId in self.alldocDict
        }

        with open("DocVectors_Clustering.txt", "w") as out:
            out.write(str(self.docVect))

    def getInitialCentroid(self, k=5, seed=None):
        """Pick ``k`` distinct documents at random and return their vectors as seeds.

        Parameters
        ----------
        k : int
            Number of centroids to draw.
        seed : optional
            When given, a private ``random.Random(seed)`` is used so the draw is
            reproducible; otherwise the module-level generator is used.

        Raises
        ------
        ValueError
            From ``random.sample`` when ``k`` exceeds the number of documents.
        """
        rng = random.Random(seed) if seed is not None else random
        # random.sample needs a sequence (dict views are rejected on Python 3.11+).
        # tuple() is used instead of list() because calling notebooks are known to
        # rebind the name `list`.
        docIds = tuple(self.docVect)
        cent = rng.sample(docIds, k)
        print("initial seed-->", cent)
        return [self.docVect[docId] for docId in cent]

    def purity(self, cluster):
        """Print per-cluster label statistics and return the clustering purity.

        ``cluster`` is a list of clusters, each a list of ``(classIndex, docNumber)``
        ids. Purity is the sum of each cluster's majority-label count divided by
        the number of clustered documents; empty clusters contribute 0, and an
        entirely empty clustering yields 0.0.
        """
        majorityCounts = []
        totalDocs = 0
        for index, members in enumerate(cluster):
            print("----------------", index, "----------------")
            labels = [docId[0] for docId in members]
            print(labels)
            totalDocs += len(labels)

            if not labels:
                # max() on an empty set would raise; an empty cluster has no majority.
                majorityCounts.append(0)
                continue

            frequentLabel = max(set(labels), key=labels.count)
            cnt = labels.count(frequentLabel)
            majorityCounts.append(cnt)
            print("frequent class=", frequentLabel, ",count=", cnt)

        s = sum(majorityCounts)
        print("sum=", s)
        p = s / totalDocs if totalDocs else 0.0
        print("purity", p)
        return p

    def formCluster(self, cent, cluster=None):
        """Assign every document to its nearest centroid by cosine distance.

        ``cent`` is the list of centroid vectors; the result has one list of
        document ids per centroid, in the same order. ``cluster`` is accepted for
        backward compatibility and ignored - assignments are rebuilt from scratch.
        """
        assignments = [[] for _ in cent]
        for docId, vector in self.docVect.items():
            dist = [distance.cosine(vector, centroidVector) for centroidVector in cent]
            assignments[dist.index(min(dist))].append(docId)
        return assignments

    def KmeanCentroid(self, cluster, previous=None):
        """Return the mean vector of each cluster.

        Parameters
        ----------
        cluster : list of lists of document ids
        previous : list of vectors, optional
            Centroids from the preceding iteration. An empty cluster keeps
            ``previous[i]`` because the mean of zero vectors is undefined.

        Raises
        ------
        ValueError
            If a cluster is empty and ``previous`` was not supplied.
        """
        centroid = []
        for index, members in enumerate(cluster):
            if not members:
                if previous is None:
                    raise ValueError(
                        "cluster {} is empty and no previous centroid was supplied".format(index)
                    )
                centroid.append(previous[index])
                continue
            vectors = np.array([self.docVect[docId] for docId in members])
            centroid.append(np.mean(vectors, axis=0).tolist())
        return centroid

    def KmeanClustering(self, k, maxIterations=30, seed=None):
        """Run k-means until the centroids stop changing or ``maxIterations`` is reached.

        Stores the final assignment in ``self.clusters`` and the final centroids in
        ``self.centroids``, prints progress for each iteration, and returns the
        purity of the final clustering (also when the iteration cap is hit).
        """
        self.k = k
        centroid = self.getInitialCentroid(k, seed)
        # Independent lists - `[[]] * k` would alias one list k times.
        cluster = [[] for _ in range(k)]

        converged = False
        for iteration in range(1, maxIterations + 1):
            print("Iteration", iteration)
            cluster = self.formCluster(centroid, cluster)
            newCentroid = self.KmeanCentroid(cluster, centroid)

            # Centroid i always descends from centroid i, so a positional
            # comparison is sufficient (no sorting needed).
            if centroid == newCentroid:
                print("same centroids found!")
                converged = True
                break

            print("centroids not same!")
            centroid = newCentroid

        if not converged:
            print("stopped after", maxIterations, "iterations without convergence")

        self.clusters = cluster
        self.centroids = centroid
        return self.purity(cluster)
        
    

## Get class-wise documents

In [233]:
# Load the per-class path lists and the matching class names.
# NOTE(review): this assignment rebinds the builtin name `list` for the rest of
# the kernel session, so any later `list(...)` call fails with TypeError.
# Renaming it requires updating every cell that reads this variable (e.g. the
# KMeanClusteringClass(list, classes) call) in the same change.
list,classes=getdatafromfile()

## Initialize the class and perform pre-processing and feature extraction

In [234]:
# Build the clustering object; the constructor prints the number of files per class.
clf=KMeanClusteringClass(list,classes)
# Tokenise, stem and filter every document (reads each file from disk).
clf.preprocessData()
# Select the vocabulary by document frequency; prints the full vocabulary size
# followed by the number of retained words.
clf.featureExtractionDf()

Total files in each class:  [101, 124, 265, 147, 100]
9130
4336


## Form document vectors on the basis of TF-IDF

In [235]:
# Compute TF-IDF weights and per-document vectors. Side effect: writes
# TFIDF_Clustering_allDocs.txt and DocVectors_Clustering.txt to the working directory.
clf.calculatetfidfAndFormVectors()

tfidf calculated


## Perform K-Means clustering

In [237]:
# k=5 matches the five categories listed in getdatafromfile().
# The initial centroids are drawn at random, so the output differs between runs.
clf.KmeanClustering(5)

initial seed--> [(1, 25), (2, 58), (4, 27), (2, 257), (4, 81)]
Iteration 1
centroids not same!
Iteration 2
centroids not same!
Iteration 3
centroids not same!
Iteration 4
centroids not same!
Iteration 5
centroids not same!
Iteration 6
centroids not same!
Iteration 7
centroids not same!
Iteration 8
centroids not same!
Iteration 9
centroids not same!
Iteration 10
centroids not same!
Iteration 11
centroids not same!
Iteration 12
centroids not same!
Iteration 13
centroids not same!
Iteration 14
centroids not same!
Iteration 15
centroids not same!
Iteration 16
centroids not same!
Iteration 17
same centroids found!
---------------- 0 ----------------
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1