In [4]:
import glob
import os
from nltk.corpus import stopwords
import string
import math
import numpy as np
from nltk.tokenize import RegexpTokenizer
from scipy.spatial import distance
from nltk.stem import PorterStemmer
import random



porter_stemmer=PorterStemmer()
stop_words = set(stopwords.words('english'))

In [5]:

def getdatafromfile():
    classes=['athletics','cricket','football','rugby','tennis']
    list=[glob.glob('bbcsport/{}/*'.format(classes[i])) for i in range(len(classes))]
    return list,classes

def filesInEachClass(list):
    arr=[len(i) for i in list]
    print("count of total files in each class",arr)
    return arr

def trainCount(list, x=0.30):
    train_count=[len(i)-(int(len(i)*(x))) for i in list]
    print("count of train data in each class",train_count)
    return train_count


In [6]:
#create train word index

def createIndex(Train_documents):
    word_index={}
    for doc in range(len(Train_documents)):
        for word in Train_documents[doc]:
            if word not in word_index:
                word_index[word]={}
 
    return word_index


def preprocessData(list,classes,train_count):

#     Train_documents=[]
    
#     Test_documents=[]
#     Train_set=[]
    
    alldocDict={}

    for mainClass in range(len(list)): 

#         count=0
        for subFiles in range(len(list[mainClass])):
#             count+=1
    #         print(len(list[mainClass]))
    #         print(list[mainClass][subFiles])

            f=open(list[mainClass][subFiles],'r')
            tokenizer = RegexpTokenizer(r'\w+')

            # convert to lower case
            fullfile = tokenizer.tokenize(f.read().lower())

            # stem document
            stemmedDocs=[porter_stemmer.stem(word) for word in fullfile]

            #trimming the file name and removing redundant '.txt'
            p=os.path.basename(list[mainClass][subFiles])
            p=p.split('.')[0]

            # remove all tokens that are not alphabetic and stop words
            tokens_without_sw = [word for word in stemmedDocs if word not in stop_words and word.isalpha()]

            alldocDict[mainClass,int(p)]=tokens_without_sw

            # append into train and test set
#             if(count <= train_count[mainClass]):
#                 Train_documents.append((mainClass,int(p))) #
#                 Train_set.append(tokens_without_sw)
                
# #                 Train_label.append(mainClass)
#             else:
#                 Test_documents.append((mainClass,int(p)))
#                 Test_label.append(mainClass)



#     print("train documents",len(Train_documents))
#     print("test documents",len(Test_documents))
#     print("total", len(Train_documents)+len(Test_documents))
    
#     return Train_documents, Train_set ,Test_documents ,alldocDict
    return alldocDict


In [30]:
def GetTrainTestList(totalfiles,alldocDict,train_count,classes):
    Train_documents=[]
    Test_documents=[]
    Train_set=[]
    Train_Init=[]
    for classId in range(len(classes)):
        Train_Init.append(random.sample(range(1, totalfiles[classId]), train_count[classId]))
#         print(classId,len(Train_Init[classId]),Train_Init[classId])

    for classId in range(len(classes)):
        print(classId,"  -----> ")
        for i in Train_Init[classId]:
            Train_documents.append((classId,i))
            Train_set.append(alldocDict[(classId,i)])
        print(len(Train_documents),"-------------------------------------------------------")
        
    for class_id in alldocDict.keys():
        if(class_id not in Train_documents):
            Test_documents.append(class_id)
#             print(class_id)
            
#     print(len(Train_set))
    print(len(Test_documents))
    print(len(Train_documents))

    return Train_documents, Train_set, Test_documents

In [31]:
list,classes=getdatafromfile()
# noOfdata=filesInEachClass(list)
train_count=trainCount(list,0.30)
totalFiles=filesInEachClass(list)

count of train data in each class [71, 87, 186, 103, 70]
count of total files in each class [101, 124, 265, 147, 100]


In [32]:

# Train_documents, Train_set, Test_documents, 

alldocDict=preprocessData(list,classes,train_count)
Train_documents, Train_set, Test_documents=GetTrainTestList(totalFiles,alldocDict,train_count,classes)
# print("train Ids:\n",Train_documents)
# print("test Ids:\n", Test_documents)

# print(alldocDict[0,1])       # [0-->classID,1-->docID]

0   -----> 
71 -------------------------------------------------------
1   -----> 
158 -------------------------------------------------------
2   -----> 
344 -------------------------------------------------------
3   -----> 
447 -------------------------------------------------------
4   -----> 
517 -------------------------------------------------------
220
517


In [33]:
# calculate tfidf score

def tfidfCalculation(term_index, alldocDict,Train_documents):
    f = open("TFIDFdict_allDocs.txt","w")
    
    idf={}
    for word in term_index.keys():
        df=0
        for doc in alldocDict.keys():
            if word in alldocDict[doc]:
                df+=1
        idf[word]=math.log(len(Train_documents)/(df))
        

    for word in term_index.keys():
        for doc in alldocDict.keys():      
            if word in alldocDict[doc]:
    #             print('word found in ',doc)   
                term_index[word][doc]=((alldocDict[doc].count(word))*idf[word])  #calculate tfidf
            else:
                term_index[word][doc]=0
                
    f.write(str(term_index))
    f.close()
                
    return term_index

    

In [34]:
#Train_documents, Test_documents, tfidf_dict

def formVectors(Train_documents, Test_documents, tfidf_dict):

    #form train and test vectors
    trainDoc_vect={}
    testDoc_vect={}
    # for word in tfidf_dict.keys():

    f1 = open("TrainDocs_vector.txt","w")
    f2 = open("TestDocs_allDocs.txt","w")

    for docid in Train_documents:
        trainDoc_vect[docid]=[]
        for word in tfidf_dict.keys():
            trainDoc_vect[docid].append(tfidf_dict[word][docid])

    for docid in Test_documents:
        testDoc_vect[docid]=[]
        for word in tfidf_dict.keys():
            testDoc_vect[docid].append(tfidf_dict[word][docid])

    f1.write(str(trainDoc_vect))
    f1.close()
    f2.write(str(testDoc_vect))
    f2.close()
    
    return trainDoc_vect,testDoc_vect
    


In [35]:
word_index=createIndex(Train_set) # index of words in training set(train_set has all the words that are in training set)

In [36]:
tfidf_dict=tfidfCalculation(word_index, alldocDict,Train_documents)

In [37]:
trainVector,testVector=formVectors(Train_documents, Test_documents, tfidf_dict)

In [38]:
# print(len(testVector)+len(trainVector))

In [39]:
testsimilarity={}
for testid in testVector:
    testsimilarity[testid]={}
    for trainid in trainVector:
        testsimilarity[testid][trainid]=(distance.cosine(testVector[testid], trainVector[trainid]))
#         distance.euclidean(testVector[testid], trainVector[trainid])
        

In [40]:
correct=0
no=0
for i in testsimilarity:
    n1,n2,n3=sorted(testsimilarity[i], key=testsimilarity[i].get)[0:3]
    newList=[n1[0],n2[0],n3[0]]
    frequentNN=max(set(newList), key = newList.count)
    print(i[0],frequentNN)
    if(i[0]==frequentNN):
        correct+=1
    else:
        no+=1
  

0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 0
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 0
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 0
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 2
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4


In [41]:
print(no)
print((correct/(no+correct))*100)

4
98.18181818181819
