In [34]:
import glob
import os
from nltk.corpus import stopwords
import string
import math
import numpy as np
from nltk.tokenize import RegexpTokenizer
from scipy.spatial import distance
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 


porter_stemmer=PorterStemmer()
stop_words = set(stopwords.words('english'))


In [2]:

def getdatafromfile():
    classes=['athletics','cricket','football','rugby','tennis']
    list=[glob.glob('bbcsport/{}/*'.format(classes[i])) for i in range(len(classes))]
    return list,classes

def filesInEachClass(list):
    arr=[len(i) for i in list]
    print("count of total files in each class",arr)
    return arr

def trainCount(list, x=0.30):
    train_count=[len(i)-(int(len(i)*(x))) for i in list]
    print("count of train data in each class",train_count)
    return train_count


In [3]:
def preprocessData(list,classes,train_count):

    Train_documents=[]
    
    Test_documents=[]
    Train_set=[]
    
    alldocDict={}

    for mainClass in range(len(list)): 

        count=0
        for subFiles in range(len(list[mainClass])):
            count+=1
    #         print(len(list[mainClass]))
    #         print(list[mainClass][subFiles])

            f=open(list[mainClass][subFiles],'r')
            tokenizer = RegexpTokenizer(r'\w+')

            # convert to lower case
            fullfile = tokenizer.tokenize(f.read().lower())

            # stem document
            stemmedDocs=[porter_stemmer.stem(word) for word in fullfile]

            #trimming the file name and removing redundant '.txt'
            p=os.path.basename(list[mainClass][subFiles])
            p=p.split('.')[0]

            # remove all tokens that are not alphabetic and stop words
            tokens_without_sw = [word for word in stemmedDocs if word not in stop_words and word.isalpha()]

            alldocDict[mainClass,int(p)]=tokens_without_sw

            # append into train and test set
            if(count <= train_count[mainClass]):
                Train_documents.append((mainClass,int(p))) #
                Train_set.append(tokens_without_sw)
                
#                 Train_label.append(mainClass)
            else:
                Test_documents.append((mainClass,int(p)))
#                 Test_label.append(mainClass)



    print("train documents",len(Train_documents))
    print("test documents",len(Test_documents))
    print("total", len(Train_documents)+len(Test_documents))
    
    return Train_documents, Train_set ,Test_documents ,alldocDict


In [4]:
# calculate tfidf score

def tfidfCalculation(term_index, alldocDict,Train_documents):
    f = open("TFIDFdict_allDocs.txt","w")
    
    idf={}
    for word in term_index.keys():
        df=0
        for doc in alldocDict.keys():
            if word in alldocDict[doc]:
                df+=1
        idf[word]=math.log(len(Train_documents)/(df))
        

    for word in term_index.keys():
        for doc in alldocDict.keys():      
            if word in alldocDict[doc]:
    #             print('word found in ',doc)   
                term_index[word][doc]=((alldocDict[doc].count(word))*idf[word])  #calculate tfidf
            else:
                term_index[word][doc]=0
                
    f.write(str(term_index))
    f.close()
                
    return term_index

    

In [5]:
#Train_documents, Test_documents, tfidf_dict

def formVectors(Train_documents, Test_documents, tfidf_dict):

    #form train and test vectors
    trainDoc_vect={}
    testDoc_vect={}
    # for word in tfidf_dict.keys():

    f1 = open("TrainDocs_vector.txt","w")
    f2 = open("TestDocs_allDocs.txt","w")

    for docid in Train_documents:
        trainDoc_vect[docid]=[]
        for word in tfidf_dict.keys():
            trainDoc_vect[docid].append(tfidf_dict[word][docid])

    for docid in Test_documents:
        testDoc_vect[docid]=[]
        for word in tfidf_dict.keys():
            testDoc_vect[docid].append(tfidf_dict[word][docid])

    f1.write(str(trainDoc_vect))
    f1.close()
    f2.write(str(testDoc_vect))
    f2.close()
    
    return trainDoc_vect,testDoc_vect
    


In [6]:
#create train word index

def createIndex(Train_documents):
    word_index={}
    for doc in range(len(Train_documents)):
        for word in Train_documents[doc]:
            if word not in word_index:
                word_index[word]={}
 
    return word_index

In [7]:
list,classes=getdatafromfile()
# noOfdata=filesInEachClass(list)
train_count=trainCount(list,0.30)

count of train data in each class [71, 87, 186, 103, 70]


In [8]:

Train_documents, Train_set, Test_documents, alldocDict=preprocessData(list,classes,train_count)
# print("train Ids:\n",Train_documents)
# print("test Ids:\n", Test_documents)

# print(alldocDict[0,1])       # [0-->classID,1-->docID]

train documents 517
test documents 220
total 737


In [10]:
word_index=createIndex(Train_set) # index of words in training set(train_set has all the words that are in training set)

In [11]:
tfidf_dict=tfidfCalculation(word_index, alldocDict,Train_documents)

In [16]:
trainVector,testVector=formVectors(Train_documents, Test_documents, tfidf_dict)

In [45]:
# print(len(testVector)+len(trainVector))

In [39]:
testsimilarity={}
for testid in testVector:
    testsimilarity[testid]={}
    for trainid in trainVector:
        testsimilarity[testid][trainid]=distance.euclidean(testVector[testid], trainVector[trainid])
        

(0, 72)
(0, 73)
(0, 74)
(0, 75)
(0, 76)
(0, 77)
(0, 78)
(0, 79)
(0, 80)
(0, 81)
(0, 82)
(0, 83)
(0, 84)
(0, 85)
(0, 86)
(0, 87)
(0, 88)
(0, 89)
(0, 90)
(0, 91)
(0, 92)
(0, 93)
(0, 94)
(0, 95)
(0, 96)
(0, 97)
(0, 98)
(0, 99)
(0, 100)
(0, 101)
(1, 88)
(1, 89)
(1, 90)
(1, 91)
(1, 92)
(1, 93)
(1, 94)
(1, 95)
(1, 96)
(1, 97)
(1, 98)
(1, 99)
(1, 100)
(1, 101)
(1, 102)
(1, 103)
(1, 104)
(1, 105)
(1, 106)
(1, 107)
(1, 108)
(1, 109)
(1, 110)
(1, 111)
(1, 112)
(1, 113)
(1, 114)
(1, 115)
(1, 116)
(1, 117)
(1, 118)
(1, 119)
(1, 120)
(1, 121)
(1, 122)
(1, 123)
(1, 124)
(2, 187)
(2, 188)
(2, 189)
(2, 190)
(2, 191)
(2, 192)
(2, 193)
(2, 194)
(2, 195)
(2, 196)
(2, 197)
(2, 198)
(2, 199)
(2, 200)
(2, 201)
(2, 202)
(2, 203)
(2, 204)
(2, 205)
(2, 206)
(2, 207)
(2, 208)
(2, 209)
(2, 210)
(2, 211)
(2, 212)
(2, 213)
(2, 214)
(2, 215)
(2, 216)
(2, 217)
(2, 218)
(2, 219)
(2, 220)
(2, 221)
(2, 222)
(2, 223)
(2, 224)
(2, 225)
(2, 226)
(2, 227)
(2, 228)
(2, 229)
(2, 230)
(2, 231)
(2, 232)
(2, 233)
(2, 234)
(2, 2

In [58]:
# print("3NN of doc (4,100)--> ",sorted(testsimilarity[(4,100)], key=testsimilarity[(4,100)].get)[0:3])
# # n1,n2,n3=sorted(testsimilarity[(4,100)], key=testsimilarity[(4,100)].get)[0:3]
# # newList=[n1[0],n2[0],n3[0]]
# # frequentNN=max(set(newList), key = newList.count)
# # print(newList,frequentNN)

correct=0
no=0
for i in testsimilarity:
    n1,n2,n3=sorted(testsimilarity[i], key=testsimilarity[i].get)[0:3]
    newList=[n1[0],n2[0],n3[0]]
    frequentNN=max(set(newList), key = newList.count)
    print(i[0],frequentNN)
    if(i[0]==frequentNN):
        correct+=1
    else:
        no+=1
    

    

3NN of doc (4,100)-->  [(4, 51), (4, 47), (2, 4)]
0 0
0 0
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
0 2
0 2
0 0
0 0
0 0
0 2
0 0
0 0
0 0
0 0
1 1
1 2
1 1
1 2
1 2
1 2
1 0
1 2
1 0
1 2
1 1
1 0
1 0
1 2
1 1
1 0
1 2
1 2
1 2
1 2
1 2
1 0
1 1
1 0
1 2
1 1
1 1
1 0
1 2
1 2
1 2
1 2
1 0
1 0
1 2
1 2
1 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 0
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
3 0
3 2
3 2
3 0
3 2
3 0
3 0
3 2
3 2
3 2
3 2
3 2
3 0
3 0
3 2
3 0
3 2
3 0
3 3
3 3
3 2
3 2
3 2
3 0
3 3
3 2
3 3
3 3
3 3
3 3
3 3
3 0
3 2
3 3
3 2
3 3
3 3
3 3
3 3
3 3
3 2
3 2
3 2
3 2
4 4
4 0
4 0
4 0
4 4
4 4
4 0
4 4
4 0
4 4
4 2
4 4
4 4
4 2
4 2
4 0
4 4
4 2
4 4
4 4
4 4
4 2
4 4
4 2
4 4
4 4
4 2
4 4
4 2
4 4


In [62]:
print(no)
print(correct/(no+correct))

81
0.6318181818181818
