## Imports and Declerations:

In [32]:
import pickle
import os
import string
import pandas as pd
import re
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import clear_output
import numpy as np
from copy import deepcopy
import random
from nltk.stem import PorterStemmer
from collections import Counter
import joblib
InteractiveShell.ast_node_interactivity = "all"

## Helper Functions:

In [2]:
# Remove Punctuation
def remove_punctuation(word):
    return word.translate(word.maketrans('', '', string.punctuation))


# Clean Query Term
def clean_word(word):
    # Case Folding
    ps = PorterStemmer()
    word = word.lower()
    # Filter non-ASCII characters
    word = ''.join(filter(lambda x: x in printable, word))
    #     print(word)
    # Remove Punctuations
    if word != '(' and word != ')':
        word = remove_punctuation(word)
#     print(word)
    if re.match('\d+[A-Za-z]+', word):
        word = re.split('\d+', word)[1]
    if re.match('[A-Za-z]+\d+', word):
        word = re.split('\d+', word)[0]


#     print(word)
    word = ps.stem(word)
    #     print(word)
    return word

In [3]:
DOCUMENTS_PATH = ('data', 'bbcsport')
STOPWORD_PATH = ('Stopword-List.txt')

In [131]:
class DocToVec(object):
    def __init__(self, DOCUMENTS_PATH, STOP_WORD_PATH):
        self.doc_index = {}
        self.documents_path = DOCUMENTS_PATH
        self.stop_word_path = STOPWORD_PATH
        self.stop_words = self.load_stop_words()
        self.vocab_index = self.file_extraction_wrapper(extract_vocab=True)
        self.vectors = self.file_extraction_wrapper(extract_vectors=True)
        self.X = self.vectors[0]
        self.y = self.vectors[1]

    def file_extraction_wrapper(self,
                                extract_vocab=False,
                                extract_vectors=False):
        vocab = set()
        docs = {}
        printable = set(string.printable)
        raw_data = []
        if extract_vectors:
            X = []
            y = []
        doc_count = 0
        # Printable characters are
        # 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
        # !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c
        ps = PorterStemmer()
        classes = next(os.walk(os.path.join(self.documents_path)))[1]
        for c in classes:
            docs_in_c = next(os.walk(os.path.join(self.documents_path, c)))[2]
            print(c)
            for doc in docs_in_c:
                #             print(f'\t{doc}',end=',')
                if extract_vectors:
                    doc_vector = np.zeros((len(self.vocab_index)))
                with open(os.path.join(self.documents_path, c, doc),
                          'r') as file1:
                    lines = file1.readlines()
                    if extract_vectors:
                        doc_name = os.path.join(self.documents_path, c, doc)
                        self.doc_index[doc_count] = os.path.join(c, doc)
                        doc_count+=1
                    
                    for line_no, line in enumerate(lines):

#                         symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
#                         for i in symbols:
#                             line = line.replace(i, ' ')
                        for word in re.split('[.\s,?!:;-]', line):

                            # Case Folding
                            word = word.lower()

                            # Filter non-ASCII characters
                            word = ''.join(
                                filter(lambda x: x in printable, word))

                            if word in self.stop_words:
                                continue

                            # Remove Punctuations
                            word = remove_punctuation(word)

                            if re.match('\d+[A-Za-z]+', word):
                                word = re.split('\d+', word)[1]
                            if re.match('[A-Za-z]+\d+', word):
                                word = re.split('\d+', word)[0]

                            if len(word) == 0 or len(
                                    word) == 1 or word == '' or word == ' ':
                                continue

                            word = ps.stem(word)

                            if extract_vocab:
                                vocab.add(word)
                            if extract_vectors:
                                doc_vector[self.vocab_index[word]] += 1

                    if extract_vectors:
                        X.append(doc_vector)
                        y.append(c)
        if extract_vocab:
            print(f'Vocab Size : {len(vocab)}')
            vocab_list = sorted(list(vocab))
            vocab_hash = dict.fromkeys(vocab_list, 0)
            vocab_index = {
                word: index
                for index, word in enumerate(vocab_list)
            }
            return vocab_index

        if extract_vectors:
            return (X, y)

    def load_stop_words(self):
        stop_words = set()
        with open(self.stop_word_path, 'r') as stop_word_file:
            lines = stop_word_file.readlines()
            for line in lines:
                stop_words.add(line.split('\n')[0])
        return stop_words

In [132]:
dv = DocToVec(DOCUMENTS_PATH=os.path.join(*DOCUMENTS_PATH),
              STOP_WORD_PATH=os.path.join(*STOPWORD_PATH))

athletics
cricket
football
rugby
tennis
Vocab Size : 9585
athletics
cricket
football
rugby
tennis


In [133]:
# Save Vectors
vectors_file_name = 'VectorSpace'
pickle.dump(dv, open(vectors_file_name , 'wb'))

In [134]:
# Get Tf Feature Rows
data = pd.DataFrame(dv.X)
data.shape
# Feature Selection
# Drop Features with Df < 3
data.drop([
    col for col, val in pd.DataFrame(dv.X).sum().iteritems() if int(val) <= 3
],axis=1,inplace=True)
data.shape

(737, 9585)

(737, 4226)

In [135]:
# Tf-Idf Calculations
data.mul(data.sum().apply(lambda df: np.log10(data.shape[0] / (df + 1))),
         axis=1)

Unnamed: 0,1,10,16,22,23,24,30,33,34,35,...,9563,9564,9568,9572,9573,9575,9577,9579,9583,9584
0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.000000,0.0,2.089316,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train Test Split:

In [139]:
data['label'] = dv.y
shuffled_data = data

shuffled_data.sample(frac=1)
train_size = 0.8
test_size = 0.2

train_data, test_data = shuffled_data.sample(frac=train_size), shuffled_data.sample(frac=test_size)
X_train, y_train = train_data.loc[:, train_data.columns != 'label'], train_data['label']
X_test, y_test = test_data.loc[:, test_data.columns != 'label'], test_data['label']

Unnamed: 0,1,10,16,22,23,24,30,33,34,35,...,9564,9568,9572,9573,9575,9577,9579,9583,9584,label
448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,football
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,athletics
22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,athletics
624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rugby
536,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rugby
158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cricket
507,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rugby
345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,football
289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,football
104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cricket


## Distance Formula:

In [140]:
def euclidian_distance(p1, p2):
    return np.linalg.norm(np.array(p2) - np.array(p1))


def cosine_similarity(p1, p2):
    return ((np.dot(p1, p2)) / (np.linalg.norm(p1) * np.linalg.norm(p2)))

In [141]:
def accuracy(y_test, pred):
    return len([1 for p, y in zip(pred, y_test) if p == y]) / len(pred) * 100

## k-nearest neighbors (KNN):

In [142]:
class KNNClassifier():
    def __init__(self, neighbors=3, distance_formula=euclidian_distance):
        self.distance_formula = distance_formula
        self.neighbors = neighbors
        self.X_train = []
        self.y_train = []

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X_test):
        pred = []
        for index, test_row in X_test.iterrows():
            print(index)
            clear_output(wait=True)
            if self.distance_formula == euclidian_distance:
                pred.append(
                    self.X_train.apply(
                        (lambda row: self.distance_formula(row, test_row)),
                        axis=1).sort_values(ascending=True))
            else:
                pred.append(
                    self.X_train.apply(
                        (lambda row: self.distance_formula(row, test_row)),
                        axis=1).sort_values(ascending=False))

        new_pred = [x[:self.neighbors] for x in pred]
        label_pred = []
        for indexes in new_pred:
            labels = []
            #     print(indexes)
            for index, value in indexes.items():
                #         print(index)
                #         print(y[index])
                labels.append(self.y_train[index])
            label_pred.append(Counter(labels).most_common(1)[0][0])
        return label_pred

In [143]:
knn = KNNClassifier(distance_formula=cosine_similarity)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

527


In [144]:
accuracy(y_test, pred)

99.31972789115646

In [145]:
knn = KNNClassifier(distance_formula=euclidian_distance)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

527


In [146]:
accuracy(y_test, pred)

93.19727891156462

## K-Means Clustering:

In [155]:
# PreProcessing For Kmeans
clustering_data = data
clustering_data['file_names'] = dv.doc_index.values()
clustering_data = clustering_data.sample(frac = 1)
clustering_data_labels, clustering_data_file_names = clustering_data['label'], clustering_data['file_names']
clustering_data = clustering_data.drop(['label', 'file_names'], axis = 1)
clustering_data_input = clustering_data

In [156]:
clustering_data

Unnamed: 0,1,10,16,22,23,24,30,33,34,35,...,9563,9564,9568,9572,9573,9575,9577,9579,9583,9584
694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
from collections import Counter


class KMeansCluster():
    def __init__(self, n_clusters=2, distance_formula=euclidian_distance):
        self.num_clusters = n_clusters
        self.n_clusters = n_clusters
        self.max_iterations = 100
        self.distance_formula = distance_formula

    def get_labels(self, X_train, centroids):
        a1 = pd.DataFrame(centroids).apply(lambda center: X_train.apply(
            (lambda row: self.distance_formula(row, center)), axis=1),
                                           axis=1)
        a2 = []
        print(a1.shape)
        sort_ascending = self.distance_formula == euclidian_distance
        for col in a1:
            if sort_ascending:
#                 print(a1[col].sort_values(ascending=True).index[:5])
                a2.append(a1[col].sort_values(ascending=True).index[0])
            else:
                a2.append(a1[col].sort_values(ascending=False).index[0])
        return a2


#     def nearest_centeroid(self, x, centroids):
#             return pd.DataFrame(centroids).apply(lambda center: euclidian_distance(x, center), axis=1).sort_values(ascending=True).index[0]

    def fit(self, X_train):
        self.X_train = X_train

        self.centroids = []
        for c in range(0, self.num_clusters):
            self.centroids.append(
                pd.DataFrame(X_train.sample(self.n_clusters).values).apply(
                    sum, axis=0) / self.n_clusters)

        def new_centroids(X, labels):

            centroids = []
            for c in range(0, self.num_clusters):
                clustered_rows = X.iloc[[
                    x[0] for x in enumerate(labels) if x[1] == c
                ]]
                if len(clustered_rows.index) == 0:
                    print("NULL")
                    centroids.append(
                        pd.DataFrame(X.sample(self.n_clusters).values).apply(
                            sum, axis=0) / self.n_clusters)
                else:
                    centroids.append(
                        clustered_rows.apply(sum) / len(clustered_rows.index))
                    centroids[c].index = range(0, len(X.columns))
            return centroids

        iterations = 0
        old_centroids = self.centroids
        while iterations < self.max_iterations:
            clear_output(wait=True)
            print(iterations)
            old_centroids = self.centroids
            labels = self.get_labels(self.X_train, self.centroids)
            self.centroids = new_centroids(self.X_train, labels)

            converged = True
            for c in range(0, len(self.centroids)):
                if not self.centroids[c].equals(old_centroids[c]):
                    converged = False
                    break
            if converged:
                break

            iterations += 1
        self.labels = labels
        self.cluster_centers_ = self.centroids

    def predict(self, X_test):
        #         return  X_test.apply(lambda x : self.nearest_centeroid(x, self.centroids), axis= 1)
        return np.array(self.get_labels(X_test, self.centroids))

    def purity(self, labels):
        total = 0
        for c in range(0, self.n_clusters):
            la = labels.iloc[[
                x[0] for x in enumerate(self.labels) if x[1] == c
            ]]
            if len(la) != 0:
                print(f'Cluster {c} size : {len(la)}')
                print(f'Cluster {c} Most Common Label : {Counter(la).most_common(1)[0][0]}')
                print(f'Cluster {c} Most Common Label Count : {Counter(la).most_common(1)[0][1]}')
                print(Counter(la).most_common(1)[0][1] / len(la))
                total += Counter(la).most_common(1)[0][1]

        purity = total / self.X_train.shape[0]
        print()
        print(f'Purity : {purity}')
        return purity

In [158]:
%%time
kmeans = KMeansCluster(n_clusters=5, distance_formula=cosine_similarity)
kmeans.fit(clustering_data)

14
(5, 737)
Wall time: 29.4 s


In [159]:
kmeans.purity(clustering_data_labels)

Cluster 0 size : 268
Cluster 0 Most Common Label : football
Cluster 0 Most Common Label Count : 248
0.9253731343283582
Cluster 1 size : 116
Cluster 1 Most Common Label : cricket
Cluster 1 Most Common Label Count : 115
0.9913793103448276
Cluster 2 size : 104
Cluster 2 Most Common Label : tennis
Cluster 2 Most Common Label Count : 97
0.9326923076923077
Cluster 3 size : 99
Cluster 3 Most Common Label : athletics
Cluster 3 Most Common Label Count : 99
1.0
Cluster 4 size : 150
Cluster 4 Most Common Label : rugby
Cluster 4 Most Common Label Count : 135
0.9

Purity : 0.9416553595658074


0.9416553595658074

In [167]:
# Documents in cluster
for c in range(0, kmeans.n_clusters):
    print(f'Documents in Cluster {c}')
    print(clustering_data_file_names.iloc[[x[0] for x in enumerate(kmeans.labels) if x[1] == c ]])

Documents in Cluster 0
272     football\048.txt
434     football\210.txt
99     athletics\100.txt
484     football\260.txt
296     football\072.txt
393     football\169.txt
412     football\188.txt
603        rugby\114.txt
485     football\261.txt
406     football\182.txt
351     football\127.txt
252     football\028.txt
274     football\050.txt
472     football\248.txt
288     football\064.txt
439     football\215.txt
480     football\256.txt
362     football\138.txt
285     football\061.txt
277     football\053.txt
614        rugby\125.txt
375     football\151.txt
426     football\202.txt
347     football\123.txt
469     football\245.txt
287     football\063.txt
438     football\214.txt
457     football\233.txt
283     football\059.txt
353     football\129.txt
             ...        
247     football\023.txt
306     football\082.txt
389     football\165.txt
462     football\238.txt
427     football\203.txt
452     football\228.txt
605        rugby\116.txt
373     football\149.txt
38

## Saving Trained Models:

In [168]:
# Note : KNN has no trainng step
knn_file_name = 'KNN.sav'
kmeans_file_name = 'KMeans.sav'
joblib.dump(knn, knn_file_name)
joblib.dump(kmeans, kmeans_file_name)

['KNN.sav']

['KMeans.sav']