## Import models from scikit-learn and supporting libraries

In [1]:
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from collections import defaultdict

import numpy as np

# K-means clustering

In [2]:
def load_data(data_path,
              words_idfs_path="../datasets/20news-bydate/words_idfs.txt"):
    """
    Load tf-idf encoded documents and their labels from a text file.

    Each non-blank line of ``data_path`` must look like
    ``<label><fff><doc_id><fff><index>:<tfidf> <index>:<tfidf> ...``.
    The vocabulary size (the length of every returned vector) is the
    number of lines in ``words_idfs_path``.

    Args:
        data_path: path to the tf-idf data file.
        words_idfs_path: path to the vocabulary file, one word per line.
            Defaults to the location that was previously hard-coded.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is a list of dense 1-D numpy arrays
        (one per document) and ``Y`` is the list of integer labels.

    Raises:
        ValueError: if a label, index or tf-idf value cannot be parsed.
        IndexError: if a line has fewer than three ``<fff>`` fields or an
            index falls outside the vocabulary.
    """
    def sparse_to_dense(sparse_r_d, vocab_size):
        """
        Convert a sparse representation of a document by tf-idf to dense form
        by mapping its words to a new vector of vocabulary size.
        Words without any appearance in the doc get the value 0.0
        """
        r_d = np.zeros(vocab_size)
        for index_tf_idf in sparse_r_d.split():
            # Split each "index:value" token once instead of twice.
            parts = index_tf_idf.split(":")
            r_d[int(parts[0])] = float(parts[1])
        return r_d

    with open(data_path) as f:
        d_lines = f.read().splitlines()
    with open(words_idfs_path) as f:
        vocab_size = len(f.read().splitlines())

    X = []
    Y = []
    for d in d_lines:
        if not d.strip():
            # Tolerate stray blank lines instead of failing on int("").
            continue
        features = d.split("<fff>")
        # features[1] holds the document id, which is not needed here.
        label = int(features[0])
        X.append(sparse_to_dense(
            sparse_r_d=features[2], vocab_size=vocab_size))
        Y.append(label)
    return X, Y
        

In [3]:
# Load the dense tf-idf vectors (X) and integer labels (Y) from data_tf_idf.txt.
X, Y = load_data(data_path="../datasets/20news-bydate/data_tf_idf.txt")

In [4]:
def clustering_with_kmeans(X, Y, n_clusters=20, random_state=2023):
    """
    Cluster the documents in ``X`` with K-means and return cluster ids.

    Args:
        X: sequence of dense tf-idf vectors; converted to a CSR matrix
            before fitting.
        Y: labels; accepted so existing callers keep working, but not
            used, since the clustering is unsupervised.
        n_clusters: number of clusters to form (default 20, the value
            that was previously hard-coded).
        random_state: seed for the random centroid initialisation
            (default 2023, as before).

    Returns:
        The fitted model's ``labels_`` array: one cluster id per row of
        ``X``.
    """
    model = KMeans(
        n_clusters=n_clusters,
        init="random",  # random centroids rather than "k-means++"
        n_init=5,       # keep the best of five initialisations
        tol=1e-3,
        random_state=random_state,
    )
    model.fit(csr_matrix(X))
    return model.labels_

In [5]:
# Cluster all documents; Y is passed along but the function does not use it.
Y_pred = clustering_with_kmeans(X, Y)

In [6]:
def compute_accuracy(Y_pred, Y):
    """
    Return the fraction of positions where ``Y_pred`` equals ``Y``.

    Both arguments are coerced to numpy arrays first. Without that, two
    plain lists would be compared as whole objects (a single bool), so the
    result would not reflect the element-wise agreement.

    Args:
        Y_pred: predicted labels (list or array).
        Y: expected labels (list or array) with the same shape.

    Returns:
        ``matches / total`` as a float in ``[0.0, 1.0]``.

    Raises:
        ValueError: if the two inputs do not have the same shape.
        ZeroDivisionError: if the inputs are empty.
    """
    predicted = np.asarray(Y_pred)
    expected = np.asarray(Y)
    if predicted.shape != expected.shape:
        raise ValueError(
            "Y_pred and Y must have the same shape, got {} and {}".format(
                predicted.shape, expected.shape))
    matches = int(np.count_nonzero(predicted == expected))
    total = len(predicted)
    return matches / total

In [7]:
# NOTE(review): Y_pred holds K-means cluster ids, which are compared here
# directly against the class labels in Y. Cluster numbering is not tied to
# label numbering, so this value is presumably close to chance rather than
# a measure of clustering quality — confirm before relying on it.
compute_accuracy(Y_pred, Y)

0.10511514379709222

# Linear SVM

In [10]:
def classifying_with_linear_SVMs(
        train_path='../datasets/20news-bydate/train_tf_idf.txt',
        test_path='../datasets/20news-bydate/test_tf_idf.txt',
        C=10.0):
    """
    Train a linear SVM on tf-idf vectors and print its test accuracy.

    Args:
        train_path: tf-idf file used for training (default: the path that
            was previously hard-coded).
        test_path: tf-idf file used for evaluation (default: the path that
            was previously hard-coded).
        C: penalty coefficient passed to ``LinearSVC`` (default 10.0, as
            before).

    Returns:
        None. The accuracy is printed as ``Accuracy: <value>``.
    """
    X_train, Y_train = load_data(train_path)
    X_test, Y_test = load_data(test_path)
    classifier = LinearSVC(
        C=C,           # penalty coefficient
        tol=1e-3,      # tolerance for the stopping criterion
        verbose=True,  # print the solver log
    )
    # Fit on a sparse matrix: the dense rows are mostly zeros.
    classifier.fit(csr_matrix(X_train), Y_train)

    Y_pred = classifier.predict(X_test)
    accuracy = compute_accuracy(Y_pred, Y_test)
    print('Accuracy: {}'.format(accuracy))

In [11]:
# Train the linear SVM and print its accuracy on the test split.
classifying_with_linear_SVMs()

[LibLinear]Accuracy: 0.8267392458842273


# Kernel SVMs

In [17]:
def classifying_with_kernel_SVMs(
        train_path='../datasets/20news-bydate/train_tf_idf.txt',
        test_path='../datasets/20news-bydate/test_tf_idf.txt',
        C=50.0,
        gamma=0.1):
    """
    Train an RBF-kernel SVM on tf-idf vectors and print its test accuracy.

    Args:
        train_path: tf-idf file used for training (default: the path that
            was previously hard-coded).
        test_path: tf-idf file used for evaluation (default: the path that
            was previously hard-coded).
        C: penalty coefficient passed to ``SVC`` (default 50.0, as before).
        gamma: RBF kernel coefficient (default 0.1, as before).

    Returns:
        None. Prints ``Training process finished!`` after fitting and then
        ``Accuracy: <value>``.
    """
    X_train, Y_train = load_data(train_path)
    X_test, Y_test = load_data(test_path)
    classifier = SVC(
        C=C,           # penalty coefficient
        kernel="rbf",
        gamma=gamma,
        tol=1e-3,      # tolerance for the stopping criterion
        verbose=True,  # print the solver log
    )
    # Fit on a sparse matrix: the dense rows are mostly zeros.
    classifier.fit(csr_matrix(X_train), Y_train)
    print("Training process finished!")
    Y_pred = classifier.predict(X_test)
    accuracy = compute_accuracy(Y_pred, Y_test)
    print('Accuracy: {}'.format(accuracy))

In [18]:
# Train the RBF-kernel SVM and print its accuracy on the test split.
classifying_with_kernel_SVMs()

[LibSVM]Training process finished!
Accuracy: 0.8250132766861391
