## Loading 20NG Dataset

In [1]:
import pandas as pd
from scipy.sparse import lil_matrix
import re

# Path to the ARFF file
arff_file_path = r'20NG-F.arff'

# Read the ARFF file
with open(arff_file_path, 'r') as f:
    lines = f.readlines()

# Initialize variables to store the attributes and data
attributes = []
data_lines = []

# Flag to indicate when data section starts
data_section = False

# Process each line to extract attributes and data
for line in lines:
    line = line.strip()
    if line.startswith('@attribute'):
        attr_name = line.split()[1]
        attributes.append(attr_name)
    elif line.startswith('@data'):
        data_section = True
    elif data_section:
        data_lines.append(line)

# Find the total number of attributes
num_attributes = len(attributes)

# Create a sparse matrix
sparse_matrix = lil_matrix((len(data_lines), num_attributes))

# Regular expression to parse the sparse format
sparse_pattern = re.compile(r'(\d+)\s+(\d+)')

# Fill the sparse matrix with data
for i, row in enumerate(data_lines):
    matches = sparse_pattern.findall(row)
    for index, value in matches:
        sparse_matrix[i, int(index)] = int(value)

# Convert the sparse matrix to a dense DataFrame
df1 = pd.DataFrame.sparse.from_spmatrix(sparse_matrix)

# Set the column names
df1.columns = attributes

# Print the DataFrame
print(df1)


       comp.os_ms_windows_misc  religion.rmisc  rec.sport.baseball  sci.space  \
0                            0               0                   0          0   
1                            0               0                   0          0   
2                            0               0                   0          0   
3                            0               0                   0          0   
4                            0               0                 1.0          0   
...                        ...             ...                 ...        ...   
19296                        0               0                   0          0   
19297                        0               0                   0          0   
19298                        0               0                   0          0   
19299                        0               0                   0          0   
19300                        0               0                   0          0   

       comp.sys.mac_hardwar

## Loading ENRON Dataset

In [2]:
import pandas as pd
from scipy.sparse import lil_matrix
import re

# Path to the ARFF file
arff_file_path = r'ENRON-F.arff'

# Read the ARFF file
with open(arff_file_path, 'r') as f:
    lines = f.readlines()

# Initialize variables to store the attributes and data
attributes = []
data_lines = []

# Flag to indicate when data section starts
data_section = False

# Process each line to extract attributes and data
for line in lines:
    line = line.strip()
    if line.startswith('@attribute'):
        attr_name = line.split()[1]
        attributes.append(attr_name)
    elif line.startswith('@data'):
        data_section = True
    elif data_section:
        data_lines.append(line)

# Find the total number of attributes
num_attributes = len(attributes)

# Create a sparse matrix
sparse_matrix = lil_matrix((len(data_lines), num_attributes))

# Regular expression to parse the sparse format
sparse_pattern = re.compile(r'(\d+)\s+(\d+)')

# Fill the sparse matrix with data
for i, row in enumerate(data_lines):
    matches = sparse_pattern.findall(row)
    for index, value in matches:
        sparse_matrix[i, int(index)] = int(value)

# Convert the sparse matrix to a dense DataFrame
df2 = pd.DataFrame.sparse.from_spmatrix(sparse_matrix)

# Set the column names
df2.columns = attributes

# Print the DataFrame
print(df2)


      A.A8  C.C9  B.B12  C.C11  C.C5  C.C7  B.B2  B.B3  D.D16  A.A7  ...  \
0        0     0      0      0     0     0     0     0      0     0  ...   
1        0     0      0      0     0     0     0     0      0     0  ...   
2        0     0      0      0     0     0     0     0      0     0  ...   
3        0     0      0      0     0     0     0     0      0     0  ...   
4        0     0      0      0     0     0     0     0      0     0  ...   
...    ...   ...    ...    ...   ...   ...   ...   ...    ...   ...  ...   
1698     0     0      0      0     0     0   1.0     0      0     0  ...   
1699     0     0      0      0     0     0   1.0     0      0     0  ...   
1700     0     0      0      0     0     0   1.0     0      0     0  ...   
1701     0   1.0      0      0     0     0   1.0     0      0     0  ...   
1702     0     0      0      0     0     0   1.0     0      0     0  ...   

      workers  working  world  writer  writers  www  year  years  yesterday  \
0       

## Loading Medical Dataset

In [3]:
import pandas as pd
from scipy.sparse import lil_matrix
import re

# Path to the ARFF file
arff_file_path = r'medical.arff'

# Read the ARFF file
with open(arff_file_path, 'r') as f:
    lines = f.readlines()

# Initialize variables to store the attributes and data
attributes = []
data_lines = []

# Flag to indicate when data section starts
data_section = False

# Process each line to extract attributes and data
for line in lines:
    line = line.strip()
    if line.startswith('@attribute'):
        attr_name = line.split()[1]
        attributes.append(attr_name)
    elif line.startswith('@data'):
        data_section = True
    elif data_section:
        data_lines.append(line)

# Find the total number of attributes
num_attributes = len(attributes)

# Create a sparse matrix
sparse_matrix = lil_matrix((len(data_lines), num_attributes))

# Regular expression to parse the sparse format
sparse_pattern = re.compile(r'(\d+)\s+(\d+)')

# Fill the sparse matrix with data
for i, row in enumerate(data_lines):
    matches = sparse_pattern.findall(row)
    for index, value in matches:
        sparse_matrix[i, int(index)] = int(value)

# Convert the sparse matrix to a dense DataFrame
df3 = pd.DataFrame.sparse.from_spmatrix(sparse_matrix)

# Set the column names
df3.columns = attributes

# Print the DataFrame
print(df3)


       -  /  0  00  04  0;  0cm    1  1-1/2  1-1/2-year  ...  Class-35-493_90  \
0      0  0  0   0   0   0    0    0      0           0  ...                0   
1    1.0  0  0   0   0   0    0  1.0      0           0  ...                0   
2    1.0  0  0   0   0   0    0  1.0      0           0  ...                0   
3    1.0  0  0   0   0   0    0    0      0           0  ...                0   
4      0  0  0   0   0   0    0  1.0      0           0  ...                0   
..   ... .. ..  ..  ..  ..  ...  ...    ...         ...  ...              ...   
973    0  0  0   0   0   0    0    0      0           0  ...              1.0   
974  1.0  0  0   0   0   0    0    0      0           0  ...                0   
975  1.0  0  0   0   0   0    0    0      0           0  ...                0   
976    0  0  0   0   0   0    0    0      0           0  ...                0   
977  1.0  0  0   0   0   0    0    0      0           0  ...                0   

     Class-36-788_30  Class

In [4]:
df1.shape

(19301, 1026)

In [5]:
df2.shape

(1703, 1054)

In [6]:
df3.shape

(978, 1494)

## Evaluation Metrics

In [7]:
import numpy as np
from sklearn.metrics import hamming_loss, average_precision_score

# Hamming Loss
def compute_hamming_loss(y_true, y_pred):
    return hamming_loss(y_true, y_pred)

# One Error
def compute_one_error(y_true, y_prob):
    one_error = 0
    for i in range(len(y_true)):
        top_label = np.argmax(y_prob[i])
        if y_true[i][top_label] == 0:
            one_error += 1
    return one_error / len(y_true)

# Coverage
def compute_coverage(y_true, y_prob):
    coverage = 0
    for i in range(len(y_true)):
        sorted_indices = np.argsort(y_prob[i])[::-1]
        true_indices = np.where(y_true[i] == 1)[0]
        if len(true_indices) == 0:
            continue  # Skip if there are no true labels for this sample
        max_index = max([np.where(sorted_indices == idx)[0][0] for idx in true_indices])
        coverage += max_index
    return (coverage + 1) / len(y_true)

# Ranking Loss
def compute_ranking_loss(y_true, y_prob):
    ranking_loss = 0
    total_pairs = 0
    for i in range(len(y_true)):
        relevant = np.where(y_true[i] == 1)[0]
        irrelevant = np.where(y_true[i] == 0)[0]
        if len(relevant) == 0 or len(irrelevant) == 0:
            continue  # Skip if either relevant or irrelevant labels are empty
        total_pairs += len(relevant) * len(irrelevant)
        for rel in relevant:
            for irrel in irrelevant:
                if y_prob[i][rel] <= y_prob[i][irrel]:
                    ranking_loss += 1
    if total_pairs == 0:
        return 0  # Return 0 if there are no pairs to compute loss
    return ranking_loss / total_pairs




# Average Precision
def compute_average_precision(y_true, y_prob):
    return np.mean([average_precision_score(y_true[i], y_prob[i]) for i in range(len(y_true))])


## ML-KNN

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, hamming_loss

def ml_knn(S, k, s):
    # Split the dataset into features and labels
    labels = S.iloc[:, :20].values
    features = S.iloc[:, 20:].values

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # Initialize variables
    m = y_train.shape[1]  # Number of labels
    N = len(y_train)  # Number of training samples
    Y_pred = []
    Y_prob = []

    # Computing the prior probabilities P(Hj)
    P_Hj = np.zeros(m)
    for j in range(m):
        Cj = np.sum(y_train[:, j])
        P_Hj[j] = (Cj + s) / (N + s * 2)

    P_Hj_bar = 1 - P_Hj

    # Computing the posterior probabilities P(Ej|Hj), P(Ej|H̅j)
    P_E_Hj = np.zeros((m, k+1))
    P_E_Hj_bar = np.zeros((m, k+1))
    neighbors = NearestNeighbors(n_neighbors=k).fit(X_train)

    for j in range(m):
        c = np.zeros(k + 1)
        c_bar = np.zeros(k + 1)
        for index in range(N):
            distances, indices = neighbors.kneighbors([X_train[index]], return_distance=True)
            delta = np.sum(y_train[indices[0], j])
            if y_train[index, j] == 1:
                c[int(delta)] += 1
            else:
                c_bar[int(delta)] += 1

        for i in range(k + 1):
            P_E_Hj[j][i] = (c[i] + s) / (np.sum(c) + s * (k + 1))
            P_E_Hj_bar[j][i] = (c_bar[i] + s) / (np.sum(c_bar) + s * (k + 1))

    # Making predictions for the test set
    for index in range(len(X_test)):
        N_k = neighbors.kneighbors([X_test[index]], return_distance=False)
        E_prime = np.sum(y_train[N_k[0]], axis=0)

        Y_temp = []
        Y_temp_prob = []
        for j in range(m):
            if E_prime[j] > k:
                E_prime[j] = k  # Cap the value at k

            prob_Hj = P_Hj[j] * P_E_Hj[j][int(E_prime[j])]
            prob_Hj_bar = P_Hj_bar[j] * P_E_Hj_bar[j][int(E_prime[j])]

            # Normalize to get probabilities
            total_prob = prob_Hj + prob_Hj_bar
            prob_label_1 = prob_Hj / total_prob

            if prob_label_1 > 0.5:
                Y_temp.append(1)
            else:
                Y_temp.append(0)

            Y_temp_prob.append(prob_label_1)

        Y_pred.append(Y_temp)
        Y_prob.append(Y_temp_prob)


    return y_test, Y_pred, Y_prob




In [9]:
import csv

def save_results_to_csv(y_test, y_pred, y_prob, file_name):
    with open(file_name, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['y_test', 'y_pred', 'y_prob'])
        for y_t, y_p, y_pr in zip(y_test, y_pred, y_prob):
            writer.writerow([y_t, y_p, y_pr])

In [10]:
# Example usage:
# Assuming 'df' is your DataFrame with the first 20 columns as labels and the rest as features
y_test1, Y_pred1 , y_prob1 = ml_knn(df1, k=7, s=1.0)

In [11]:
save_results_to_csv(y_test1, Y_pred1, y_prob1, 'results1.csv')
# files.download('results1.csv')

In [12]:
# Example usage:
# Assuming 'df' is your DataFrame with the first 20 columns as labels and the rest as features
y_test2, Y_pred2 , y_prob2 = ml_knn(df2, k=7, s=1.0)

In [13]:
save_results_to_csv(y_test2, Y_pred2, y_prob2, 'results2.csv')
# files.download('results2.csv')

In [14]:
# Example usage:
# Assuming 'df' is your DataFrame with the first 20 columns as labels and the rest as features
y_test3, Y_pred3 , y_prob3 = ml_knn(df3, k=7, s=1.0)

In [15]:
save_results_to_csv(y_test3, Y_pred3, y_prob3, 'results3.csv')
# files.download('results3.csv')

## Evaluating 20NG Dataset

In [16]:
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.preprocessing import MultiLabelBinarizer

# Hamming Loss
hamming = compute_hamming_loss(y_test1, Y_pred1)

# One Error
one_error = compute_one_error(y_test1, y_prob1)

# Coverage
coverage = compute_coverage(y_test1, y_prob1)

# Ranking Loss
ranking_loss = compute_ranking_loss(y_test1, y_prob1)

# Average Precision
average_precision = label_ranking_average_precision_score(y_test1, y_prob1)

print(f"Hamming Loss: {hamming:.4f}")
print(f"One Error: {one_error:.4f}")
print(f"Coverage: {coverage:.4f}")
print(f"Ranking Loss: {ranking_loss:.4f}")
print(f"Average Precision: {average_precision:.4f}")


Hamming Loss: 0.0407
One Error: 0.5242
Coverage: 3.6037
Ranking Loss: 0.1862
Average Precision: 0.5928


## Evaluating ENRON Dataset

In [17]:
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.preprocessing import MultiLabelBinarizer

# Hamming Loss
hamming = compute_hamming_loss(y_test2, Y_pred2)

# One Error
one_error = compute_one_error(y_test2, y_prob2)

# Coverage
coverage = compute_coverage(y_test2, y_prob2)

# Ranking Loss
ranking_loss = compute_ranking_loss(y_test2, y_prob2)

# Average Precision
average_precision = label_ranking_average_precision_score(y_test2, y_prob2)

print(f"Hamming Loss: {hamming:.4f}")
print(f"One Error: {one_error:.4f}")
print(f"Coverage: {coverage:.4f}")
print(f"Ranking Loss: {ranking_loss:.4f}")
print(f"Average Precision: {average_precision:.4f}")


Hamming Loss: 0.0630
One Error: 0.3314
Coverage: 2.9795
Ranking Loss: 0.0846
Average Precision: 0.7618


## Evaluating Medical Dataset

In [18]:
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.preprocessing import MultiLabelBinarizer

# Hamming Loss
hamming = compute_hamming_loss(y_test3, Y_pred3)

# One Error
one_error = compute_one_error(y_test3, y_prob3)

# Coverage
coverage = compute_coverage(y_test3, y_prob3)

# Ranking Loss
ranking_loss = compute_ranking_loss(y_test3, y_prob3)

# Average Precision
average_precision = label_ranking_average_precision_score(y_test3, y_prob3)

print(f"Hamming Loss: {hamming:.4f}")
print(f"One Error: {one_error:.4f}")
print(f"Coverage: {coverage:.4f}")
print(f"Ranking Loss: {ranking_loss:.4f}")
print(f"Average Precision: {average_precision:.4f}")


Hamming Loss: 0.0296
One Error: 0.6071
Coverage: 0.7041
Ranking Loss: 0.0468
Average Precision: 0.9376
