In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score, confusion_matrix
import itertools

df_train = pd.read_csv("data/BBC News Train.csv")
df_test = pd.read_csv("data/BBC News Test.csv")


In [2]:
#encode testing labels
id_items, cat_items= df_train.Category.factorize()
df_train['category_id'] = id_items
# track_item = set()
# id_list = [id for id in id_items if id not in track_item and not track_item.add(id)]
# track_cat = set()
# cat_list = [cat for cat in cat_items if cat not in track_cat and not track_cat.add(cat)]
# category_to_id = dict(zip(id_list, track_cat))
category_to_id = dict(enumerate(pd.Categorical(cat_items).categories))


In [3]:
category_to_id
df_train

Unnamed: 0,ArticleId,Text,Category,category_id
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0
...,...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment,4
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment,4
1487,1590,weak dollar hits reuters revenues at media gro...,business,0
1488,1587,apple ipod family expands market apple has exp...,tech,1


In [4]:
tokenizer_model = TfidfVectorizer(sublinear_tf=True, min_df = 3, norm = 'l2', encoding ='latin-1', ngram_range=(1,2),
                                              stop_words = 'english')

word_model = tokenizer_model.fit_transform(df_train.Text)
model_NMF = NMF(n_components=5,init='nndsvd', solver='cd',
                random_state=42,alpha_W=0.0, alpha_H=0.01, l1_ratio=0.0, max_iter=500)
W = model_NMF.fit_transform(word_model)
H = model_NMF.components_



In [5]:
# def label_permute_compare(ytdf,yp,n=5):
#     """
#     ytdf: labels dataframe object. These are the true labels
#     yp: NMF label prediction output. a numpy array containing the index of
#     the label with the highest score from the W matrix in the NMF_execute method
#     Returns permuted label order and accuracy. 
#     Example output: (3, 4, 1, 2, 0), 0.74 
#     """
#     label_permutation = itertools.permutations(range(n))
#     unique_labels = ytdf['category_id'].unique()
#     #now associate a key of label for each permutation
#     best_perm = None
#     best_acc = 0
#     for perm in label_permutation:
#         key = dict(zip(unique_labels, perm))
#         #map the key to the ytru data
#         ytrue_mapped = ytdf['category_id'].map(key)
#         accuracy = accuracy_score(ytrue_mapped, yp)
#         if accuracy>best_acc:
#             best_acc = accuracy
#             best_perm = perm
#     return best_perm, best_acc

In [6]:
def label_permute_compare(ytdf, yp, n=5):
    label_permutation = itertools.permutations(range(n))
    unique_labels = ytdf['category_id'].unique()
    
    best_perm = None
    best_acc = 0
    for perm in label_permutation:
        # Create mapping from predicted cluster IDs to true category IDs
        key = dict(zip(range(n), perm))  # Map cluster 0->perm[0], cluster 1->perm[1], etc.
        
        # Map the predicted labels using this permutation
        yp_mapped = pd.Series(yp).map(key)  # Convert to Series for .map() method
        
        accuracy = accuracy_score(ytdf['category_id'], yp_mapped)
        if accuracy > best_acc:
            best_acc = accuracy
            best_perm = perm
    return best_perm, best_acc

In [7]:
predicted_topics = W.argmax(axis=1)
perm, acc = label_permute_compare(df_train, predicted_topics)

print(f'The best permutation is {perm},\
      \n which corresponds to {[category_to_id[it] for it in perm]}\
      \n and has an accuracy of {acc}')

The best permutation is (1, 3, 2, 4, 0),      
 which corresponds to ['entertainment', 'sport', 'politics', 'tech', 'business']      
 and has an accuracy of 0.9516778523489933


In [8]:
test_instance = tokenizer_model.transform(df_test.Text)

In [9]:
W_test = model_NMF.transform(test_instance)

In [10]:
predicted_topics = W_test.argmax(axis=1)

In [11]:
# Apply the same mapping logic as in the training evaluation
cluster_to_category = dict(zip(range(5), perm))  # {0: 1, 1: 3, 2: 2, 3: 4, 4: 0}
final_topic_ids = [cluster_to_category[cluster_id] for cluster_id in predicted_topics]

# Then convert to category names
results = pd.DataFrame()
results['ArticleId'] = df_test['ArticleId'].copy()
results['Category'] = [category_to_id[cat_id] for cat_id in final_topic_ids]
results.to_csv('BBC NEW Solution.csv', index=False)