In [18]:
import pandas as pd #
import numpy as np #
import re #
import nltk #
import pyLDAvis
import pyLDAvis.sklearn

import matplotlib.pyplot as plt #


from nltk.stem.porter import PorterStemmer #
from sklearn.feature_extraction.text import TfidfVectorizer #
from sklearn import decomposition #
# from sklearn.decomposition import TruncatedSVD, NMF
# from sklearn.pipeline import Pipeline, make_pipeline
# from sklearn.cluster import KMeans, DBSCAN
# from sklearn.metrics import silhouette_score, accuracy_score
# from sklearn.preprocessing import normalize
# from scipy.sparse import SparseEfficiencyWarning
from sklearn.model_selection import train_test_split #


In [7]:
# One-time setup: uncomment and run once per environment to fetch the NLTK
# data used later in this notebook ('punkt' for nltk.word_tokenize,
# 'stopwords' for the English stop-word list).
# nltk.download('punkt')
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bergs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the BBC News training set; the path is relative to the notebook's working directory.
train_df = pd.read_csv('data/BBC News Train.csv')

In [4]:
# Preview the first three rows of the raw training data.
train_df.head(3)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business


In [6]:
# Column dtypes and non-null counts of the raw training data.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.1+ KB


In [5]:
# Number of articles per labelled category.
train_df['Category'].value_counts()

Category
sport            346
business         336
politics         274
entertainment    273
tech             261
Name: count, dtype: int64

Note: `complaints` -> `Text` (in this dataset the article text lives in the `Text` column).

In [None]:
# function to remove duplicates
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Print how many ``Text`` values are repeats, then return ``df`` without them.

    The first occurrence of each text is kept and the surviving rows keep
    their original index labels; ``df`` itself is not modified.

    NOTE: a function with the same name is defined again in a later cell;
    whichever cell runs last provides the active definition.
    """
    repeated_text = df['Text'].duplicated()

    # report before dropping, so the count refers to the incoming frame
    print(f'No. of duplicate articles: {repeated_text.sum()}\n')

    return df.drop_duplicates(subset = 'Text')

# function to remove non-alphabetical characters
def remove_non_alpha(text):
    """Return ``text`` with everything except ASCII letters and whitespace deleted.

    Characters are removed, not replaced, so ``ex-boss`` becomes ``exboss``.
    """
    non_alpha = re.compile(r'[^a-zA-Z\s]')
    return non_alpha.sub('', text)



In [9]:
# English stop words, held in a set so membership checks in tokenize() are cheap.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Snowball stemmer for English, shared by every tokenize() call.
stemmer = nltk.stem.SnowballStemmer('english')

In [25]:
# function to remove duplicates (run this before tokenization)
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose ``Text`` repeats an earlier row, reporting the count.

    Prints the number of repeated ``Text`` values followed by a blank line,
    then returns a new frame holding only the first occurrence of each text.
    Index labels of the kept rows are left as they were.
    """
    is_repeat = df['Text'].duplicated()
    print(f'No. of duplicate articles: {is_repeat.sum()}\n')

    # keep every row that is not a repeat of an earlier Text value
    return df[~is_repeat]

# function to remove non-alphabetical characters (integrated into tokenize function)
def remove_non_alpha(text):
    """Strip every character that is not an ASCII letter or whitespace.

    Matching characters are deleted outright (no space is inserted), so
    hyphenated words are fused: ``ex-boss`` -> ``exboss``.
    """
    return re.sub(pattern = r'[^a-zA-Z\s]', repl = '', string = text)
    
# function for cleaning, rather than using defaults
def tokenize(text):
    """Turn raw article text into a list of stemmed tokens.

    Steps, in order: delete non-alphabetic characters, split into words with
    ``nltk.word_tokenize``, discard words of three characters or fewer,
    lowercase, discard stop words, and stem what is left.
    """
    letters_only = remove_non_alpha(text)

    stems = []
    for raw_word in nltk.word_tokenize(letters_only):
        # the length filter is applied before lowercasing and stop-word removal
        if len(raw_word) <= 3:
            continue
        word = raw_word.lower()
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return stems

In [26]:
# Drop repeated articles; train_df is rebound to the de-duplicated frame.
train_df = remove_duplicates(train_df)

No. of duplicate articles: 50



In [27]:
# Re-check row count and index after de-duplication.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1440 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1440 non-null   int64 
 1   Text       1440 non-null   object
 2   Category   1440 non-null   object
dtypes: int64(1), object(2)
memory usage: 45.0+ KB


In [28]:
# TF-IDF over unigrams and bigrams; all text cleaning is delegated to tokenize().
vectorizer = TfidfVectorizer(
    tokenizer = tokenize,      # custom cleaning/stemming rather than the defaults
    token_pattern = None,      # turn off token warning
    lowercase = False,         # tokenize() lowercases the tokens itself
    stop_words = None,         # stop words are filtered inside tokenize()
    ngram_range = (1, 2),
    max_df = 0.75,             # max freq of word in documents
    max_features = 1000,
)

In [29]:
# Learn the vocabulary and build the document-term TF-IDF matrix in one step.
tfidf_vectors = vectorizer.fit_transform(train_df['Text'])

In [32]:
# Dense copy of the TF-IDF matrix, shown for inspection.
tfidf_dense = tfidf_vectors.toarray()
tfidf_dense

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04267657, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [34]:
# Vocabulary terms (stems and stem bigrams) kept by the vectorizer.
vectorizer.get_feature_names_out()

array(['abil', 'abl', 'academi', 'accept', 'access', 'accord', 'account',
       'accus', 'achiev', 'across', 'act', 'action', 'activ', 'actor',
       'actress', 'actual', 'ad', 'address', 'admit', 'affair', 'affect',
       'africa', 'agenc', 'agre', 'agreement', 'ahead', 'aim', 'airlin',
       'alan', 'album', 'alleg', 'allow', 'almost', 'along', 'alreadi',
       'also', 'although', 'alway', 'america', 'american', 'among',
       'amount', 'analyst', 'andi', 'andrew', 'announc', 'annual',
       'anoth', 'answer', 'anyth', 'appeal', 'appear', 'appl', 'approach',
       'approv', 'april', 'area', 'argu', 'around', 'arrest', 'arsenal',
       'artist', 'ask', 'associ', 'asylum', 'athlet', 'attack', 'attempt',
       'attend', 'attract', 'audienc', 'august', 'australia',
       'australian', 'author', 'avail', 'averag', 'aviat', 'avoid',
       'award', 'away', 'back', 'ball', 'band', 'bank', 'bankruptci',
       'base', 'battl', 'beat', 'becam', 'becom', 'began', 'begin',
       'be

In [35]:
# Factorise the TF-IDF matrix into five non-negative components (topics).
# random_state is fixed so repeated runs give the same factorisation.
clf = decomposition.NMF(n_components = 5, random_state = 5510)

# W1: document-topic weights returned by fitting on the TF-IDF matrix.
W1 = clf.fit_transform(tfidf_vectors)
# H1: topic-term weights learned by the fit above (must be read after fitting).
H1 = clf.components_

In [36]:
H1
# video: 16:35 - purpose of H matrix
# H1 (clf.components_) has one row per topic; each column holds the weight of one vocabulary term in that topic.

array([[0.00698201, 0.02300131, 0.        , ..., 0.00095059, 0.20736017,
        0.        ],
       [0.01921616, 0.04364995, 0.        , ..., 0.06303685, 0.        ,
        0.09709774],
       [0.00042829, 0.01323801, 0.16615939, ..., 0.0587904 , 0.        ,
        0.00226522],
       [0.01967389, 0.02638005, 0.        , ..., 0.03369955, 0.        ,
        0.00080749],
       [0.03138191, 0.07921708, 0.        , ..., 0.01773869, 0.        ,
        0.        ]])

In [37]:
W1
# video: 17:00 - purpose of W matrix
# W1 has one row per document; each column holds that document's weight on one topic.

array([[0.07292985, 0.00626263, 0.00784127, 0.00979723, 0.00853967],
       [0.14137859, 0.        , 0.        , 0.        , 0.        ],
       [0.07379133, 0.00861941, 0.00063403, 0.04671532, 0.02646402],
       ...,
       [0.12420799, 0.01191492, 0.00250395, 0.        , 0.        ],
       [0.01965571, 0.        , 0.01639135, 0.        , 0.24374781],
       [0.        , 0.        , 0.        , 0.        , 0.15544705]])

In [39]:
# Number of highest-weighted terms to list for each topic.
num_words = 15

# Vocabulary as an array so it can be indexed by the argsort positions below.
vocab = np.array(vectorizer.get_feature_names_out())


def top_words(t):
    """Return the ``num_words`` vocabulary terms with the largest weights in ``t``.

    ``t`` is one row of the topic-term matrix (one weight per vocabulary
    term). Terms are returned from highest to lowest weight.
    """
    # argsort is ascending, so walk the last ``num_words`` positions backwards
    return [vocab[i] for i in np.argsort(t)[:-num_words-1:-1]]


# One list of top terms per topic, then one space-joined string per topic.
topic_words = [top_words(t) for t in H1]
topics = [' '.join(t) for t in topic_words]

In [40]:
# Top terms per topic, one space-joined string per NMF component.
topics

['market firm compani sale growth year bank share profit economi price rate econom china trade',
 'game play england player match champion team injuri wale side ireland first club coach world',
 'film award best star actor nomin oscar actress director festiv movi music includ comedi year',
 'labour elect blair parti tori brown minist govern would prime prime minist howard lord campaign chancellor',
 'mobil phone peopl technolog music use servic user comput softwar digit network broadband game microsoft']

In [41]:
# Columns are labelled by topic number and rows by document position.
# Positions follow the row order of W1, not the index labels of train_df.
colnames = ['Topic' + str(i) for i in range(clf.n_components)]
docnames = ['Doc' + str(i) for i in range(len(train_df.Text))]

# Weights are rounded to two decimals for display only.
doc_topic_df = pd.DataFrame(np.round(W1, 2), columns = colnames, index = docnames)

# Pick the dominant topic from the unrounded weights: after rounding, close
# weights tie and np.argmax would return the first tied column instead of the
# largest one.
significant_topic = np.argmax(W1, axis = 1)
doc_topic_df['dominant_topic'] = significant_topic

In [44]:
# Topic weights and dominant topic for the first ten documents.
doc_topic_df.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.07,0.01,0.01,0.01,0.01,0
Doc1,0.14,0.0,0.0,0.0,0.0,0
Doc2,0.07,0.01,0.0,0.05,0.03,0
Doc3,0.0,0.0,0.0,0.0,0.27,4
Doc4,0.1,0.01,0.03,0.0,0.01,0
Doc5,0.0,0.06,0.0,0.07,0.02,3
Doc6,0.0,0.15,0.0,0.0,0.0,1
Doc7,0.0,0.0,0.19,0.03,0.0,2
Doc8,0.13,0.0,0.0,0.0,0.0,0
Doc9,0.02,0.01,0.13,0.01,0.0,2


In [45]:
# First ten articles with their labelled category, for comparison with the dominant topics.
train_df.head(10)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
5,1582,howard truanted to play snooker conservative...,politics
6,651,wales silent on grand slam talk rhys williams ...,sport
7,1797,french honour for director parker british film...,entertainment
8,2034,car giant hit by mercedes slump a slump in pro...,business
9,1866,fockers fuel festive film chart comedy meet th...,entertainment


In [None]:
# Visualise the fitted NMF topics with pyLDAvis.
# numpy and pyLDAvis are already imported in the notebook's first cell, and the
# fitted objects in this notebook are `clf` (NMF model) and `tfidf_vectors`
# (TF-IDF matrix).

# Extract the topic-term and document-topic matrices
nmf_topics = clf.components_  # shape: (n_topics, n_features)
doc_topic_dist = clf.transform(tfidf_vectors)  # shape: (n_docs, n_topics)

# pyLDAvis expects each row of both distributions to sum to 1. A document with
# no topic weight at all cannot be normalised (0 / 0 -> NaN), so leave it out.
doc_weight_totals = doc_topic_dist.sum(axis=1)
has_topic_weight = doc_weight_totals > 0
doc_topic_dist = doc_topic_dist[has_topic_weight]
doc_weight_totals = doc_weight_totals[has_topic_weight]

# Summed TF-IDF weights stand in for term frequencies and document lengths.
# np.asarray(...).ravel() flattens the result of the sparse .sum() call.
term_freq = np.asarray(tfidf_vectors.sum(axis=0)).ravel()
doc_lengths = np.asarray(tfidf_vectors.sum(axis=1)).ravel()[has_topic_weight]

# Create the data dictionary required by PyLDAvis
data = {
    'topic_term_dists': nmf_topics / nmf_topics.sum(axis=1, keepdims=True),  # Normalize topic-term matrix
    'doc_topic_dists': doc_topic_dist / doc_weight_totals[:, None],  # Normalize document-topic matrix
    'doc_lengths': doc_lengths,  # Document lengths
    'vocab': vectorizer.get_feature_names_out(),  # List of all terms in the vocabulary
    'term_frequency': term_freq  # Term frequencies
}

# Create a PyLDAvis prepared object
vis_data = pyLDAvis.prepare(**data)

# Visualize the NMF topics
pyLDAvis.display(vis_data)  # Opens an interactive display
