
# **Load Dataset**

In [None]:
pip install datasets

In [5]:
from datasets import load_dataset

In [None]:
ds = load_dataset('multi_news')

# **Convert Dataset to Dataframe**

In [7]:
# convert datasets to pandas dataframes
import pandas as pd
ds_train = pd.DataFrame(ds['train'])
ds_test = pd.DataFrame(ds['test'])

# **Remove Punctuation**

In [8]:
import string

# define funtion to remove punctuation.
def remove_punctuation(review):
    review = review.translate(str.maketrans("", "", string.punctuation))
    return review

In [9]:
ds_train['document'] = ds_train['document'].apply(lambda x: remove_punctuation(x))
ds_train['summary'] = ds_train['summary'].apply(lambda x: remove_punctuation(x))

In [10]:
ds_test['document'] = ds_test['document'].apply(lambda x: remove_punctuation(x))
ds_test['summary'] = ds_test['summary'].apply(lambda x: remove_punctuation(x))

# **Convert Dataframe to Dataset**

In [11]:
# convert dataframes to datasets
from datasets import Dataset, DatasetDict
# assign the splits
train = Dataset.from_pandas(ds_train)
test = Dataset.from_pandas(ds_test)
# reconstruct both datasets into a Dataset Dict object
processed_ds = DatasetDict(
    {
        'train': train,
        'test': test
    }
)
# view the resulting dataset dict object
processed_ds

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

# **Import Dependencies:**

In [12]:
import sklearn

In [13]:
# import TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [27]:
documents = processed_ds.data

In [21]:
print(type(processed_ds))
print(type(ds_dict))

<class 'datasets.dataset_dict.DatasetDict'>
<class 'dict'>


In [26]:
print(len(ds_dict['train']))
print(type(ds_dict['train']))

44972
<class 'datasets.table.InMemoryTable'>


In [17]:
type(documents)
documents.keys()

dict_keys(['train', 'test'])

# **TF-IDF**

In [28]:
# set number of features (output vocabulary)
no_features = 100

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'no_features' is a variable you've defined elsewhere with the desired number of features
no_features = 100  # Example value; adjust as needed

tfidf_vectorizer = TfidfVectorizer( #max_df=0.95,  # documents with a term frequency higher than this will be ignored
                                   #min_df=2,     # terms must appear in at least 2 documents to be considered
                                   max_features=no_features,  # the size of the output vocabulary
                                   stop_words='english')  # remove English stop words

# To use the vectorizer, you need to provide it with a dataset to fit and transform
# Example:
# dataset = ["document one text", "document two text", ...]
# tfidf_matrix = tfidf_vectorizer.fit_transform(dataset)

In [35]:
# fit and transform tfidf_vectorizer with dataset
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_ds)
# This tfidf_matrix can then be used as input for various machine learning models.

In [36]:
# inspect features
tfidf_vectorizer.get_feature_names_out()

array(['test', 'train'], dtype=object)

# **NMF**

In [None]:
no_topics = 100

In [None]:
# Number of topics/components
n_components = no_topics

# Instantiate and fit_transform data with NMF
nmf = NMF(n_components=n_components, random_state=1, init='nndsvd')
W = nmf.fit_transform(tfidf_matrix)  # W matrix contains the document-topic distributions
H = nmf.components_  # H matrix contains the topic-term distributions

# Now, W and H can be used for further analysis, like identifying dominant topics for documents

In [None]:
len(H)

In [None]:
# Assuming 'nmf' is your fitted NMF model from the previous example
# and 'tfidf_vectorizer' is the TfidfVectorizer you used to transform your documents

feature_names = tfidf_vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(nmf.components_):
    print(f"Topic #{topic_idx + 1}:")

    # Sort the weights in the topic and get the indices of the top 10 features
    top_feature_indices = topic.argsort()[-10:][::-1]

    # Map the indices to actual words
    top_features = [feature_names[i] for i in top_feature_indices]
    print(" ".join(top_features))