
# **Load Dataset**

In [None]:
pip install datasets

In [None]:
from datasets import load_dataset

In [None]:
# Download and load the 'multi_news' dataset. Per the saved cell output it
# provides train (44,972), validation (5,622) and test (5,622) splits.
ds = load_dataset('multi_news')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/295M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5622 [00:00<?, ? examples/s]

# **Convert Dataset to Dataframe**

In [None]:
# Convert the Hugging Face splits to pandas DataFrames.
import pandas as pd

# Dataset.to_pandas() converts the underlying Arrow table directly instead of
# having pandas iterate over the dataset one row at a time.
# NOTE: only the train and test splits are converted; validation is not used here.
ds_train = ds['train'].to_pandas()
ds_test = ds['test'].to_pandas()

# **Remove Punctuation**

In [None]:
import string

# Translation table that deletes every character in string.punctuation. It is
# built once here so it is not rebuilt for every row the function is applied to.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

def remove_punctuation(review: str) -> str:
    """Return ``review`` with all ASCII punctuation characters removed.

    Only the characters listed in ``string.punctuation`` are stripped; non-ASCII
    marks such as the en dash or the ellipsis character are left untouched.

    Args:
        review: Text to clean.

    Returns:
        A new string with the ASCII punctuation characters deleted.
    """
    return review.translate(_PUNCTUATION_TABLE)

In [None]:
# Strip punctuation from both text columns of the training frame.
ds_train['document'] = ds_train['document'].apply(remove_punctuation)
ds_train['summary'] = ds_train['summary'].apply(remove_punctuation)

In [None]:
# Strip punctuation from both text columns of the test frame.
ds_test['document'] = ds_test['document'].apply(remove_punctuation)
ds_test['summary'] = ds_test['summary'].apply(remove_punctuation)

In [None]:
# Summarise the cleaned training frame: row count, columns, non-null counts and dtypes.
ds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44972 entries, 0 to 44971
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  44972 non-null  object
 1   summary   44972 non-null  object
dtypes: object(2)
memory usage: 702.8+ KB


# **Convert Dataframe to Dataset**

In [None]:
# Wrap the cleaned DataFrames back into Hugging Face Dataset objects.
from datasets import Dataset, DatasetDict

# one Dataset per split
train = Dataset.from_pandas(ds_train)
test = Dataset.from_pandas(ds_test)

# bundle the two splits into a single DatasetDict keyed by split name
processed_ds = DatasetDict({'train': train, 'test': test})

# display the resulting DatasetDict
processed_ds

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

# **Import Dependencies:**

In [None]:
import sklearn

In [None]:
# import TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Grab the underlying tables of the DatasetDict. Per the outputs of the cells below,
# this is a plain dict keyed by split name ('train', 'test') whose values are
# datasets InMemoryTable objects.
documents = processed_ds.data

In [None]:
# NOTE: integer indexing on the split's table selects a column, not a row. Per the
# saved output this returns a pyarrow ChunkedArray of texts
# (presumably the 'summary' column, which sits at position 1 — TODO confirm).
documents['train'][1]

<pyarrow.lib.ChunkedArray object at 0x7d98f4766020>
[
  [
    "– The unemployment rate dropped to 82 last month but the economy only added 120000 jobs when 203000 new jobs had been predicted according to todays jobs report Reaction on the Wall Street Journals MarketBeat Blog was swift Woah Bad number The unemployment rate however is better news it had been expected to hold steady at 83 But the AP notes that the dip is mostly due to more Americans giving up on seeking employment",
    "– Shelly Sterling plans eventually to divorce her estranged husband Donald she tells Barbara Walters at ABC News As for her stake in the Los Angeles Clippers she plans to keep it the AP notes Sterling says she would absolutely fight any NBA decision to force her to sell the team The team is her legacy to her family she says To be honest with you Im wondering if a wife of one of the owners … said those racial slurs would they oust the husband Or would they leave the husband in",
    "– A twinengine Embraer

In [None]:
# Show the container types: the DatasetDict itself and its `.data` mapping.
print(type(processed_ds), type(documents), sep='\n')

<class 'datasets.dataset_dict.DatasetDict'>
<class 'dict'>


In [None]:
# Report the row count and the type of the training split's table.
print(len(documents['train']), type(documents['train']), sep='\n')

44972
<class 'datasets.table.InMemoryTable'>


In [None]:
# List the split names held in the mapping. A cell displays only its last
# expression, so the former bare `type(documents)` line had no effect and was dropped.
documents.keys()

dict_keys(['train', 'test'])

# **TF-IDF**

In [None]:
# set number of features (output vocabulary)
# NOTE(review): the TfidfVectorizer cell below has `max_features=no_features`
# commented out, so this value is currently not applied to the vectorizer.
no_features = 100

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `no_features` is set in the cell above; it only takes effect if the
# `max_features` argument below is re-enabled.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,  # ignore terms that occur in more than 95% of the documents
    min_df=2,     # ignore terms that occur in fewer than 2 documents
    # max_features=no_features,  # would cap the size of the output vocabulary
    stop_words='english',  # remove English stop words
)

# NOTE(review): with `max_features` disabled the vocabulary size is not capped.
# The saved output of the feature-inspection cell lists only about a hundred
# terms, so it appears to predate this setting — re-run to refresh (TODO confirm).
# The vectorizer is fitted on the training documents in the next cell.

In [None]:
# Fit the vectorizer on the training documents (punctuation already stripped in the
# cleaning cells above) and transform them into a TF-IDF document-term matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(ds_train['document'])
# This tfidf_matrix can then be used as input for various machine learning models.

In [None]:
# Inspect the features: the vocabulary terms of the fitted vectorizer, returned as an array.
tfidf_vectorizer.get_feature_names_out()

array(['10', 'according', 'american', 'asked', 'associated', 'called',
       'campaign', 'case', 'children', 'city', 'company', 'country',
       'county', 'court', 'day', 'days', 'death', 'department', 'did',
       'family', 'friday', 'going', 'good', 'got', 'government', 'group',
       'health', 'help', 'home', 'house', 'including', 'information',
       'just', 'know', 'later', 'law', 'left', 'life', 'like', 'long',
       'make', 'man', 'media', 'million', 'monday', 'mr', 'national',
       'new', 'news', 'night', 'north', 'obama', 'office', 'officials',
       'people', 'percent', 'photo', 'photos', 'police', 'president',
       'press', 'public', 'really', 'report', 'reported', 'right', 'said',
       'say', 'says', 'school', 'security', 'state', 'statement',
       'states', 'story', 'think', 'thursday', 'time', 'times', 'told',
       'took', 'trump', 'tuesday', 'united', 'university', 'use', 'used',
       'video', 'want', 'washington', 'way', 'wednesday', 'week', 'white',
