# 1) Setup

## Install some important librairies

In [2]:
!pip install huggingface bertopic



## Mount on Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 2) Data handling

## Load data

In [4]:
# Check the current directory
import os
print(os.getcwd())

/content


In [5]:
!pip install jsonlines pandas
import pandas as pd
import jsonlines

# Define file paths
jsonl_file_path = '/content/drive/MyDrive/Saniia/Huff_news/news.jsonl'
csv_file_path = '/content/drive/MyDrive/Saniia/Huff_news/news.csv'

# Step 1: Read the .jsonl file into a pandas DataFrame
data = []
with jsonlines.open(jsonl_file_path) as reader:
    for obj in reader:
        data.append(obj)

df = pd.DataFrame(data)

# Step 2: Save the DataFrame as a CSV file
df.to_csv(csv_file_path, index=False)

# Step 3: Load the CSV file with pandas (optional, for verification)
df_loaded = pd.read_csv(csv_file_path)



In [6]:
# Show dataframe shape
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [7]:
print("We have",len(df),"articles in our dataset")

We have 209527 articles in our dataset


## Create the dataset

Let's select a sample of articles to better understand the content choices, comparing headlines with short descriptions:

In [8]:
for i in range(0,len(df),15000):
  print ("Headline:",df['headline'][i])
  print ("Short description:",df['short_description'][i])
  print("-------------------------------------------")

Headline: Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters
Short description: Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
-------------------------------------------
Headline: Feds Arrest Arizona Man For Selling Armor-Piercing Ammo To Las Vegas Shooter
Short description: Douglas Haig allegedly met shooter Stephen Paddock multiple times.
-------------------------------------------
Headline: A Lynda Carter Appearance In 'Wonder Woman 2' Has Already Been Discussed
Short description: The actress said she wasn't in the first film due to timing issues.
-------------------------------------------
Headline: Elon Musk Backs Oil Magnate Rex Tillerson For Secretary Of State
Short description: The clean energy titan may be playing politics since joining President Donald Trump's economic advisory team.
-------------------------------------------
Headline: The Ag

Let's use only headlines :

In [9]:
dataset = df['headline'] # + df['short_description']

In [10]:
dataset.head()

Unnamed: 0,headline
0,Over 4 Million Americans Roll Up Sleeves For O...
1,"American Airlines Flyer Charged, Banned For Li..."
2,23 Of The Funniest Tweets About Cats And Dogs ...
3,The Funniest Tweets From Parents This Week (Se...
4,Woman Who Called Cops On Black Bird-Watcher Lo...


# 3) Modeling

## Build a GenAI

In [11]:
# Enter Hugging Face token

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Define our model
from torch import cuda
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Check if GPU is used
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

print(device)

cuda:0


In [13]:
# Making the model light to run faster and better performance
!pip install bitsandbytes
from torch import bfloat16
import transformers


config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)



In [14]:
# Load the associated Tokenizer of LLaMA-2
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Load the model
model = transformers.AutoModelForCausalLM.from_pretrained(model_name,
                                                          trust_remote_code=True,
                                                          quantization_config=config,
                                                          device_map="auto")

# Go to evaluation mode
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [15]:
# Create a pipeline for a text generation task
generator = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.15,
    repetition_penalty=1.1
)

In [16]:
# Let's test it
prompt = "What is Madagascar?"
result = generator(prompt)
print(result[0]['generated_text'])

What is Madagascar?
 Unterscheidung zwischen Madagaskar und Afrika. 2017-03-06. Retrieved from <https://www.google.de/search?q=madagaskar+africa&tbs=bks%3A1%2Ccdr%3A1%2Cict%3A1%2Cjcs%3A1%2Cnd%3A1%2Csln%3A4%2Ctlp%3A1%2Ctls%3A1%2Cuat%3A1%2Cvnd%3A1%3Bdme%3DAGO%3Brt%3D1%2Cmh%3D1%2Cpri%3D1%2Cvis%3D1%2Cfiletype%3Doff%2Cq%3DMadagaskar%2B%25E2%2580%2593%2BAfrika>.

In this article, the author discusses the distinction between Madagascar and Africa. The author explains that while Madagascar is an island located off the coast of East Africa, it is not part of the African continent. The author provides several reasons for this distinction, including geographical location, political status, and cultural differences. The author also notes that despite its distance from the African mainland, Madagascar has strong cultural and historical ties to the African continent.


In [17]:
# Define the role of the assistant
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful and honest assistant for labeling categories of news headlines.
<</SYS>>
"""

In [18]:
# Example of use
example_prompt = """I have a category that contains the following news headline:
"The Most Hilarious Misadventures of Pets in 2024.
[/INST] Comedy
"""


In [19]:
# Main instruction to generate labels
main_prompt = """
[INST]
I have a category that contains the following headlines:
[HEADLINES]

The category is described by the following keywords: '[KEYWORDS]'.

Based on the information about the category above, please create a very short label of this category. Make sure you to only return the category in at most 2 to 3 words and nothing more.
[/INST]
"""

In [20]:
prompt = system_prompt + example_prompt + main_prompt

In [21]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(dataset, show_progress_bar=True)

Batches:   0%|          | 0/6548 [00:00<?, ?it/s]

In [22]:
embeddings

array([[-0.05784671, -0.00369158, -0.01192453, ..., -0.05685468,
         0.05111515,  0.03365831],
       [-0.0120551 , -0.02122177, -0.02003092, ..., -0.05354525,
         0.03994128,  0.03604829],
       [-0.06177127, -0.04570142,  0.02375841, ...,  0.00666955,
         0.04155824,  0.07206292],
       ...,
       [-0.02847751, -0.00487951,  0.01970425, ..., -0.00149732,
         0.03206749,  0.01325438],
       [-0.01697166,  0.02579027,  0.01404026, ...,  0.00273565,
         0.04624968,  0.03121976],
       [-0.06650429, -0.01602458, -0.0371992 , ..., -0.02336792,
        -0.04148348,  0.0206691 ]], dtype=float32)

## Clustering

In [23]:
from umap import UMAP
from hdbscan import HDBSCAN

# Intialize models for clustering
umap_model = UMAP(n_neighbors=100, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [24]:
# Pre-reduce embeddings for visualization purposes
#reduced_embeddings = UMAP(n_neighbors=100, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

In [25]:
# Let's take 2 others models to compare with our LLaMa2

from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# KeyBERT
keybert = KeyBERTInspired()

# MMR
mmr = MaximalMarginalRelevance(diversity=0.2)

# Text generation with Llama 2
llama2 = TextGeneration(generator, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert,
    "Llama2": llama2,
    "MMR": mmr,
}

In [26]:
from bertopic import BERTopic

topic_model = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(dataset, embeddings)

2024-08-19 11:17:21,807 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-19 11:33:56,155 - BERTopic - Dimensionality - Completed ✓
2024-08-19 11:33:56,162 - BERTopic - Cluster - Start clustering the reduced embeddings
  pid = os.fork()
2024-08-19 11:34:53,897 - BERTopic - Cluster - Completed ✓
2024-08-19 11:34:53,955 - BERTopic - Representation - Extracting topics from clusters using representation models.
  9%|▉         | 9/99 [00:19<03:19,  2.21s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 99/99 [03:31<00:00,  2.14s/it]
2024-08-19 11:38:37,602 - BERTopic - Representation - Completed ✓


In [27]:
# Show topics by model
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Llama2,MMR,Representative_Docs
0,-1,113626,-1_the_to_of_in,"[the, to, of, in, and, for, trump, on, is, with]","[trump, photo, gop, our, their, american, wome...","[Politics, , , , , , , , , ]","[of, trump, with, new, from, how, photos, what...",[Russia Says It Was In Touch With Trump Campai...
1,0,8834,0_mom_my_kids_parents,"[mom, my, kids, parents, parenting, mother, da...","[parenting, motherhood, moms, mom, mothers, mo...","[Parenting Humor, , , , , , , , , ]","[mom, kids, parents, parenting, dad, moms, par...","[I Love Kids, But I Don't Want To Be A Mom, Pa..."
2,1,7141,1_travel_photos_hotels_airlines,"[travel, photos, hotels, airlines, hotel, flig...","[travel, destinations, getaways, travelers, tr...","[Travel, , , , , , , , , ]","[travel, hotels, airlines, vacation, best, des...",[The 5 Best (And Most Affordable) Places To Tr...
3,2,6127,2_meditation_you_your_life,"[meditation, you, your, life, happiness, how, ...","[meditation, meditate, mindfulness, mindful, m...","[Mindfulness & Self-Care, , , , , , , , , ]","[meditation, life, happiness, mindfulness, fea...",[5 Ways to Get Your Meditation Practice on Tra...
4,3,5612,3_nfl_olympic_olympics_football,"[nfl, olympic, olympics, football, game, bowl,...","[nfl, patriots, kaepernick, seahawks, football...","[Sports News, , , , , , , , , ]","[nfl, olympic, olympics, nba, sports, lebron, ...",[Trump Says NFL Ratings Are 'Way Down' Due To ...
...,...,...,...,...,...,...,...,...
94,93,168,93_terror_attacks_paris_attack,"[terror, attacks, paris, attack, brussels, fra...","[terrorists, terrorism, bombings, paris, terro...","[Terrorism, , , , , , , , , ]","[terror, paris, belgium, suspects, terrorism, ...","[Analyzing the Paris Terror Attack, Belgium Ch..."
95,94,167,94_drag_rupaul_race_queens,"[drag, rupaul, race, queens, queen, stars, ref...","[rupaul, contestants, dragcon, drag, trixie, o...","[Drag Culture, , , , , , , , , ]","[drag, rupaul, race, queen, stars, reflects, r...",[6 Questions With MILK from Rupaul's Drag Race...
96,95,166,95_cosby_bill_assault_sexual,"[cosby, bill, assault, sexual, deposition, def...","[cosby, accusers, allegations, bill, raped, te...","[Cosby Trial, , , , , , , , , ]","[cosby, defamation, accusers, dickinson, janic...","[Bill Cosby's Fall From Grace, Bill Cosby Cosb..."
97,96,163,96_funniest_tweets_20_week,"[funniest, tweets, 20, week, women, this, from...","[tweets, twitter, funniest, laughingwhileblack...","[Funny Tweets, , , , , , , , , ]","[funniest, tweets, 20, women, oct, sept, manzi...","[The 20 Funniest Tweets From Women This Week, ..."


In [30]:
topic_model.generate_topic_labels(nr_words=3, topic_prefix=True)

['-1_the_to_of',
 '0_mom_my_kids',
 '1_travel_photos_hotels',
 '2_meditation_you_your',
 '3_nfl_olympic_olympics',
 '4_gay_lgbt_trans',
 '5_fashion_photos_style',
 '6_swift_taylor_music',
 '7_police_shooting_cops',
 '8_dog_cat_dogs',
 '9_recipes_recipe_make',
 '10_clinton_hillary_sanders',
 '11_trump_donald_twitter',
 '12_home_craft_photos',
 '13_wedding_bride_weddings',
 '14_climate_change_epa',
 '15_wall_24_credit',
 '16_obamacare_care_health',
 '17_divorce_divorced_after',
 '18_sleep_sleeping_apnea',
 '19_isis_syria_syrian',
 '20_workout_fitness_exercise',
 '21_snl_fallon_jimmy',
 '22_kardashian_kim_jenner',
 '23_hurricane_sandy_storm',
 '24_holiday_christmas_santa',
 '25_cocktails_wine_beer',
 '26_hair_skin_beauty',
 '27_immigration_ban_daca',
 '28_cancer_breast_prostate',
 '29_obama_michelle_barack',
 '30_middleton_prince_kate',
 '31_nasa_space_mars',
 '32_gun_nra_guns',
 '33_cruz_ted_rubio',
 '34_democrats_gop_race',
 '35_art_artist_gallery',
 '36_marriage_cheating_infidelity',
 

In [33]:
# Count unique values in the 'Topic' column (for checking)
num_unique_topics = df['category'].nunique()

print("Number of unique topics:", num_unique_topics)

Number of unique topics: 42


In [48]:
# model.reduce_topics(df['headline'], nr_topics=42)
# model.visualize_topics(0)

Let's compare output by model

In [49]:
llama2_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Llama2"].values()]
MMR_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["MMR"].values()]
KeyBERT_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["KeyBERT"].values()]

In [36]:
llama2_labels

['Politics',
 'Parenting Humor',
 'Travel',
 'Mindfulness & Self-Care',
 'Sports News',
 'LGBTQ+ Rights',
 'Fashion Week',
 'Music News',
 'Police Shootings',
 'Pet Adventures',
 'Food Recipes',
 'Politics',
 'Political Satire',
 'Home Decor',
 'Wedding Planning',
 'Climate Change',
 'Financial News',
 'Healthcare Reform',
 'Divorce Advice',
 'Sleep Studies',
 'Middle East Conflict',
 'Fitness & Exercise',
 'Comedy Sketch Shows',
 'Celebrity Gossip',
 'Natural Disasters',
 'Holiday Tips',
 'Drinks',
 'Beauty Tips',
 'Immigration Ban',
 'Health',
 'Politics',
 'Royalty News',
 'Space Exploration',
 'Gun Control',
 'Politics',
 'Politics',
 'Art Exhibition',
 'Marriage Issues',
 'Medical News',
 'Abortion Rights',
 'International Crisis',
 'Clergy Abuse',
 'Politics Europe',
 'Pandemic News',
 'Politics',
 'Sexual Assault',
 'Racism',
 'Yoga Practice',
 'Political Satire',
 'Movies & Superheroes',
 'Health Crisis',
 'Awards Season',
 'Celebrity Deaths',
 'Cannabis Legalization',
 'Food &

In [43]:
len(set(llama2_labels))

86

In [37]:
MMR_labels

['of',
 'mom',
 'travel',
 'meditation',
 'nfl',
 'gay',
 'fashion',
 'beyoncé',
 'police',
 'dog',
 'recipes',
 'clinton',
 'trump',
 'craft',
 'wedding',
 'epa',
 'credit',
 'obamacare',
 'divorce',
 'sleep',
 'isis',
 'workout',
 'snl',
 'kardashian',
 'hurricane',
 'christmas',
 'cocktails',
 'hair',
 'immigration',
 'cancer',
 'obama',
 'prince',
 'nasa',
 'nra',
 'cruz',
 'democrats',
 'art',
 'marriage',
 'doctor',
 'abortion',
 'korea',
 'pope',
 'brexit',
 'covid',
 'mueller',
 'assault',
 'charlottesville',
 'yoga',
 'colbert',
 'trailer',
 'flu',
 'oscar',
 'dead',
 'marijuana',
 'diet',
 'apple',
 'deals',
 'weight',
 'uber',
 'gop',
 'weight',
 'divorce',
 'autism',
 'jedi',
 'weekend',
 'opioid',
 'love',
 'hiv',
 'thrones',
 'ebola',
 'journalism',
 'soul',
 'resolutions',
 'trump',
 'grief',
 'iran',
 'donald',
 'kimmel',
 'refugees',
 'homeless',
 'debate',
 'facebook',
 'israel',
 'bachelor',
 'trump',
 'business',
 'stigma',
 'trump',
 'election',
 'instagram',
 'ukr

In [50]:
KeyBERT_labels

['trump',
 'parenting',
 'travel',
 'meditation',
 'nfl',
 'gays',
 'fashion',
 'grammys',
 'cops',
 'dogs',
 'recipes',
 'hillary',
 'trump',
 'decorating',
 'bridal',
 'warming',
 'jpmorgan',
 'obamacare',
 'divorce',
 'sleeping',
 'isis',
 'workouts',
 'snl',
 'kardashian',
 'hurricane',
 'holidays',
 'cocktails',
 'skincare',
 'immigration',
 'cancer',
 'obama',
 'duchess',
 'nasa',
 'guns',
 'rubio',
 'democrats',
 'art',
 'marriage',
 'doctors',
 'abortion',
 'korea',
 'pope',
 'brexit',
 'covid',
 'mueller',
 'assault',
 'charlottesville',
 'yoga',
 'colbert',
 'trailer',
 'flu',
 'oscars',
 'died',
 'marijuana',
 'diet',
 'apple',
 'deals',
 'obesity',
 'uber',
 'republicans',
 'losing',
 'divorced',
 'autism',
 'jedi',
 'movie',
 'opioid',
 'love',
 'aids',
 'thrones',
 'ebola',
 'journalism',
 'gps',
 'resolutions',
 'maher',
 'grief',
 'iran',
 'trump',
 'kimmel',
 'refugees',
 'homeless',
 'debates',
 'zuckerberg',
 'gaza',
 'bachelorette',
 'seth',
 'women',
 'stigma',
 'r

In [38]:
topic_model.set_topic_labels(llama2_labels)

df['predicted_topic']=topics

In [39]:
df

Unnamed: 0,link,headline,category,short_description,authors,date,predicted_topic
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,42
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,1
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,8
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,-1
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,-1
...,...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28,-1
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28,3
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28,3
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28,-1
