In [3]:
!pip install -q scikit-learn spacy nltk langdetect

In [4]:
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
# Read the 
import pandas as pd

news_path = "/kaggle/input/news-dataset/part-00000-4da3abc3-0212-4040-8d15-818baf7e38bf-c000.csv"
news_df = pd.read_csv( news_path, sep=";")
news_df.head()

Unnamed: 0,id,title,link,author,description,pubDate,content,tags
0,https://www.africaintelligence.com/central-afr...,Central African Republic : The nebulous web of...,https://www.africaintelligence.com/central-afr...,,"When it comes to economic diplomacy, the Centr...","Fri, 13 Jun 2025 04:40:00 GMT",,[]
1,https://www.africaintelligence.com/west-africa...,Guinea : Conakry sets foundation for big plans...,https://www.africaintelligence.com/west-africa...,,"The Guinean embassy in Washington, headed by S...","Fri, 13 Jun 2025 04:40:00 GMT",,[]
2,https://www.africaintelligence.com/west-africa...,Niger : General Tchiani walks a tightrope to m...,https://www.africaintelligence.com/west-africa...,,As the security situation continues to deterio...,"Fri, 13 Jun 2025 04:40:00 GMT",,[]
3,https://www.africaintelligence.com/north-afric...,Morocco : Energy firm Nareva on look out for n...,https://www.africaintelligence.com/north-afric...,,Nareva is back to square one as it seeks a new...,"Fri, 13 Jun 2025 04:40:00 GMT",,[]
4,https://www.africaintelligence.com/north-afric...,Libya : Arkenu Oil and Bares Holding negotiate...,https://www.africaintelligence.com/north-afric...,,"The Arabian Gulf Oil Company, or AGOCO, is cur...","Fri, 13 Jun 2025 04:40:00 GMT",,[]


# Topic Modeling

In [7]:
import re
import spacy

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import numpy as np

# Importing Gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [8]:
from warnings import filterwarnings
filterwarnings('ignore')

In [9]:
corpus = news_df['description'].values

In [10]:
from langdetect import detect
detect(corpus[8])

'en'

In [11]:
import spacy
nlp = spacy.load("fr_core_news_sm")
en_nlp = spacy.load('en_core_web_sm')

In [12]:
import re

french_regex_to_exclude = [
    r"africa presse",
    r'<a\s+href=["]{1,2}[^"]+["]{1,2}>?',
    r"<p>",
    r"</a>",
    r"</p>",
    r"https://\S+\.com",
    r"\n"
]
def exclude_regex(text):
    for regex in french_regex_to_exclude:
        text= re.sub(regex,"", text)
    return text
    
def lemmatize_token(token):
  return token.lemma_.strip()

In [19]:
def process_french_text(text):
    
    text = exclude_regex(text.lower())
    complete_doc  = nlp(text)

    words = [
         lemmatize_token(token)
         for token in complete_doc
         if not token.is_stop and not token.is_punct
     ]

    
    return " ".join(words)

In [20]:
def process_english_text(text):
    complete_doc  = en_nlp(text)

    words = [
         lemmatize_token(token)
         for token in complete_doc
         if not token.is_stop and not token.is_punct
     ]  
    
    return " ".join(words)

In [21]:
def process_text(text):
    lang = detect(text)
    if lang =="fr":
        clean_text = process_french_text(text)
    elif lang == "en":
        clean_text = process_english_text(text)
    return clean_text
        

In [22]:
clean_corpus = [process_text(text) for text in corpus]

In [24]:
vectorizer = CountVectorizer()
doc_term_matrix = vectorizer.fit_transform(clean_corpus)

In [27]:
lda = LatentDirichletAllocation(n_components=4, random_state=42)
lda.fit(doc_term_matrix)

In [28]:
# Step 3: Display topics
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]])

Topic 0:
['minerai', 'appeared', 'first', 'the', 'post']
Topic 1:
['washington', 'appeared', 'first', 'the', 'post']
Topic 2:
['burundi', 'juin', 'oil', 'company', 'national']
Topic 3:
['power', 'touadéra', 'union', 'african', 'africain']


In [35]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

texts = [text.split(" ") for text in clean_corpus]  # tokenized documents

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(clean_corpus)
feature_names = vectorizer.get_feature_names_out()

scores = []
for k in range(2, 21):
    lda = LatentDirichletAllocation(n_components=k, random_state=42)
    lda.fit(X)
    
    topics = []
    for topic_weights in lda.components_:
        top_terms = topic_weights.argsort()[:-11:-1]
        topics.append([feature_names[i] for i in top_terms])

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    cm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    scores.append((k, cm.get_coherence()))


In [37]:
print(scores)

[(2, 0.40788820232734235), (3, 0.6290474463186028), (4, 0.6682634563004438), (5, 0.6887662312794809), (6, 0.6608813284295615), (7, 0.6259049855437218), (8, 0.7717654334444232), (9, 0.7052254998814608), (10, 0.7097093941697181), (11, 0.7374645737174972), (12, 0.6305461464148191), (13, 0.6127261311035554), (14, 0.6915442704872461), (15, 0.66184633130168), (16, 0.657938594274794), (17, 0.627150990701944), (18, 0.6486039340023637), (19, 0.6006655884062354), (20, 0.6816662185758157)]


### Visualize the clustering result

In [31]:
import pyLDAvis.lda_model

pyLDAvis.enable_notebook()

panel = pyLDAvis.lda_model.prepare(lda, doc_term_matrix, vectorizer)
pyLDAvis.save_html(panel, 'lda_viz.html')


In [30]:
pip install pyLDAvis

Collecting scipy (from pyLDAvis)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.2
    Uninstalling scipy-1.15.2:
      Successfully uninstalled scipy-1.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.
cesium 0.12.4 requires n

In [None]:
import pyLDAvis
pyLDAvis.gensimvis.prepare(Lda, corpus, dict_)

In [32]:
from IPython.core.display import display, HTML

display(HTML("/kaggle/working/lda_viz.html"))

# Knowledge graph

In [4]:
!pip install -q mistralai python_dotenv langchain langchain_community

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.1/374.1 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.1/438.1 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.0/363.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fol

In [15]:
import os
import json
import re
import uuid
from mistralai import Mistral
from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv("MISTRAL_API_KEY",)
model = "open-mistral-nemo"

client = Mistral(api_key='3AoCcGo1dtInBpnJVPeKjNyTI2v49jQn')



def generate_completion(system_prompt= "" , user_prompt=""):
    chat_response = client.chat.complete(
        model= model,
        messages = [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": user_prompt,
            },
            
        ]
    )
    answer = chat_response.choices[0].message.content
    return answer

In [6]:
#1. Splitting the text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import CSVLoader

loader = CSVLoader(file_path=news_path,
                   content_columns="content",
                    csv_args={
                    'delimiter': ';',
                    'quotechar': '"',
                    
                })

In [7]:
documents = loader.load()
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
print(pages[18].page_content)

Number of chunks =  39
content: <p>Un avion de la compagnie Air India s’est écrasé mercredi peu après son décollage de l’aéroport d’Ahmedabad, dans l’ouest de l’Inde. L’accident a fait au moins 265 morts, dont 24 victimes au sol, et un seul survivant identifié à ce stade. Le Boeing 787-8 Dreamliner, à destination de Londres, transportait 276 personnes, dont une majorité d’Indiens, mais aussi des passagers britanniques, portugais et une Canadienne.</p>
<p>L’accident s’est produit à 13h38, heure locale, quelques secondes seulement après le décollage. L’avion a percuté un centre d’hébergement du personnel médical, causant également des pertes humaines au sol. Selon la police, le corps de 265 personnes a été récupéré, tandis qu’une cinquantaine de blessés, principalement des médecins et étudiants en médecine, ont été recensés. L’unique survivant est un ressortissant britannique, Vishwash Kumar Ramesh, miraculeusement épargné malgré la violence du crash.</p>
<p>Le vol AI 171 quittait Ahmeda

In [8]:
# helper functions to generate ontology

def extractConcepts(prompt: str, metadata={}):
    SYS_PROMPT = (
        "Your task is extract the key concepts (and non personal entities) mentioned in the given context. "
        "Extract only the most important and atomistic concepts, if  needed break the concepts down to the simpler concepts."
        "Categorize the concepts in one of the following categories: "
        "[event, concept, place, object, document, organisation, condition, misc]\n"
        "Format your output as a list of json with the following format:\n"
        "[\n"
        "   {\n"
        '       "entity": The Concept,\n'
        '       "importance": The concontextual importance of the concept on a scale of 1 to 5 (5 being the highest),\n'
        '       "category": The Type of Concept,\n'
        "   }, \n"
        "{ }, \n"
        "]\n"
    )
    response = generate_completion(system_prompt=SYS_PROMPT, user_prompt=prompt)
    try:
        response = re.sub(r"```json|```","", response).strip()
        result = json.loads(response)
        result = [dict(item, **metadata) for item in result]
    except Exception as e:
        print(e)
        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        result = None
    return result


def graphPrompt(input: str, metadata={}):

    SYS_PROMPT = (
        "You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
            "\tTerms may include object, entity, location, organization, person, \n"
            "\tcondition, acronym, documents, service, concept, etc.\n"
            "\tTerms should be as atomistic as possible\n\n"
        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
            "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
            "\tTerms can be related to many other terms\n\n"
        "Thought 3: Find out the relation between each such related pair of terms. \n\n"
        "Format your output as a list of json. Each element of the list contains a pair of terms"
        "and the relation between them, like the follwing: \n"
        "[\n"
        "   {\n"
        '       "node_1": "A concept from extracted ontology",\n'
        '       "node_2": "A related concept from extracted ontology",\n'
        '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
        "   }, {...}\n"
        "]"
    )

    USER_PROMPT = f"context: ```{input}``` \n\n output: "
    response = generate_completion(system_prompt=SYS_PROMPT, user_prompt=USER_PROMPT)
    try:
        response = re.sub("Based on the provided context, here's the extracted ontology with terms and their relations:","",response)
        response = re.sub(r"```json|```","", response).strip()
        result = json.loads(response)
        result = [dict(item, **metadata) for item in result]
    except Exception as e:
        print(e)
        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        result = None
    return result

## Helper functions and prompt to build the graph

inspired by [link](https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a/)

In [9]:
def documents2Dataframe(documents) -> pd.DataFrame:
    rows = []
    for chunk in documents:
        row = {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        rows = rows + [row]

    df = pd.DataFrame(rows)
    return df


def df2ConceptsList(dataframe: pd.DataFrame) -> list:
    # dataframe.reset_index(inplace=True)
    results = dataframe.progress_apply(
        lambda row: extractConcepts(
            row.text, {"chunk_id": row.chunk_id, "type": "concept"}
        ),
        axis=1,
    )
    # invalid json results in NaN
    results = results.dropna()
    results = results.reset_index(drop=True)

    ## Flatten the list of lists to one single list of entities.
    concept_list = np.concatenate(results).ravel().tolist()
    return concept_list


def concepts2Df(concepts_list) -> pd.DataFrame:
    ## Remove all NaN entities
    concepts_dataframe = pd.DataFrame(concepts_list).replace(" ", np.nan)
    concepts_dataframe = concepts_dataframe.dropna(subset=["entity"])
    concepts_dataframe["entity"] = concepts_dataframe["entity"].apply(
        lambda x: x.lower()
    )

    return concepts_dataframe


def df2Graph(dataframe: pd.DataFrame) -> list:
    # dataframe.reset_index(inplace=True)
    results = dataframe.apply(
        lambda row: graphPrompt(row.text, {"chunk_id": row.chunk_id}), axis=1
    )
    # invalid json results in NaN
    results = results.dropna()
    results = results.reset_index(drop=True)

    ## Flatten the list of lists to one single list of entities.
    concept_list = np.concatenate(results).ravel().tolist()
    return concept_list


def graph2Df(nodes_list) -> pd.DataFrame:
    ## Remove all NaN entities
    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
    graph_dataframe["node_1"] = graph_dataframe["node_1"].apply(lambda x: x.lower())
    graph_dataframe["node_2"] = graph_dataframe["node_2"].apply(lambda x: x.lower())

    return graph_dataframe

In [13]:
pages_df = documents2Dataframe(pages)

In [10]:
from pathlib import Path

## Input data directory
data_dir = "cureus"
inputdirectory = Path(f"/kaggle/working//data_input/{data_dir}")

out_dir = data_dir
outputdirectory = Path(f"/kaggle/working//data_output/{out_dir}")

In [11]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
regenerate = True

if regenerate:
    concepts_list = df2Graph(pages_df)
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    pages_df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

In [None]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

In [None]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

## Visualize the nodes and edge using network X

In [None]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

In [None]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

In [None]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

In [None]:
import seaborn as sns
import random

palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

In [None]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

## plotting the graph

In [None]:
!pip install -q pyvis

In [None]:
os.mkdir("/kaggle/working/docs/")

In [None]:
from pyvis.network import Network

graph_output_directory = "/kaggle/working/docs/index.html"

net = Network(
    notebook=False,
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    filter_menu=False,
)

net.from_nx(G)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
net.show_buttons(filter_=["physics"])

net.show(name=graph_output_directory, notebook=False)

In [None]:
from IPython.core.display import display, HTML

display(HTML(graph_output_directory))

## Network analysis

### Graph level metrics

In [None]:
# Number of nodes in the graph
print(f"Number of nodes : {nx.number_of_nodes(G)}")

In [None]:
# Global clustering coefficient 
global_clustering = nx.transitivity(G)
print("Global Clustering Coefficient:", global_clustering)

In [None]:
# Average path length
average_path_length = nx.average_shortest_path_length(G)
print("Average Path Length:", average_path_length)

### Node level metrics

In [None]:
# Compute degree centrality : the number of connection the nodes has
degree_centrality = nx.degree_centrality(G)

In [None]:
# Compute betweenness centrality
betweenness_centrality = nx.betweenness_centrality(G)

In [None]:
# Compute closeness centrality
closeness_centrality = nx.closeness_centrality(G)

In [None]:
# Compute page rank centrality
pr = nx.pagerank(G, alpha=0.85)

### Comparison between previous and new graph

In [None]:
pr_old = nx.pagerank(G_old, alpha=0.85)

In [None]:
old_top_nodes = sorted(pr_old.items(), key=lambda x: x[1], reverse=True)[:5]
top_nodes = sorted(pr.items(), key=lambda x: x[1], reverse=True)[:5]

In [None]:
#

# Classification

In [124]:
system_prompt = """You are a news classification assistant.

Your task is to classify each news article description into **only one** of the following categories:

- Politics
- Economy
- Entertainment
- Science/Tech
- Sports

If an article clearly doesn't belong to any of these, label it as: Faits divers

The input will be a numbered list of article descriptions.

Respond with a numbered list of corresponding category names, **in the same order**. Each line should be formatted exactly like this:
1. Economy
2. Politics
3. Faits divers
... etc.

Only return the numbered list of categories, no other text.
"""


In [132]:
def batch_generate_completion(texts, system_prompt):
    user_prompt = "Classify these news articles:\n" + "\n".join(
        f"{i+1}. {text.strip()}" for i, text in enumerate(texts)
    )

    chat_response = client.chat.complete(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )

    output = chat_response.choices[0].message.content.strip()
    lines = output.splitlines()

    # Clean & validate
    categories = []
    for line in lines:
        match = re.match(r"^\d+\.\s*(.+)", line)
        if match:
            categories.append(match.group(1).strip())
    return categories


In [143]:
from tqdm import tqdm

def classify_in_batches(df, batch_size=10):
    results = []
    for i in tqdm(range(0, len(df), batch_size)):
        batch = df.iloc[i:i+batch_size]
        texts = batch["description"].tolist()
        categories = batch_generate_completion(texts, system_prompt=system_prompt)
        results.extend(categories)
        
    return results

In [146]:
news_df['category']= classify_in_batches(news_df)

100%|██████████| 2/2 [00:00<00:00,  2.71it/s]


In [147]:
news_df

Unnamed: 0,id,title,link,author,description,pubDate,content,tags,category
0,https://www.africaintelligence.com/central-afr...,Central African Republic : The nebulous web of...,https://www.africaintelligence.com/central-afr...,,"When it comes to economic diplomacy, the Centr...","Fri, 13 Jun 2025 04:40:00 GMT",,[],Economy
1,https://www.africaintelligence.com/west-africa...,Guinea : Conakry sets foundation for big plans...,https://www.africaintelligence.com/west-africa...,,"The Guinean embassy in Washington, headed by S...","Fri, 13 Jun 2025 04:40:00 GMT",,[],Faits divers
2,https://www.africaintelligence.com/west-africa...,Niger : General Tchiani walks a tightrope to m...,https://www.africaintelligence.com/west-africa...,,As the security situation continues to deterio...,"Fri, 13 Jun 2025 04:40:00 GMT",,[],Politics
3,https://www.africaintelligence.com/north-afric...,Morocco : Energy firm Nareva on look out for n...,https://www.africaintelligence.com/north-afric...,,Nareva is back to square one as it seeks a new...,"Fri, 13 Jun 2025 04:40:00 GMT",,[],Economy
4,https://www.africaintelligence.com/north-afric...,Libya : Arkenu Oil and Bares Holding negotiate...,https://www.africaintelligence.com/north-afric...,,"The Arabian Gulf Oil Company, or AGOCO, is cur...","Fri, 13 Jun 2025 04:40:00 GMT",,[],Economy
5,https://www.africaintelligence.com/eastern-afr...,South Sudan : Riek Machar loyalists drum up su...,https://www.africaintelligence.com/eastern-afr...,,Riek Machar's allies are demanding his release...,"Fri, 13 Jun 2025 04:40:00 GMT",,[],Politics
6,https://www.africaintelligence.com/eastern-afr...,Somalia : AU appoints new head of mission in M...,https://www.africaintelligence.com/eastern-afr...,,The task promises to be particularly challengi...,"Fri, 13 Jun 2025 04:40:00 GMT",,[],Faits divers
7,https://www.africaintelligence.com/central-afr...,"Gabon : Oyima steps down at top of BVMAC, succ...",https://www.africaintelligence.com/central-afr...,,The new economy minister is preparing to resig...,"Fri, 13 Jun 2025 04:40:00 GMT",,[],Politics
8,https://www.africaintelligence.com/southern-af...,Namibia : Chinese-Botswanan Unik continues to ...,https://www.africaintelligence.com/southern-af...,,Chinese-Botswanan firm Unik Construction Engin...,"Fri, 13 Jun 2025 04:40:00 GMT",,[],Economy
9,https://africapresse.com/?p=30788,Kenya : la mort d’Albert Ojwang ravive la colè...,https://africapresse.com/kenya-la-mort-dalbert...,Patrick Babingwa,<p>Des centaines de manifestants ont défilé le...,"Fri, 13 Jun 2025 10:45:42 +0000",<p>Des centaines de manifestants ont défilé le...,"[""""Kenya"""",""""Politique"""",""""Albert Ojwang"""",""""k...",Faits divers


# Weekly Summary

In [None]:
summary_prompt = """You are an expert news journalist known for writing concise and insightful summaries that help readers quickly understand complex topics.

Summarize the following text into key points. Use clear, short bullet points — one per topic or event. Avoid repetition and keep each point focused.

Text:
{text}

Summary:
"""

In [None]:
text = "/n".join(news_df['content'])

In [None]:
summary = generate_content(system_prompt="",user_prompt= "summary_prompt")
print(summary)