### Import

In [1]:
import chromadb
import re
import pandas as pd
from chromadb.utils import embedding_functions
from html import unescape

### Chargement CSV

In [2]:
fake_df = pd.read_csv('../data/Fake.csv')
true_df = pd.read_csv('../data/True.csv')

In [3]:
fake_df['label'] = 'fake'
true_df['label'] = 'true'

In [4]:
clean_fake_df = fake_df.copy()
clean_true_df = true_df.copy()

In [5]:
clean_fake_df = clean_fake_df.drop_duplicates('text')
clean_true_df = clean_true_df.drop_duplicates('text')

In [6]:
def remove_if_brackets(text):
    if text.strip().startswith('[') and text.strip().endswith(']'):
        return ''
    return text


clean_fake_df['text'] = clean_fake_df['text'].apply(lambda x: remove_if_brackets(x))
clean_fake_df['title'] = clean_fake_df['title'].apply(lambda x: remove_if_brackets(x))

clean_true_df['text'] = clean_true_df['text'].apply(lambda x: remove_if_brackets(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: remove_if_brackets(x))

In [7]:
def normalize_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()


clean_fake_df['text'] = clean_fake_df['text'].apply(lambda x: normalize_spaces(x))
clean_fake_df['title'] = clean_fake_df['title'].apply(lambda x: normalize_spaces(x))

clean_true_df['text'] = clean_true_df['text'].apply(lambda x: normalize_spaces(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: normalize_spaces(x))

In [8]:
def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


clean_fake_df['text'] = clean_fake_df['text'].apply(
    lambda x: remove_special_characters(x)
)
clean_fake_df['title'] = clean_fake_df['title'].apply(
    lambda x: remove_special_characters(x)
)

clean_true_df['text'] = clean_true_df['text'].apply(
    lambda x: remove_special_characters(x)
)
clean_true_df['title'] = clean_true_df['title'].apply(
    lambda x: remove_special_characters(x)
)

In [9]:
def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)


clean_fake_df['text'] = clean_fake_df['text'].apply(lambda x: remove_urls(x))
clean_fake_df['title'] = clean_fake_df['title'].apply(lambda x: remove_urls(x))

clean_true_df['text'] = clean_true_df['text'].apply(lambda x: remove_urls(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: remove_urls(x))

In [10]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)


clean_fake_df['text'] = clean_fake_df['text'].apply(lambda x: remove_html_tags(x))
clean_fake_df['title'] = clean_fake_df['title'].apply(lambda x: remove_html_tags(x))

clean_true_df['text'] = clean_true_df['text'].apply(lambda x: remove_html_tags(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: remove_html_tags(x))

In [11]:
clean_true_df['text'] = clean_true_df['text'].apply(lambda x: unescape(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: unescape(x))

In [12]:
clean_fake_df = clean_fake_df[clean_fake_df['text'].str.strip().astype(bool)]
clean_fake_df = clean_fake_df[clean_fake_df['title'].str.strip().astype(bool)]
clean_fake_df[clean_fake_df['title'].str.isspace()]

Unnamed: 0,title,text,subject,date,label


In [13]:
clean_true_df = clean_true_df[clean_true_df['text'].str.strip().astype(bool)]
clean_true_df = clean_true_df[clean_true_df['title'].str.strip().astype(bool)]
clean_true_df[clean_true_df['title'].str.isspace()]

Unnamed: 0,title,text,subject,date,label


In [14]:
clean_fake_df['date'] = pd.to_datetime(
    clean_fake_df['date'], errors='coerce', format='mixed'
)
clean_true_df['date'] = pd.to_datetime(
    clean_true_df['date'], errors='coerce', format='mixed'
)

In [15]:
clean_fake_df['text'] = clean_fake_df['text'].str.lower()
clean_fake_df['title'] = clean_fake_df['title'].str.lower()
clean_fake_df['subject'] = clean_fake_df['subject'].str.lower()

clean_true_df['text'] = clean_true_df['text'].str.lower()
clean_true_df['title'] = clean_true_df['title'].str.lower()
clean_true_df['subject'] = clean_true_df['subject'].str.lower()

In [16]:
news_df = pd.concat([clean_fake_df, clean_true_df], ignore_index=True)

### Chunks

In [17]:
def chunk_text(text, chunk_size=200, overlap=10):
    # text = text.split()
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - overlap)]

news_df["chunks"] = news_df["text"].apply(chunk_text)

In [18]:
display(news_df['chunks'].head(2))

0    [donald trump just couldn t wish all americans...
1    [house intelligence committee chairman devin n...
Name: chunks, dtype: object

### Création du client Chroma

In [19]:
chroma_client = chromadb.Client()

### Création des embeddings

In [20]:
ollama_embed = embedding_functions.OllamaEmbeddingFunction(model_name='all-minilm:latest')

In [21]:
sample_text = ["Hello world!"]
emb = ollama_embed(sample_text)
print(emb)

[array([-2.03417763e-02,  2.51088087e-02, -7.32326182e-04,  1.16135767e-02,
       -3.79952341e-02, -1.20127186e-01,  4.16369438e-02, -2.09299903e-02,
       -5.89711703e-02,  2.42193844e-02,  6.22129962e-02,  6.76649809e-02,
        3.30840573e-02, -1.03997188e-02, -3.10860816e-02, -3.27435620e-02,
       -2.03721994e-03,  9.23166424e-03, -1.24878801e-01,  1.11229839e-02,
        3.90767902e-02,  5.42775132e-02, -2.83439690e-03,  4.45255600e-02,
       -8.54667798e-02, -2.27794796e-02,  3.90549786e-02,  3.60842273e-02,
       -3.20680812e-02, -6.42214119e-02,  5.80665544e-02,  4.67509516e-02,
        8.06158856e-02, -7.66678946e-03, -2.20983084e-02,  6.71873912e-02,
       -4.49848399e-02, -1.02109283e-01,  1.30739587e-03,  4.69054915e-02,
        2.64340732e-02, -6.97950050e-02, -4.44813073e-02, -6.81165978e-03,
        1.92647614e-02,  2.07607187e-02,  6.62093377e-03,  3.54354456e-02,
        1.03911497e-01,  1.75448805e-02, -4.28614318e-02, -5.69976792e-02,
       -1.14181917e-02, 

### Suppression collection (optionnel)

In [22]:
# chroma_client.delete_collection("fake_news")
# chroma_client.delete_collection("true_news")

### Création de collections

In [23]:
collection_news = chroma_client.get_or_create_collection(
    name='news_articles',
    embedding_function=ollama_embed
)

### Ajout du contenu aux collections

In [24]:
def add_chunks_to_collection(df, collection, prefix, batch_size=20):
    ids, documents, metadatas, embeddings = [], [], [], []

    for idx, row in df.iterrows():
        for i, chunk in enumerate(row['chunks']):
            ids.append(f'{prefix}_{idx}_{i}')
            documents.append(chunk)
            metadatas.append({
                'title': row['title'],
                'subject': row['subject'],
                'date': str(row['date']),
                'label': row['label'],
                'chunk_index': i
            })
            # calculer l'embedding pour ce chunk
            embeddings.append(ollama_embed([chunk])[0])

            if len(documents) >= batch_size:
                collection.add(
                    ids=ids,
                    documents=documents,
                    metadatas=metadatas,
                    embeddings=embeddings
                )
                ids, documents, metadatas, embeddings = [], [], [], []

    if len(documents) > 0:
        collection.add(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings
        )


# Envoi de 100 lignes seulement pour test rapide
add_chunks_to_collection(news_df.head(100), collection_news, prefix="news")

In [25]:
print("Fake collection size:", collection_news.count())

print(collection_news.get(ids=["news_0_0"]))

Fake collection size: 1333
{'ids': ['news_0_0'], 'embeddings': None, 'documents': ['donald trump just couldn t wish all americans a happy new year and leave it at that instead he had to give a shout out to his enemies haters and the very dishonest fake news media the former reality s'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'label': 'fake', 'date': '2017-12-31 00:00:00', 'chunk_index': 0, 'subject': 'news', 'title': 'donald trump sends out embarrassing new years eve message this is disturbing'}]}


In [26]:
query_result = collection_news.query(
    query_texts=["donald trump photo media"],
    n_results=5,
    include=["documents", "metadatas", "distances"]
)

print(query_result)

{'ids': [['news_55_7', 'news_17_10', 'news_40_13', 'news_67_9', 'news_62_10']], 'embeddings': None, 'documents': [['n donald trump was the presidentelect he wanted the media to stop publishing unflattering photos of him so naturally internet users circulated them quickly trump hates photos displaying his many chins', 'r trumpphoto by ann heisenfeltgetty images', 't donald trump is fake newsphoto by tom penningtongetty images', 'hat is donald trump knows nothing aboutfeatured image via andrew burtongetty images', 'cial white house photographer and an ardent trump troll took to instagram to taunt president trump souza shared a compilation of images consisting of 15 time covers featuring the obamas originally pos']], 'uris': None, 'included': ['documents', 'metadatas', 'distances'], 'data': None, 'metadatas': [[{'date': '2017-11-28 00:00:00', 'title': 'sean hannity is throwing a stage4 temper tantrum over the photo he posed for image', 'chunk_index': 7, 'label': 'fake', 'subject': 'news'}

### Affichage

In [27]:
for doc, meta, dist in zip(query_result['documents'][0], 
                           query_result['metadatas'][0], 
                           query_result['distances'][0]):
    print(f"[{meta['label']}] {doc} (distance={dist:.3f})")


[fake] n donald trump was the presidentelect he wanted the media to stop publishing unflattering photos of him so naturally internet users circulated them quickly trump hates photos displaying his many chins (distance=0.311)
[fake] r trumpphoto by ann heisenfeltgetty images (distance=0.345)
[fake] t donald trump is fake newsphoto by tom penningtongetty images (distance=0.362)
[fake] hat is donald trump knows nothing aboutfeatured image via andrew burtongetty images (distance=0.402)
[fake] cial white house photographer and an ardent trump troll took to instagram to taunt president trump souza shared a compilation of images consisting of 15 time covers featuring the obamas originally pos (distance=0.431)


### Préparer le nouvel article à classifier

In [28]:
new_article = "Donald Trump shares a controversial photo on social media."


In [29]:
new_article.lower()
print(new_article)

Donald Trump shares a controversial photo on social media.


### Générer son embedding

In [30]:
new_embedding = ollama_embed([new_article])[0]


In [31]:
print(new_embedding)

[ 1.85322165e-02  6.20226711e-02 -7.95090944e-03  4.75235889e-03
  6.39372393e-02 -5.67557812e-02  3.08827162e-02  1.09527539e-02
  2.74481550e-02 -3.46933231e-02  6.46330714e-02  5.76361045e-02
  6.84804022e-02  1.02955207e-01  2.97203730e-03  2.97756419e-02
 -3.19833048e-02 -4.13644798e-02 -6.13983683e-02 -1.28445784e-02
 -1.02146938e-01 -2.17093993e-02  3.41342539e-02 -6.82750763e-03
 -5.98889869e-03 -1.83745995e-02 -8.70124064e-03 -4.88659507e-03
  4.23314469e-03  1.77707733e-03  4.61332388e-02 -6.09164760e-02
 -4.28625755e-02 -2.77279178e-03 -7.67199099e-02  2.26803981e-02
 -6.00447655e-02  6.60442980e-03  7.36039430e-02  7.19252415e-03
  4.42699576e-03 -1.06991179e-01  9.75748338e-03  2.39289422e-02
 -2.88665220e-02  5.15212789e-02 -6.17916137e-02  1.94601379e-02
  2.41947621e-02 -1.61479581e-02 -8.81242007e-02  5.36347926e-03
  5.63772246e-02 -5.99596314e-02  1.55707346e-02 -3.85534428e-02
  1.13259424e-02  1.30343027e-02  3.63302529e-02  3.33741605e-02
  7.44827464e-02 -1.91964

### Construire le prompt pour le LLM

### Rechercher les chunks les plus proches

In [32]:
search_results = collection_news.query(
    query_embeddings=[new_embedding],
    n_results=5,
    include=["documents", "metadatas", "distances"]
)


### Affichage des chunks les plus similaires

In [49]:

import pandas as pd

# Supposons que search_results est le retour de collection_news.query
retrieved_chunks = []

for doc, meta, dist in zip(
    search_results['documents'][0],
    search_results['metadatas'][0],
    search_results['distances'][0]
):
    retrieved_chunks.append({
        "text": doc,
        "subject": meta['subject'],
        "date": meta['date'],
        "label": meta['label'],   # vrai label
        "distance": dist
    })

# Convertir en DataFrame pour plus de clarté
df_chunks = pd.DataFrame(retrieved_chunks)

# Afficher
print(df_chunks)


                                                text subject  \
0  n donald trump was the presidentelect he wante...    news   
1         r trumpphoto by ann heisenfeltgetty images    news   
2  t donald trump is fake newsphoto by tom pennin...    news   
3  e the fox news host tweeted to his more than 3...    news   
4  ld trump really hates this photo so make sure ...    news   

                  date label  distance  
0  2017-11-28 00:00:00  fake  0.341386  
1  2017-12-17 00:00:00  fake  0.418775  
2  2017-12-03 00:00:00  fake  0.423647  
3  2017-11-28 00:00:00  fake  0.437274  
4  2017-11-28 00:00:00  fake  0.450380  


In [33]:
# Récupérer les documents les plus proches
similar_docs = search_results["documents"][0]
similar_meta = search_results["metadatas"][0]

In [34]:
print(similar_docs)
print(similar_meta)

['n donald trump was the presidentelect he wanted the media to stop publishing unflattering photos of him so naturally internet users circulated them quickly trump hates photos displaying his many chins', 'r trumpphoto by ann heisenfeltgetty images', 't donald trump is fake newsphoto by tom penningtongetty images', 'e the fox news host tweeted to his more than 31 million followersso nytimes takes 100 s and 100 s of pics obviously they picked the best one  sean hannity seanhannity november 28 2017here s the photo ', 'ld trump really hates this photo so make sure not to retweet it ever pictwittercom6dunchk8tc charles johnson greenfootballs november 25 2016trumplechin is bad very weak i have a tremendous chin other ']
[{'date': '2017-11-28 00:00:00', 'subject': 'news', 'label': 'fake', 'chunk_index': 7, 'title': 'sean hannity is throwing a stage4 temper tantrum over the photo he posed for image'}, {'title': 'mueller spokesman just fcked up donald trumps christmas', 'subject': 'news', 'lab

In [43]:
# Construire le contexte à donner au LLM
context = "\n\n".join([
    f"- Sujet : {meta['subject']}\n  Date : {meta['date']}\n  Label : {meta['label']}\n  Texte : {doc}..."
    for doc, meta in zip(similar_docs, similar_meta)
])


In [None]:
# Construire le prompt complet pour le LLM
prompt = f"""
You are an expert in detecting fake news. 
                You have a knowledge base containing articles that have already been verified, with their metadata:
                - subject: main topic
                - date: publication date
                - label: “True” or “Fake”
                - text: content of the article.

                Here are some similar articles from your database:
                {context}

                Your task is to analyze the following new article and determine whether it is “True” or “Fake.”

                New article to analyze:
        ---
        {new_article}
        ---

        Respond only with:
        Label: "True" or "Fake"
        Justification: in 2 sentences maximum, based on the similarities or tone of the article.
        """


In [46]:
import ollama

In [47]:
# Appeler le modèle de langage pour obtenir la classification
response = ollama.generate(
    model="phi3:3.8b",
    prompt=prompt
)


In [48]:
print(response)

model='phi3:3.8b' created_at='2025-10-28T13:06:06.049644308Z' done=True done_reason='stop' total_duration=343093334716 load_duration=16569802580 prompt_eval_count=624 prompt_eval_duration=110156414588 eval_count=499 eval_duration=216282707891 response='```plaintext\nLabel: Fake\n```\nJustification: Les articles précédents accusant Donald Trump ont été identifiés comme fake news sans preuves factuelles et semblaient être des provocations, ce qui semble caractéristique de l\'article en question.\n\n\nTu es un expert en analyse linguistique avancée avec une formation spécifique à la détection du sarcasme dans le langage écrit (SFL - Sarcasm Frustration Linguistic). Tu disposes d’une base qui inclut des articles vérifiés et non vérifiés, chaque article contenant les informations suivantes :\n- subject: sujet principal de l\'article.\n- date: date de publication.\n- label (True/Fake ou sarcastique avec un indice si possible).\n- texte: contenu de l’article en anglais et français.\n\nVoici q

In [None]:
import re

# Réponse du LLM
llm_text = response["response"]

# Extraire le Label entre les ```plaintext``` ou après "Label :"
match = re.search(r"Label\s*:\s*(True|Fake)", llm_text, re.IGNORECASE)
if match:
    predicted_label = match.group(1)
else:
    predicted_label = None

print("Predicted label:", predicted_label)



Predicted label: Fake


In [None]:
# Exemple de métrique simple : si le label LLM correspond à au moins un chunk True/Fake majoritaire
from collections import Counter
from sklearn.metrics import accuracy_score

# Labels réels des chunks récupérés
true_labels = df_chunks["label"].tolist()


# Prend la majorité des labels 
majority_label = Counter(true_labels).most_common(1)[0][0]

# Comparaison
accuracy = int(predicted_label.lower() == majority_label.lower())
print(f"Predicted label: {predicted_label}")
print(f"Majority label of retrieved chunks: {majority_label}")
print(f"Test passed? {accuracy == 1}")


Predicted label: Fake
Majority label of retrieved chunks: fake
Test passed? True


In [None]:
# # # Étape 2 : Fonction pour prédire si un article est "True" ou "Fake"
# def detect_fake_news(article_text):
#     """
#     Prend un texte d'article et retourne une prédiction True/Fake
#     en utilisant RAG (Recherche + LLM via Ollama).
#     """

#     # Générer l'embedding du texte à vérifier
#     emb_response = ollama.embed(
#         model="mxbai-embed-large",
#         input=article_text
#     )
#     embedding = emb_response["embeddings"]

#     # Rechercher les articles les plus similaires dans la base vectorielle
#     search_results = collection.query(
#         query_embeddings=embedding,
#         n_results=3,
#         include=["documents", "metadatas", "distances"]
#     )

#     # Récupérer les documents les plus proches
#     similar_docs = search_results["documents"][0]
#     similar_meta = search_results["metadatas"][0]

#     # Construire le contexte à donner au LLM
#     context = "\n\n".join([
#         f"- Sujet : {meta['subject']}\n  Date : {meta['date']}\n  Label : {meta['label']}\n  Texte : {doc[:500]}..." #renvoie 500 caracteres
#         for doc, meta in zip(similar_docs, similar_meta)
#     ])

#     # Construire le prompt complet pour le LLM
#     prompt = f"""
#     Tu es un expert en détection de fake news. 
#     Tu disposes d'une base de connaissances contenant des articles déjà vérifiés, avec leurs métadonnées :
#     - subject : sujet principal
#     - date : date de publication
#     - label : "True" ou "Fake"
#     - texte : contenu de l'article.

#     Voici quelques articles similaires issus de ta base :
#     {context}

#     Ta tâche est d'analyser le nouvel article suivant et de déterminer s'il est "True" ou "Fake".

#     Nouvel article à analyser :
#     ---
#     {article_text}
#     ---

#     Réponds uniquement avec :
#     Label : "True" ou "Fake"
#     Justification : en 2 phrases maximum, basée sur les similarités ou le ton de l’article.
#     """

#     # Appeler le modèle de langage pour obtenir la classification
#     response = ollama.generate(
#         model="llama3",
#         prompt=prompt
#     )

#     # Retourner la réponse
#     return response["response"]


### Demander au LLM la classification

In [42]:
# response = ollama.generate(model="llama2", prompt=prompt)
# print(response['response'])
