### Import

In [1]:
import chromadb
import re
import pandas as pd
from chromadb.utils import embedding_functions
from html import unescape

### Chargement CSV

In [2]:
fake_df = pd.read_csv('../data/Fake.csv')
true_df = pd.read_csv('../data/True.csv')

In [3]:
fake_df['label'] = 'fake'
true_df['label'] = 'true'

In [4]:
clean_fake_df = fake_df.copy()
clean_true_df = true_df.copy()

In [5]:
clean_fake_df = clean_fake_df.drop_duplicates('text')
clean_true_df = clean_true_df.drop_duplicates('text')

In [6]:
def remove_if_brackets(text):
    if text.strip().startswith('[') and text.strip().endswith(']'):
        return ''
    return text


clean_fake_df['text'] = clean_fake_df['text'].apply(lambda x: remove_if_brackets(x))
clean_fake_df['title'] = clean_fake_df['title'].apply(lambda x: remove_if_brackets(x))

clean_true_df['text'] = clean_true_df['text'].apply(lambda x: remove_if_brackets(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: remove_if_brackets(x))

In [7]:
def normalize_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()


clean_fake_df['text'] = clean_fake_df['text'].apply(lambda x: normalize_spaces(x))
clean_fake_df['title'] = clean_fake_df['title'].apply(lambda x: normalize_spaces(x))

clean_true_df['text'] = clean_true_df['text'].apply(lambda x: normalize_spaces(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: normalize_spaces(x))

In [8]:
def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


clean_fake_df['text'] = clean_fake_df['text'].apply(
    lambda x: remove_special_characters(x)
)
clean_fake_df['title'] = clean_fake_df['title'].apply(
    lambda x: remove_special_characters(x)
)

clean_true_df['text'] = clean_true_df['text'].apply(
    lambda x: remove_special_characters(x)
)
clean_true_df['title'] = clean_true_df['title'].apply(
    lambda x: remove_special_characters(x)
)

In [9]:
def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)


clean_fake_df['text'] = clean_fake_df['text'].apply(lambda x: remove_urls(x))
clean_fake_df['title'] = clean_fake_df['title'].apply(lambda x: remove_urls(x))

clean_true_df['text'] = clean_true_df['text'].apply(lambda x: remove_urls(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: remove_urls(x))

In [10]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)


clean_fake_df['text'] = clean_fake_df['text'].apply(lambda x: remove_html_tags(x))
clean_fake_df['title'] = clean_fake_df['title'].apply(lambda x: remove_html_tags(x))

clean_true_df['text'] = clean_true_df['text'].apply(lambda x: remove_html_tags(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: remove_html_tags(x))

In [11]:
clean_true_df['text'] = clean_true_df['text'].apply(lambda x: unescape(x))
clean_true_df['title'] = clean_true_df['title'].apply(lambda x: unescape(x))

In [12]:
clean_fake_df = clean_fake_df[clean_fake_df['text'].str.strip().astype(bool)]
clean_fake_df = clean_fake_df[clean_fake_df['title'].str.strip().astype(bool)]
clean_fake_df[clean_fake_df['title'].str.isspace()]

Unnamed: 0,title,text,subject,date,label


In [13]:
clean_true_df = clean_true_df[clean_true_df['text'].str.strip().astype(bool)]
clean_true_df = clean_true_df[clean_true_df['title'].str.strip().astype(bool)]
clean_true_df[clean_true_df['title'].str.isspace()]

Unnamed: 0,title,text,subject,date,label


In [14]:
clean_fake_df['date'] = pd.to_datetime(
    clean_fake_df['date'], errors='coerce', format='mixed'
)
clean_true_df['date'] = pd.to_datetime(
    clean_true_df['date'], errors='coerce', format='mixed'
)

In [15]:
clean_fake_df['text'] = clean_fake_df['text'].str.lower()
clean_fake_df['title'] = clean_fake_df['title'].str.lower()
clean_fake_df['subject'] = clean_fake_df['subject'].str.lower()

clean_true_df['text'] = clean_true_df['text'].str.lower()
clean_true_df['title'] = clean_true_df['title'].str.lower()
clean_true_df['subject'] = clean_true_df['subject'].str.lower()

In [16]:
news_df = pd.concat([clean_fake_df, clean_true_df], ignore_index=True)

### Chunks

In [17]:
def chunk_text(text, chunk_size=200, overlap=10):
    # text = text.split()
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - overlap)]

news_df["chunks"] = news_df["text"].apply(chunk_text)

In [18]:
display(news_df['chunks'].head(2))

0    [donald trump just couldn t wish all americans...
1    [house intelligence committee chairman devin n...
Name: chunks, dtype: object

### Création du client Chroma

In [19]:
chroma_client = chromadb.Client()

### Création des embeddings

In [20]:
ollama_embed = embedding_functions.OllamaEmbeddingFunction(model_name='all-minilm:latest')

In [21]:
sample_text = ["Hello world!"]
emb = ollama_embed(sample_text)
print(emb)

[array([-2.03935262e-02,  2.51360629e-02, -6.81411009e-04,  1.15855830e-02,
       -3.80322784e-02, -1.20117873e-01,  4.16751094e-02, -2.09032428e-02,
       -5.89546897e-02,  2.42159050e-02,  6.22026622e-02,  6.76669106e-02,
        3.30636092e-02, -1.04195131e-02, -3.10804658e-02, -3.27129439e-02,
       -2.06209952e-03,  9.21510160e-03, -1.24933176e-01,  1.11401910e-02,
        3.90760191e-02,  5.43305725e-02, -2.86404416e-03,  4.45343144e-02,
       -8.54468793e-02, -2.28452161e-02,  3.90663520e-02,  3.60194519e-02,
       -3.20656486e-02, -6.41789064e-02,  5.80813587e-02,  4.67638299e-02,
        8.06346238e-02, -7.68541545e-03, -2.21431032e-02,  6.72523901e-02,
       -4.50143814e-02, -1.02082230e-01,  1.31178834e-03,  4.69050147e-02,
        2.64180005e-02, -6.98286816e-02, -4.44863960e-02, -6.83199195e-03,
        1.93007812e-02,  2.07665656e-02,  6.65277336e-03,  3.54986228e-02,
        1.03962682e-01,  1.75311267e-02, -4.29008566e-02, -5.70672974e-02,
       -1.14082163e-02, 

### Suppression collection (optionnel)

In [22]:
# chroma_client.delete_collection("fake_news")
# chroma_client.delete_collection("true_news")

### Création de collections

In [23]:
collection_news = chroma_client.get_or_create_collection(
    name='news_articles',
    embedding_function=ollama_embed
)

### Ajout du contenu aux collections

In [24]:
def add_chunks_to_collection(df, collection, prefix, batch_size=20):
    ids, documents, metadatas, embeddings = [], [], [], []

    for idx, row in df.iterrows():
        for i, chunk in enumerate(row['chunks']):
            ids.append(f'{prefix}_{idx}_{i}')
            documents.append(chunk)
            metadatas.append({
                'title': row['title'],
                'subject': row['subject'],
                'date': str(row['date']),
                'label': row['label'],
                'chunk_index': i
            })
            # calculer l'embedding pour ce chunk
            embeddings.append(ollama_embed([chunk])[0])

            if len(documents) >= batch_size:
                collection.add(
                    ids=ids,
                    documents=documents,
                    metadatas=metadatas,
                    embeddings=embeddings
                )
                ids, documents, metadatas, embeddings = [], [], [], []

    if len(documents) > 0:
        collection.add(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings
        )


# Envoi de 100 lignes seulement pour test rapide
add_chunks_to_collection(news_df.head(100), collection_news, prefix="news")

In [25]:
print("Fake collection size:", collection_news.count())

print(collection_news.get(ids=["news_0_0"]))

Fake collection size: 1333
{'ids': ['news_0_0'], 'embeddings': None, 'documents': ['donald trump just couldn t wish all americans a happy new year and leave it at that instead he had to give a shout out to his enemies haters and the very dishonest fake news media the former reality s'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'date': '2017-12-31 00:00:00', 'title': 'donald trump sends out embarrassing new years eve message this is disturbing', 'label': 'fake', 'subject': 'news', 'chunk_index': 0}]}


In [26]:
query_result = collection_news.query(
    query_texts=["donald trump photo media"],
    n_results=5,
    include=["documents", "metadatas", "distances"]
)

print(query_result)

{'ids': [['news_55_7', 'news_17_10', 'news_40_13', 'news_67_9', 'news_62_10']], 'embeddings': None, 'documents': [['n donald trump was the presidentelect he wanted the media to stop publishing unflattering photos of him so naturally internet users circulated them quickly trump hates photos displaying his many chins', 'r trumpphoto by ann heisenfeltgetty images', 't donald trump is fake newsphoto by tom penningtongetty images', 'hat is donald trump knows nothing aboutfeatured image via andrew burtongetty images', 'cial white house photographer and an ardent trump troll took to instagram to taunt president trump souza shared a compilation of images consisting of 15 time covers featuring the obamas originally pos']], 'uris': None, 'included': ['documents', 'metadatas', 'distances'], 'data': None, 'metadatas': [[{'date': '2017-11-28 00:00:00', 'label': 'fake', 'title': 'sean hannity is throwing a stage4 temper tantrum over the photo he posed for image', 'subject': 'news', 'chunk_index': 7}