#  RAG on news data

In [19]:
import asyncio
from tg_fetcher import TelegramFetcher
import pandas as pd

import ollama
import json
import logging

In [6]:

data = await TelegramFetcher.fetch_once("terroralarm", 9000)

[@emilstrider] Waiting for 9 seconds before continuing (required by "messages.GetHistory")
[@emilstrider] Waiting for 7 seconds before continuing (required by "messages.GetHistory")


In [7]:
df = pd.DataFrame(data)

In [11]:
df["text"] = df["text"].astype(str).str.strip()

# 2. Drop rows where text is empty after stripping
df = df[df["text"] != ""]
df.head()

Unnamed: 0,id,date,views,forwards,text
0,25148,2025-08-21 02:00:28,3770.0,5.0,✅🇮🇹 Italy's Meloni backs Ukraine security plan...
1,25147,2025-08-20 23:53:48,4525.0,4.0,✅🇺🇬🇺🇸 Uganda denies CBS News report that it st...
3,25145,2025-08-20 22:53:27,4996.0,8.0,✅🇮🇱 IDF eliminated several terrorists during c...
4,25144,2025-08-20 22:43:19,5122.0,9.0,✅🇮🇱 Israel never needs Americans to die for it...
5,25143,2025-08-20 19:59:36,5812.0,13.0,✅Chinese Navy is heavily deployed in the Taiwa...


In [12]:
terrorism_keywords = [
    "terror", "terrorist", "terrorism",
    "attack", "attacked", "attacking",
    "bomb", "bombing", "explosion", "explosive",
    "militant", "extremist", "jihad", "suicide",
    "hostage", "assassination"
]

# Step 2: Create regex pattern (case insensitive)
pattern = r"\b(" + "|".join(terrorism_keywords) + r")\b"

In [13]:
df_filtered = df[df["text"].str.contains(pattern, case=False, na=False)]

  df_filtered = df[df["text"].str.contains(pattern, case=False, na=False)]


In [14]:
df_filtered

Unnamed: 0,id,date,views,forwards,text
23,25125,2025-08-19 11:35:33,7650.0,18.0,✅🇮🇱🇮🇷The impact of Israel as the region's supe...
35,25113,2025-08-18 12:26:20,8933.0,41.0,✅🇷🇺🇺🇸 Breaking: Russian troops raise American ...
70,25078,2025-08-16 01:00:08,8535.0,26.0,✅ Is China preparing to attack Taiwan tonight!...
84,25064,2025-08-15 16:28:01,8078.0,9.0,✅🇺🇸🇷🇺 Kremlin: Trump to meet Putin on his plan...
92,25056,2025-08-15 01:46:22,8571.0,15.0,✅🇺🇦🇷🇺 Several hundred Ukrainian drones are att...
...,...,...,...,...,...
8983,16075,2024-01-18 18:39:48,1906.0,6.0,🚨 DEAD OR ALIVE:\nTerror Alarm is officially p...
8990,16068,2024-01-18 06:53:15,1932.0,13.0,🚨🇵🇰 Pakistan retaliates Iran's attack.\n\nSign...
8994,16064,2024-01-17 23:55:36,2032.0,23.0,Confirmed list of LGBTQ+ flags and symbols. Up...
8998,16060,2024-01-17 21:40:45,1876.0,3.0,🚨🇵🇱 #Poland's parliament has voted to remove t...


In [46]:
OLLAMA_MODEL = 'llama2'

# --- Core Extraction Function ---

def extract_entities_from_text(text: str) -> dict:
    """
    Uses a local Ollama LLM to extract specific entities from a given text.

    The function prompts the model to identify an aggressor, victim, place, and
    number of casualties from the text and return them in a structured JSON format.

    Args:
        text (str): The raw input text string to be processed.

    Returns:
        dict: A dictionary containing the extracted entities. If an entity is not
              found, its value will be None. Returns an empty dictionary if the
              extraction fails or the text is irrelevant.
    """
    if not isinstance(text, str) or not text.strip():
        logging.warning("Input text is empty or not a string. Returning empty dictionary.")
        return {}

    # This is the system prompt. It sets the context and instructions for the LLM.
    # It's a key part of the RAG process, guiding the "generation" part.
    # We explicitly ask for JSON output and define the structure.
    system_prompt = """
    You are an expert data analyst. Your task is to carefully read the provided text
    and extract specific pieces of information: the aggressor, the victim, the location
    of the event, and the number of casualties.

    Your response MUST be ONLY a single, valid JSON object with the following keys:
    "aggressor", "victim", "place", "casualties".

    - If a piece of information cannot be found in the text, the value for that key should be null.
    - For "casualties", provide an integer if possible, otherwise a descriptive string.
    - Do not include any explanations, apologies, or any text outside of the JSON object.
    """

    try:
        # Call the Ollama chat API
        response = ollama.chat(
            model=OLLAMA_MODEL,
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': text}
            ],
            # This parameter is crucial for ensuring the output is a parsable JSON string.
            format='json'
        )

        # The response content is a JSON string, so we parse it into a Python dictionary.
        extracted_json_string = response['message']['content']
        extracted_data = json.loads(extracted_json_string)

        # Basic validation to ensure the returned JSON has the keys we expect.
        expected_keys = ["aggressor", "victim", "place", "casualties"]
        if not all(key in extracted_data for key in expected_keys):
            logging.warning(f"LLM output missing expected keys for text: '{text[:50]}...'")
            return {}

        return extracted_data

    except json.JSONDecodeError:
        logging.error(f"Failed to decode JSON from LLM response for text: '{text[:50]}...'")
        return {}
    except Exception as e:
        logging.error(f"An unexpected error occurred while processing text '{text[:50]}...': {e}")
        return {}

# --- Main DataFrame Processing Function ---

def add_rag_json_column(df: pd.DataFrame, text_column: str, new_column_name: str = 'extracted_json') -> pd.DataFrame:
    """
    Applies the RAG-based entity extraction to a DataFrame column.

    This function iterates through each row of the specified text column, calls
    the LLM to extract information, and stores the resulting JSON (as a dictionary)
    in a new column.

    Args:
        df (pd.DataFrame): The input DataFrame.
        text_column (str): The name of the column containing the raw text data.
        new_column_name (str): The name for the new column that will store the
                               extracted JSON data.

    Returns:
        pd.DataFrame: The DataFrame with the new column added.
    """
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in the DataFrame.")

    logging.info(f"Starting RAG extraction on column '{text_column}' using model '{OLLAMA_MODEL}'.")

    # The .apply() method is a powerful pandas feature that applies a function
    # along an axis of the DataFrame. Here, we apply our extraction function
    # to each element in the 'text_column'.
    df[new_column_name] = df[text_column].apply(extract_entities_from_text)

    logging.info("RAG extraction complete.")
    return df

In [47]:
%%time
result_df = add_rag_json_column(df_filtered.head(20), 'text')

CPU times: user 48.5 ms, sys: 1.61 ms, total: 50.1 ms
Wall time: 3min 58s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_column_name] = df[text_column].apply(extract_entities_from_text)


In [48]:
result_df

Unnamed: 0,id,date,views,forwards,text,extracted_json
23,25125,2025-08-19 11:35:33,7650.0,18.0,✅🇮🇱🇮🇷The impact of Israel as the region's supe...,"{'aggressor': None, 'victim': None, 'place': '..."
35,25113,2025-08-18 12:26:20,8933.0,41.0,✅🇷🇺🇺🇸 Breaking: Russian troops raise American ...,"{'aggressor': 'Russian troops', 'victim': None..."
70,25078,2025-08-16 01:00:08,8535.0,26.0,✅ Is China preparing to attack Taiwan tonight!...,"{'aggressor': 'China', 'victim': None, 'place'..."
84,25064,2025-08-15 16:28:01,8078.0,9.0,✅🇺🇸🇷🇺 Kremlin: Trump to meet Putin on his plan...,"{'aggressor': None, 'victim': None, 'place': '..."
92,25056,2025-08-15 01:46:22,8571.0,15.0,✅🇺🇦🇷🇺 Several hundred Ukrainian drones are att...,"{'aggressor': 'Ukrainian drones', 'victim': No..."
104,25044,2025-08-14 10:52:11,9966.0,26.0,✅🇺🇦🇷🇺 Targeted assassination: Ukrainian drone ...,"{'aggressor': 'Russia', 'victim': 'Russian com..."
116,25032,2025-08-13 15:19:03,9797.0,68.0,✅🇮🇷 Iran is no longer in a position to attack ...,"{'aggressor': None, 'victim': None, 'place': '..."
119,25029,2025-08-13 11:25:05,10004.0,16.0,✅🇦🇺 Australia is officially a state sponsor of...,"{'aggressor': None, 'victim': None, 'place': '..."
139,25009,2025-08-12 00:48:40,13826.0,220.0,✅🇮🇷 Massive explosion at Iran's IRGC base in H...,"{'aggressor': None, 'victim': 'Iran's IRGC bas..."
159,24986,2025-08-10 21:42:34,9434.0,23.0,✅🇸🇾 At least 22 senior officers in the new Syr...,"{'aggressor': 'Syrian regime', 'victim': None,..."


In [52]:
extracted_data_df = pd.json_normalize(result_df['extracted_json'])

final_df = pd.concat([
        result_df.drop(columns=['extracted_json']).reset_index(drop=True),
        extracted_data_df.reset_index(drop=True)
    ], axis=1)
final_df

Unnamed: 0,id,date,views,forwards,text,aggressor,victim,place,casualties
0,25125,2025-08-19 11:35:33,7650.0,18.0,✅🇮🇱🇮🇷The impact of Israel as the region's supe...,,,Iran,40
1,25113,2025-08-18 12:26:20,8933.0,41.0,✅🇷🇺🇺🇸 Breaking: Russian troops raise American ...,Russian troops,,Zaporizhzhia region,5
2,25078,2025-08-16 01:00:08,8535.0,26.0,✅ Is China preparing to attack Taiwan tonight!...,China,,Taiwan,0
3,25064,2025-08-15 16:28:01,8078.0,9.0,✅🇺🇸🇷🇺 Kremlin: Trump to meet Putin on his plan...,,,Kremlin,0
4,25056,2025-08-15 01:46:22,8571.0,15.0,✅🇺🇦🇷🇺 Several hundred Ukrainian drones are att...,Ukrainian drones,,Russian bases and airports,0
5,25044,2025-08-14 10:52:11,9966.0,26.0,✅🇺🇦🇷🇺 Targeted assassination: Ukrainian drone ...,Russia,Russian commander,Rostov,1
6,25032,2025-08-13 15:19:03,9797.0,68.0,✅🇮🇷 Iran is no longer in a position to attack ...,,,Iran,20
7,25029,2025-08-13 11:25:05,10004.0,16.0,✅🇦🇺 Australia is officially a state sponsor of...,,,Australia,0
8,25009,2025-08-12 00:48:40,13826.0,220.0,✅🇮🇷 Massive explosion at Iran's IRGC base in H...,,Iran's IRGC base,Hashtgerd,10
9,24986,2025-08-10 21:42:34,9434.0,23.0,✅🇸🇾 At least 22 senior officers in the new Syr...,Syrian regime,,Suwayda,22
