Article-Level Text Summarization with TinyLlama

In [None]:
import pandas as pd
import os

# Add the parent of the current working directory to sys.path
# (presumably so project-local modules can be imported from this notebook).
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# Load secret credentials from a .env file into environment variables
from dotenv import load_dotenv #pip install python-dotenv
load_dotenv()

# os.environ["TOKENIZERS_PARALLELISM"] = "false" # Fix for Hugging Face Tokenizers issue


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anna_verbytska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use mps:0


In [2]:
# Load the articles dataset from a CSV file (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv("../data/topics_summaries.csv")
df.head()

Unnamed: 0,body,title,category,word_count,topic,article_summary
0,"The ""best person for the job"" should be appoin...",'Best person' for top legal job,politics,465,7,"Christopher Leslie tells MPs the ""best person ..."
1,A cap on donations to political parties should...,'Debate needed' on donations cap,politics,549,4,The Electoral Commission says it is worth deba...
2,It could cost £80m to run a UK referendum on t...,'EU referendum could cost £80m',politics,194,6,Constitution would make the European Union wor...
3,The risk of pets and children being given vote...,'Fido' to be taken off vote lists,politics,510,0,Electoral Commission says having voters regist...
4,"People in England, Scotland and Wales must hav...",'Last chance' warning for voters,politics,270,0,Those who filled in forms last autumn should a...


In [3]:
# Summary statistics of the DataFrame's numeric columns
df.describe()

Unnamed: 0,word_count,topic
count,284.0,284.0
mean,479.84507,3.721831
std,164.824837,3.150537
min,168.0,0.0
25%,351.5,1.0
50%,498.0,3.0
75%,589.5,6.0
max,988.0,10.0


In [4]:
import tiktoken

# Initialize the tiktoken encoder used for the token counts below.
# NOTE(review): "cl100k_base" is an OpenAI encoding; the summarization model called
# later in this notebook (gemini-2.0-flash) presumably tokenizes differently, so
# treat these counts as estimates -- confirm if exact counts matter.
enc = tiktoken.get_encoding("cl100k_base")  # Encoding used by OpenAI's GPT models

# Function to count tokens in a text
def count_tokens(text):
    """
    Count the tokens in a piece of text using the notebook-level `enc` encoder.

    Args:
        text (str): The text to tokenize. Non-string values (e.g. None, or the
            NaN pandas uses for missing cells) are counted as zero tokens.

    Returns:
        int: The number of tokens `enc` produces for `text`; 0 for non-string input.
    """
    # Missing cells reach this function as None/NaN when it is applied to a
    # DataFrame column; count them as zero tokens instead of raising.
    if not isinstance(text, str):
        return 0
    # disallowed_special=() makes tiktoken encode special-token markers such as
    # "<|endoftext|>" as ordinary text instead of raising ValueError.
    return len(enc.encode(text, disallowed_special=()))

# Function to get token counts for the entire DataFrame (e.g., the 'body' column)
def get_token_counts(df, column='article_summary'):
    """
    Add a 'token_count' column with the token count of each value in `column`.

    Args:
        df (pd.DataFrame): DataFrame holding the text column. The new column is
            written onto this same object.
        column (str): Name of the text column to count tokens for.

    Returns:
        pd.DataFrame: The DataFrame that was passed in, now with 'token_count'.
    """
    per_row_counts = df[column].apply(count_tokens)
    df['token_count'] = per_row_counts
    return df

# Get token counts for the default column ('article_summary').
# Note: get_token_counts writes 'token_count' onto `df` itself and returns it, so
# `df_with_token_counts` and `df` refer to the same DataFrame.
df_with_token_counts = get_token_counts(df)

# Print the DataFrame with token counts
print(df_with_token_counts[['article_summary', 'token_count']].head())  # Show the first few rows with token counts

# Total of the 'token_count' column across all rows (i.e. tokens in the summaries)
total_token_count = df_with_token_counts['token_count'].sum()
print(f"Total tokens in the entire dataset: {total_token_count}")

                                     article_summary  token_count
0  Christopher Leslie tells MPs the "best person ...          108
1  The Electoral Commission says it is worth deba...          119
2  Constitution would make the European Union wor...          106
3  Electoral Commission says having voters regist...          109
4  Those who filled in forms last autumn should a...          106
Total tokens in the entire dataset: 31114


In [5]:
from google import genai
from google.genai import types

from IPython.display import HTML, Markdown, display

In [6]:
from google.api_core import retry


def is_retriable(e):
    """Return True for Gemini API errors whose code is 429 or 503 (worth retrying)."""
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


# Patch generate_content with automatic retries, but only once: without the guard,
# every re-run of this cell would wrap the already-wrapped method again and nest
# another retry layer around each request.
if not getattr(genai.models.Models.generate_content, "_retry_wrapped", False):
    genai.models.Models.generate_content = retry.Retry(
        predicate=is_retriable)(genai.models.Models.generate_content)
    genai.models.Models.generate_content._retry_wrapped = True

In [7]:
# Set up your API key

import os
from dotenv import load_dotenv
import nltk
from nltk.tokenize import sent_tokenize
from dotenv import load_dotenv
import os
import time

load_dotenv()  # Loads variables from .env

# Read the Gemini API key from the environment; os.getenv returns None when the
# variable is not set.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [None]:
# Initialize the Gemini client with the API key read from the environment.
# `client` is a notebook-level global used by summarize_article.
client = genai.Client(api_key=GOOGLE_API_KEY)
# Summarize an article using Gemini 2.0 Flash
def summarize_article(text, max_output_tokens=200, model="gemini-2.0-flash"):
    """
    Summarizes the article text with a Gemini model (Gemini 2.0 Flash by default).

    Uses the notebook-level `client`. Failures are reported with print() and
    yield an empty string, so a batch run over many articles keeps going.

    Args:
        text (str): The article text to be summarized.
        max_output_tokens (int): The maximum number of tokens for the summary.
        model (str): Name of the Gemini model to call.

    Returns:
        str: The summarized text, or "" when `text` is missing/blank, the model
            returns no text, or the API call fails.
    """
    # Nothing to summarize: skip the API call for missing (None/NaN) or blank
    # input instead of sending e.g. the literal string "nan" to the model.
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        # Prepare the prompt for summarization
        prompt = f"Summarize the following article into a concise summary:\n\n{text}"

        config = types.GenerateContentConfig(temperature=0.2, max_output_tokens=max_output_tokens)

        # Call the Gemini API to generate the summary
        response = client.models.generate_content(
            model=model,
            contents=prompt,
            config=config,
        )

        # response.text may be None when the response carries no text; report
        # that explicitly rather than via an AttributeError on .strip().
        summary = response.text
        if not summary:
            print("Error during summarization: the model returned no text")
            return ""

        # Return the summary
        return summary.strip()

    except Exception as e:
        # Deliberate best-effort: log and fall back to an empty summary.
        print(f"Error during summarization: {e}")
        return ""

# Main function to apply summarization to all articles in a DataFrame
def summarize_articles_in_df(df, body_col='body', summary_col='gemini_summary'):
    """
    Summarizes every article in `body_col` and stores the results in `summary_col`.

    Args:
        df (pd.DataFrame): DataFrame with the articles. It is modified in place.
        body_col (str): The name of the column containing article text to summarize.
        summary_col (str): The name of the column that receives the summaries
            (default 'gemini_summary').

    Returns:
        pd.DataFrame: The same DataFrame object, with `summary_col` added
            (or overwritten if it already existed).
    """
    # summarize_article is called once per row of `body_col`.
    df[summary_col] = df[body_col].apply(summarize_article)

    return df

In [None]:
# Summarize only the first article as a quick check of the Gemini call.
# .copy() gives an independent one-row frame, so adding the summary column does
# not write into a slice of `df` (which can trigger SettingWithCopyWarning on
# pandas versions without copy-on-write).
df_single = df[['body']].iloc[0:1].copy()
df_single = summarize_articles_in_df(df_single)

# Display the DataFrame to check the results
df_single[['body', 'gemini_summary']].head()

0    The UK government is pushing for reforms to th...
Name: gemini_summary, dtype: object
