Article-Level Text Summarization with TinyLlama

In [1]:
# pip install llama-cpp-python

import pandas as pd
import os
import re
import torch
from llama_cpp import Llama
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

# Local imports: append the parent of the current working directory to sys.path
# so that the `src` package can be imported from this notebook.
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.preprocessing import clean_text, drop_columns, filter_articles_by_length
# Load secret credentials into the process environment
# (presumably from a local .env file — verify where the project keeps it).
from dotenv import load_dotenv  # pip install python-dotenv
load_dotenv()

# Set the Hugging Face tokenizers parallelism switch to "false" before any
# tokenizer is created (workaround for a Hugging Face Tokenizers issue).
os.environ["TOKENIZERS_PARALLELISM"] = "false"


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anna_verbytska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use mps:0


In [7]:
# Read the topic-labelled articles from disk, then preview the first five rows
topics_csv = "../data/topics.csv"
df = pd.read_csv(topics_csv)
df.head(5)

Unnamed: 0,body,title,category,word_count,body_shorter,topic
0,"The ""best person for the job"" should be appoin...",'Best person' for top legal job,politics,465,"The ""best person for the job"" should be appoin...",7
1,A cap on donations to political parties should...,'Debate needed' on donations cap,politics,549,A cap on donations to political parties should...,4
2,It could cost £80m to run a UK referendum on t...,'EU referendum could cost £80m',politics,194,It could cost £80m to run a UK referendum on t...,6
3,The risk of pets and children being given vote...,'Fido' to be taken off vote lists,politics,510,The risk of pets and children being given vote...,0
4,"People in England, Scotland and Wales must hav...",'Last chance' warning for voters,politics,270,"People in England, Scotland and Wales must hav...",0


In [None]:
# EXPERIMENT WITH MICROSOFT/PHI-2
from huggingface_hub import login  # alternative: run `huggingface-cli login` in a shell

# Set credentials.
# The token is read from the environment (populated by load_dotenv() in the
# first cell) rather than written into the notebook, so it cannot leak through
# version control. Define HUGGINGFACE_TOKEN in the environment or the .env file.
token = os.getenv("HUGGINGFACE_TOKEN")
if token:
    login(token=token)
else:
    # Do not fall into an interactive prompt during Run All; continue with
    # whatever credentials (if any) are already cached on this machine.
    print("HUGGINGFACE_TOKEN is not set; skipping login and relying on any cached credentials.")

# Check for GPU availability (use GPU if available, otherwise fall back to CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "microsoft/phi-2"  # Change to Phi-2 model

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fill in special-token IDs only where the tokenizer does not define them.
# Explicit `is None` checks are used because 0 is a valid token ID and must
# not be mistaken for "missing" (which `value or default` would do).
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id  # reuse EOS for padding
if tokenizer.bos_token_id is None:
    tokenizer.bos_token_id = 0  # fallback ID (depends on the model)
if tokenizer.unk_token_id is None:
    tokenizer.unk_token_id = 100  # fallback ID; change to the ID you prefer

# Check if the tokens are set correctly
print(f"PAD token ID: {tokenizer.pad_token_id}")
print(f"BOS token ID: {tokenizer.bos_token_id}")
print(f"UNK token ID: {tokenizer.unk_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",  # let the library place the weights on the available device(s)
)

# Pipeline initialization for text generation; `summarize_article` calls this object.
summarization_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Summarization function
def summarize_article(text):
    """Summarize a single article using Phi-2 model.

    Args:
        text: Article body to summarize. Expected to be a non-empty string.

    Returns:
        The generated summary as a stripped string, or ``None`` when ``text``
        is missing/blank or when generation raises an exception (the error is
        printed so a batch ``apply`` keeps going).
    """
    # Missing values from pandas (None/NaN) and blank strings carry nothing to
    # summarize; skip them instead of prompting the model with an empty article.
    if not isinstance(text, str) or not text.strip():
        return None

    prompt = (
        "Summarize the following article in a concise paragraph (3-4 sentences). "
        "Do not include any additional commentary, formatting, or chatbot-like responses.\n\n"
        f"Article:\n{text}\n\n"
        "Summary:"
    )

    try:
        # Generate the summary with safe parameters
        sequences = summarization_pipeline(
            prompt,
            do_sample=True,   # Enable sampling for diverse summaries
            temperature=0.7,  # Controls randomness (lower = more deterministic)
            top_k=50,         # Consider top 50 tokens
            top_p=0.9,        # Nucleus sampling
            num_return_sequences=1,
            repetition_penalty=1.1,
            max_new_tokens=150,  # Limit summary length
            # Return only the newly generated continuation. By default the
            # pipeline echoes the prompt, which would put the instructions and
            # the full article inside the "summary".
            return_full_text=False,
        )

        # Extract generated text
        generated_text = sequences[0]['generated_text']

        # Remove unwanted chatbot text (e.g., '### Human:', '### OpenAssistant:')
        cleaned_summary = re.sub(r"###.*?:.*?\n", "", generated_text, flags=re.DOTALL).strip()

        return cleaned_summary
    except Exception as e:
        # Best-effort: report the failure and yield None so the remaining
        # articles are still processed.
        print(f"Error during summarization: {e}")
        return None

# Limit df to the first 3 articles for a quick example.
# .copy() yields an independent frame, so adding the summary column below does
# not raise pandas' SettingWithCopyWarning and leaves `df` untouched.
df_sample = df.head(3).copy()

# Apply summarization only to these 3 articles
df_sample["summary_phi2"] = df_sample["body_shorter"].apply(summarize_article)
df_sample.head()

In [None]:
# Sanity check of the Hugging Face login: query the Hub for the account tied
# to the current credentials.
# NOTE(review): the displayed result may include account details — clear this
# cell's output before sharing or committing the notebook.
from huggingface_hub import whoami
whoami()

In [5]:
# Show the Phi-2 summary stored for the second sampled article (positional index 1)
df_sample['summary_phi2'].iloc[1]