In [1]:
import pandas as pd
import json
import random
from tqdm import tqdm

In [2]:
# Run configuration read by the later cells of this notebook.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
dataset_path = r"C:\Users\Arun\Downloads\NewsSumm Dataset.xlsx"
# JSONL file created by the record-writing loop further down.
output_path = "train_indian_news_llm.jsonl"
# Character cap applied to each article inside build_input (used as a slice bound).
# NOTE(review): this name shadows the built-in max() for the rest of the kernel session.
max = 2000
# Value handed to random.seed() before the dataset is generated.
seed = 42

In [3]:
# Load the spreadsheet at dataset_path into a DataFrame using pd.read_excel's defaults.
df=pd.read_excel(dataset_path)

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,newspaper_name,published_date\n,headline,article_text,human_summary,news_category
0,Indian Express,2020-06-01 00:00:00,Virus may be invisible enemy but COVID warrior...,Prime Minister Narendra Modi Monday hailed the...,Prime Minister of India said that the Virus ma...,National News
1,Economic Times,2013-02-11 00:00:00,"Economy can bounce back, says PM Modi","ALLAHABAD: At least 20 persons were killed, an...","In Maha Kumbh, nearly 20 persons were killed. ...",National News
2,Business Standard,2013-02-11 00:00:00,At least 20 killed in stampede in Allahabad,"At least 20 people were killed, and scores of ...",As per the sources 20 people died and scores w...,National News
3,Money Control,2013-02-11 00:00:00,Maha Kumbh: Over 20 dead in Allahabad station ...,More than 20 people were feared dead and 30 ot...,At least 20 people killed and 20 people are in...,National News
4,The Mint,2023-10-02 00:00:00,Gandhian wisdom,"This Gandhi Jayanti, we should reflect upon an...","In this article, the author reflects on Mahatm...",National News


In [5]:
# Parse the date column; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
date_column = "published_date\n"  # the spreadsheet header ends with a newline
df[date_column] = pd.to_datetime(df[date_column], errors="coerce")

In [6]:
# Derive the publication year from the parsed date column.
published = df["published_date\n"]
df["year"] = published.dt.year


In [7]:
# Preview again to confirm the new "year" column is present.
df.head()

Unnamed: 0,newspaper_name,published_date\n,headline,article_text,human_summary,news_category,year
0,Indian Express,2020-06-01,Virus may be invisible enemy but COVID warrior...,Prime Minister Narendra Modi Monday hailed the...,Prime Minister of India said that the Virus ma...,National News,2020.0
1,Economic Times,2013-02-11,"Economy can bounce back, says PM Modi","ALLAHABAD: At least 20 persons were killed, an...","In Maha Kumbh, nearly 20 persons were killed. ...",National News,2013.0
2,Business Standard,2013-02-11,At least 20 killed in stampede in Allahabad,"At least 20 people were killed, and scores of ...",As per the sources 20 people died and scores w...,National News,2013.0
3,Money Control,2013-02-11,Maha Kumbh: Over 20 dead in Allahabad station ...,More than 20 people were feared dead and 30 ot...,At least 20 people killed and 20 people are in...,National News,2013.0
4,The Mint,2023-10-02,Gandhian wisdom,"This Gandhi Jayanti, we should reflect upon an...","In this article, the author reflects on Mahatm...",National News,2023.0


In [8]:
# Columns that must be populated for a row to be usable as a training record.
required_columns = ["article_text", "human_summary", "published_date\n"]

# Seed Python's global RNG, then discard rows missing any required value.
random.seed(seed)
df = df.dropna(subset=required_columns)

In [9]:
# Preview the frame after rows with missing values were dropped.
df.head()

Unnamed: 0,newspaper_name,published_date\n,headline,article_text,human_summary,news_category,year
0,Indian Express,2020-06-01,Virus may be invisible enemy but COVID warrior...,Prime Minister Narendra Modi Monday hailed the...,Prime Minister of India said that the Virus ma...,National News,2020.0
1,Economic Times,2013-02-11,"Economy can bounce back, says PM Modi","ALLAHABAD: At least 20 persons were killed, an...","In Maha Kumbh, nearly 20 persons were killed. ...",National News,2013.0
2,Business Standard,2013-02-11,At least 20 killed in stampede in Allahabad,"At least 20 people were killed, and scores of ...",As per the sources 20 people died and scores w...,National News,2013.0
3,Money Control,2013-02-11,Maha Kumbh: Over 20 dead in Allahabad station ...,More than 20 people were feared dead and 30 ot...,At least 20 people killed and 20 people are in...,National News,2013.0
4,The Mint,2023-10-02,Gandhian wisdom,"This Gandhi Jayanti, we should reflect upon an...","In this article, the author reflects on Mahatm...",National News,2023.0


In [10]:
# Cast both text columns to str and the year to int in a single pass.
df = df.astype({"article_text": str, "human_summary": str, "year": int})

In [11]:
# Print row count, per-column non-null counts and dtypes for the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 347155 entries, 0 to 348765
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   newspaper_name   347155 non-null  object        
 1   published_date
  347155 non-null  datetime64[ns]
 2   headline         347155 non-null  object        
 3   article_text     347155 non-null  object        
 4   human_summary    347155 non-null  object        
 5   news_category    347149 non-null  object        
 6   year             347155 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 21.2+ MB


In [12]:
def build_instruction():
    """Return the fixed instruction text shared by every training record."""
    sentences = [
        "You are an AI assistant trained on Indian newssum dataset.",
        "Answer the question strictly using the provided news context.",
        "If the information is not present, say you do not have enough information.",
    ]
    # The sentences are separated by exactly one space.
    return " ".join(sentences)

In [13]:
# Question phrasings sampled by build_input; defined once here so the
# collection is not rebuilt on every call.
_QUESTION_TEMPLATES = (
    "What happened in the year {year} regarding this news event?",
    "Summarize the key incident reported in {year}.",
    "Explain the news event that occurred in {year}.",
    "What does this news article report about the year {year}?",
)


def build_input(row, max_chars=None):
    """Build the prompt ("input" field) for one news record.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing
            "year" and "article_text".
        max_chars: Maximum number of article characters to include as
            context. When omitted, the notebook-level ``max`` setting is
            used, so existing ``build_input(row)`` calls behave as before.

    Returns:
        A string consisting of a ``[YEAR=...]`` tag, one randomly chosen
        question mentioning the year, and the truncated article text.
    """
    if max_chars is None:
        # Looked up at call time so the configuration cell's value is honoured.
        max_chars = max

    year = row["year"]

    # random.choice draws from the module-level RNG, so the chosen question
    # is only reproducible after random.seed() has been called.
    question = random.choice(_QUESTION_TEMPLATES).format(year=year)

    # str() mirrors build_output and keeps the slice valid for non-string cells.
    context = str(row["article_text"])[:max_chars]

    return (
        f"[YEAR={year}]\n"
        f"Question: {question}\n\n"
        f"News Context:\n{context}"
    )

In [14]:
def build_output(row):
    """Return the record's human-written summary as a whitespace-trimmed string."""
    summary_text = str(row["human_summary"])
    return summary_text.strip()

In [15]:
# Re-seed immediately before sampling so that re-running only this cell
# regenerates the same file; otherwise the sampled questions depend on how
# much of the RNG stream earlier cell executions have consumed.
random.seed(seed)

# The instruction text is identical for every record, so build it once.
instruction = build_instruction()

with open(output_path, "w", encoding="utf-8") as f:
    for _, row in tqdm(df.iterrows(), total=len(df)):
        record = {
            "instruction": instruction,
            "input": build_input(row),
            "output": build_output(row)
        }
        # ensure_ascii=False keeps non-ASCII text readable; json.dumps escapes
        # embedded newlines, so each record occupies exactly one line.
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

100%|██████████| 347155/347155 [00:25<00:00, 13397.18it/s]


In [16]:
# Report the path the JSONL records were written to.
print(f"Training data generated: {output_path}")

Training data generated: train_indian_news_llm.jsonl


In [17]:
import json
from itertools import islice

# Sanity check: parse and print the first record of the generated file.
# Reads from output_path rather than a repeated filename literal so the check
# always inspects the file the writer cell produced.
with open(output_path, "r", encoding="utf-8") as f:
    for line in islice(f, 1):
        print(json.loads(line))
        print("-" * 80)


{'instruction': 'You are an AI assistant trained on Indian newssum dataset. Answer the question strictly using the provided news context. If the information is not present, say you do not have enough information.', 'input': '[YEAR=2020]\nQuestion: What happened in the year 2020 regarding this news event?\n\nNews Context:\nPrime Minister Narendra Modi Monday hailed the contribution of medical workers during the coronavirus pandemic. Addressing an event at the Rajiv Gandhi Health University in Bengaluru via video conference, Modi said, “The virus may be an invisible enemy. But our warriors, medical workers are invincible. In the battle of Invisible vs Invincible, our medical workers are sure to win.” He also said that the world is looking at them with gratitude, hope, and seeks both ‘care’ and ‘cure’.\n\n“At the root of India’s brave fight against COVID-19 is the hard work of the medical community and our corona warriors. In fact, doctors and medical workers are like soldiers, but withou

In [18]:
from sklearn.model_selection import train_test_split

# Read the generated records back as raw lines (each keeps its trailing "\n").
# Uses output_path and seed from the configuration cell instead of repeating
# the filename and 42 here, so the split follows any config change.
with open(output_path, encoding="utf-8") as f:
    data = f.readlines()

# Hold out 5% of the records for validation; random_state fixes the shuffle.
train, val = train_test_split(data, test_size=0.05, random_state=seed)

with open("train.jsonl", "w", encoding="utf-8") as f:
    f.writelines(train)

with open("val.jsonl", "w", encoding="utf-8") as f:
    f.writelines(val)


In [19]:
# Report the split sizes, then show the first 300 characters of one training record.
print(len(train), len(val))
print(train[0][:300])


329797 17358
{"instruction": "You are an AI assistant trained on Indian newssum dataset. Answer the question strictly using the provided news context. If the information is not present, say you do not have enough information.", "input": "[YEAR=2025]\nQuestion: Summarize the key incident reported in 2025.\n\nNews
