In [19]:
import pandas as pd
import os

In [20]:
file = "news_datasets/guardian_environment_news.csv"
output_folder = 'news_datasets'

In [21]:
df = pd.read_csv(file, delimiter=",", encoding='utf-8')
print(df.head())


                                                                    Title  \
0            Liz Truss ‘will approve more oil drilling if she becomes PM’   
1  Renewed Highland golf course plan has environmentalists crying 'Fore!'   
2   Visiting green spaces deters mental health drug use, researchers find   
3  Bought too much red cabbage? Turn it into a festive nut roast – recipe   
4         ‘This year has been very good’: readers’ UK butterfly sightings   

                                                                                                                                                                         Intro Text  \
0                                              Tory leadership candidate criticised by campaigners after reports her team have met energy firms\n\nPolitics live – latest updates\n   
1                                                         Scottish government rejected a new links at Coul to protect the complex dune system but investors have revived th

In [22]:
df.shape

(30059, 5)

In [23]:
df.columns

Index(['Title', 'Intro Text', 'Authors', 'Article Text', 'Date Published'], dtype='object')

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30059 entries, 0 to 30058
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Title           29111 non-null  object
 1   Intro Text      29977 non-null  object
 2   Authors         25489 non-null  object
 3   Article Text    29691 non-null  object
 4   Date Published  27618 non-null  object
dtypes: object(5)
memory usage: 1.1+ MB


In [25]:
print(df.isnull().values.any())
print(df.isnull().sum().sum())

True
8409


In [26]:
df_clean = df.dropna()

In [27]:
print(df_clean.isnull().values.any())
print(df_clean.isnull().sum().sum())

False
0


In [28]:
df_clean.shape

(22956, 5)

In [29]:
df_clean.loc[1]

Title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [30]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

print(df_clean.head())

                                                                    Title  \
0            Liz Truss ‘will approve more oil drilling if she becomes PM’   
1  Renewed Highland golf course plan has environmentalists crying 'Fore!'   
2   Visiting green spaces deters mental health drug use, researchers find   
3  Bought too much red cabbage? Turn it into a festive nut roast – recipe   
4         ‘This year has been very good’: readers’ UK butterfly sightings   

                                                                                                                                                                         Intro Text  \
0                                              Tory leadership candidate criticised by campaigners after reports her team have met energy firms\n\nPolitics live – latest updates\n   
1                                                         Scottish government rejected a new links at Coul to protect the complex dune system but investors have revived th

In [31]:
# merge Intro Text and Article Text to keep full article
df_clean["Full Text"] = (
    df_clean['Intro Text'].fillna('') +
    '\n' +
    df_clean['Article Text'].fillna('')
)
pd.set_option('display.max_colwidth', None)

print(df_clean.loc[1, 'Full Text'])

Scottish government rejected a new links at Coul to protect the complex dune system but investors have revived the scheme
It is an area so tranquil that the notion of bitter dispute is hugely anomalous. The serenity of Coul - in east Sutherland, north of Dornoch – is in fact fundamental to a backdrop of unrest.When the Scottish government rejected a plan for a golf course at Coul early last year, it appeared those with grand plans had nowhere else to turn. This marked a victory for environmentalists who argued one of the most complex dune systems in Scotland and a site of special scientific interest (SSSI) should not be compromised. “The harmful impacts to protected habitats and species would outweigh the potential socio-economic benefits,” said Kevin Stewart, then planning minister. Anne McCall, the director of RSPB Scotland, said: “These damaging proposals threatened a site of global importance for nature and should never have made it this far.”The rich v the very, very rich: the reb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["Full Text"] = (


In [32]:
df_clean = df_clean.drop(columns=['Intro Text', 'Article Text'])

In [33]:
df_final = df_clean[['Title', 'Full Text']].copy()
print(df_final.head())

                                                                    Title  \
0            Liz Truss ‘will approve more oil drilling if she becomes PM’   
1  Renewed Highland golf course plan has environmentalists crying 'Fore!'   
2   Visiting green spaces deters mental health drug use, researchers find   
3  Bought too much red cabbage? Turn it into a festive nut roast – recipe   
4         ‘This year has been very good’: readers’ UK butterfly sightings   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [34]:
df_final.shape

(22956, 2)

In [35]:
# Select 800 items randomly
df_sample = df_clean.sample(n=800, random_state=42).copy()

print(f"Size of the sampled dataset : {len(df_sample)} news")

Size of the sampled dataset : 800 news


In [36]:
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"Folder '{output_folder}' created'")
else:
    print(f"Folder '{output_folder}' already exists")

Folder 'news_datasets' already exists


In [37]:
path_llm_csv = os.path.join(output_folder, 'llm_batch_800.csv')
df_sample.to_csv(path_llm_csv, index=False)

print(f"Saved LLM file (CSV): {path_llm_csv}")

Saved LLM file (CSV): news_datasets/llm_batch_800.csv
