In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
#unzip summaries.zip to summary folder (by creating the folder)
import os
import zipfile

# Source archive and the directory its contents are unpacked into
zip_path = "summaries.zip"
extract_folder = "summary"

# Make sure the destination exists (no error if it is already there)
os.makedirs(extract_folder, exist_ok=True)

# Open the archive read-only and unpack every member into the destination
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_folder)

print(f"Extracted '{zip_path}' to '{extract_folder}'")


Extracted 'summaries.zip' to 'summary'


In [None]:
import pandas as pd
import re

# Folder containing the per-chunk CSVs, plus the output paths used by this notebook
folder_path = "summary/summaries"
output_file = "merged_summaries.csv"
unique_output_file = "unique_summaries.csv"
duplicates_output_file = "duplicate_summaries.csv"

# Chunk files are named generated_summaries_<start>-<end>.csv
pattern = re.compile(r'generated_summaries_(\d+)-(\d+)\.csv')

# Collect (start, end, path) for every chunk file.
# fullmatch() rejects names with trailing text (e.g. "...csv.bak", "...csv~"),
# which match() would have accepted and merged in.
files = []
for filename in os.listdir(folder_path):
    match = pattern.fullmatch(filename)
    if match:
        start_num = int(match.group(1))
        end_num = int(match.group(2))
        files.append((start_num, end_num, os.path.join(folder_path, filename)))

# Fail with an explicit message instead of pd.concat's generic
# "No objects to concatenate" when the folder holds no chunk files.
if not files:
    raise ValueError(f"No generated_summaries_<start>-<end>.csv files found in '{folder_path}'")

# Tuples sort by start number first (then end number, then path)
files.sort()

# Read every chunk in order and stack them into one frame
dfs = []
for _, _, filepath in files:
    df = pd.read_csv(filepath)
    print(f"Processing file: {filepath}, Row count: {len(df)}")
    dfs.append(df)

merged_df = pd.concat(dfs, ignore_index=True)

# Count rows whose 'text' repeats an earlier row (first occurrences are not counted)
duplicate_count = merged_df.duplicated(subset=['text']).sum()
print(f"Number of duplicate entries in 'text' column: {duplicate_count}")

# Save the merged file
merged_df.to_csv(output_file, index=False)
print(f"Merged {len(files)} files into {output_file}")

# Keep only the first occurrence of each unique 'text' value.
# .copy() makes unique_df an independent frame, so later column assignments on
# it do not raise SettingWithCopyWarning.
unique_df = merged_df.drop_duplicates(subset=['text'], keep='first').copy()

# Every row whose 'text' occurs more than once. keep=False marks ALL occurrences,
# including the first, and sorting by 'text' places the copies next to each other.
duplicates_df = merged_df[merged_df.duplicated(subset=['text'], keep=False)].sort_values(by=['text'])

# Save the duplicate summaries file
duplicates_df.to_csv(duplicates_output_file, index=False)
print(f"Saved duplicate summaries into {duplicates_output_file}")


Processing file: summary/summaries/generated_summaries_0-59.csv, Row count: 63
Processing file: summary/summaries/generated_summaries_60-99.csv, Row count: 40
Processing file: summary/summaries/generated_summaries_100-199.csv, Row count: 100
Processing file: summary/summaries/generated_summaries_200-299.csv, Row count: 100
Processing file: summary/summaries/generated_summaries_300-399.csv, Row count: 100
Processing file: summary/summaries/generated_summaries_400-469.csv, Row count: 70
Processing file: summary/summaries/generated_summaries_470-499.csv, Row count: 130
Processing file: summary/summaries/generated_summaries_500-599.csv, Row count: 100
Processing file: summary/summaries/generated_summaries_600-699.csv, Row count: 100
Processing file: summary/summaries/generated_summaries_700-799.csv, Row count: 100
Processing file: summary/summaries/generated_summaries_800-899.csv, Row count: 200
Processing file: summary/summaries/generated_summaries_900-999.csv, Row count: 100
Processing f

In [None]:
import re

# Function to count summaries with unwanted tags
def count_summaries_with_tags(df, tags):
    """Count rows of ``df['generated_summary']`` that match the regex ``tags``.

    Args:
        df: DataFrame with a 'generated_summary' column of strings.
        tags: Regular-expression pattern to search for in each summary.

    Returns:
        int: Number of rows whose summary contains at least one match.
        Missing summaries (NaN/None) are counted as non-matching.
    """
    # na=False: without it a missing summary yields NaN in the mask, and summing
    # the mask then turns the whole count into NaN.
    has_tag = df['generated_summary'].str.contains(tags, regex=True, na=False)
    return int(has_tag.sum())

# Regex for the marker tags that should not appear in a summary:
# <debut>, </debut>, <fin> and </fin>
tags_to_remove = r"</?debut>|</?fin>"

# Baseline: how many summaries carry at least one marker tag before cleaning
initial_count = count_summaries_with_tags(df=unique_df, tags=tags_to_remove)
print(f"Initial count of summaries with unwanted tags: {initial_count}")

# Function to clean unwanted tags and trim spaces/newlines
def clean_summary(summary):
    """Strip the <debut>/<fin> marker tags from a summary and trim whitespace.

    Args:
        summary: Summary text. Non-string values (e.g. NaN for a missing
            summary read from CSV) are returned unchanged.

    Returns:
        The text with every <debut>, </debut>, <fin> and </fin> tag removed and
        leading/trailing whitespace stripped, or ``summary`` itself when it is
        not a string.
    """
    # re.sub raises TypeError on non-strings; pass such values through so a
    # single missing summary does not abort a whole-column .apply().
    if not isinstance(summary, str):
        return summary
    without_tags = re.sub(r"</?debut>|</?fin>", "", summary)
    # Trim leading and trailing spaces and newlines
    return without_tags.strip()

# Clean the 'generated_summary' column of unique_df.
# unique_df was derived from merged_df via drop_duplicates, and assigning a
# column on it in place raises SettingWithCopyWarning. .assign() returns a new
# frame with the cleaned column instead, which also makes this cell safe to re-run.
unique_df = unique_df.assign(
    generated_summary=unique_df['generated_summary'].apply(clean_summary)
)

# Count again to ensure the tags were removed
final_count = count_summaries_with_tags(unique_df, tags_to_remove)
print(f"Final count of summaries with unwanted tags after cleaning: {final_count}")

Initial count of summaries with unwanted tags: 193
Final count of summaries with unwanted tags after cleaning: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['generated_summary'] = unique_df['generated_summary'].apply(clean_summary)


In [None]:
# Write unique_df to the unique-summaries CSV, leaving out the index column
unique_df.to_csv(path_or_buf=unique_output_file, index=False)
print(f"Saved unique summaries into {unique_output_file}")

Saved unique summaries into unique_summaries.csv


In [None]:
# Preview the first rows of unique_df
unique_df.head()

Unnamed: 0,text,generated_summary
0,Thierry Mariani sur la liste du Rassemblement ...,"Thierry Mariani, ancien ministre de Nicolas Sa..."
1,Dans une interview accord e au Figaro mercredi...,Le Medef propose plusieurs mesures pour endigu...
2,Le pr judice est estim 2 millions d'euros. ...,Un vaste réseau d'escrocs impliquant 90 gendar...
6,"Apr s une septi me dition impressionnante, Da...","Marie-Claude Pietragalla, juge de Danse avec l..."
7,"Il revendique sa proximit avec le peuple, ses...","G rald Darmanin, proche d'Emmanuel Macron et c..."


In [None]:
# Flag summaries that are exactly the placeholder string "No summary found"
is_placeholder = unique_df["generated_summary"].eq("No summary found")
count_no_summary = is_placeholder.sum()
print(f"Number of rows with 'No summary found': {count_no_summary}")

Number of rows with 'No summary found': 204


In [None]:
# Whitespace-separated word count of each summary (values are coerced to str first)
summary_word_counts = unique_df["generated_summary"].apply(lambda value: len(str(value).split()))

# Number of summaries with fewer than 10 words
count_short_texts = (summary_word_counts < 10).sum()
print(f"Number of rows with fewer than 10 words: {count_short_texts}")

Number of rows with fewer than 10 words: 207


In [None]:
# Drop rows whose generated summary is the "No summary found" placeholder
is_real_summary = unique_df["generated_summary"] != "No summary found"
filtered_df = unique_df[is_real_summary]

# Of the remaining rows, keep only those whose generated summary has at least 10 words
is_long_enough = filtered_df["generated_summary"].apply(
    lambda summary: len(str(summary).split()) >= 10
)
filtered_df = filtered_df[is_long_enough]

# Write the surviving rows to disk, leaving out the index column
filtered_df.to_csv("cleaned_dataset.csv", index=False)

print(f"✅ Cleaned dataset saved as 'cleaned_dataset.csv' with {len(filtered_df)} rows.")


✅ Cleaned dataset saved as 'cleaned_dataset.csv' with 5000 rows.


In [None]:
from datasets import load_dataset
import pandas as pd

# Load the OrangeSum dataset from the Hugging Face Hub.
# No split or slice is requested here, so every available split is loaded in
# full (it is not limited to the first 5000 entries).
dataset = load_dataset("giuliadc/orangesum_filtered_new_spaces")

# Whitespace clean-up helper (it does not deduplicate anything)
def preprocess_text(text):
    """Trim ``text`` and collapse every run of whitespace into a single space."""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

orangesum-space-train.json:   0%|          | 0.00/26.7M [00:00<?, ?B/s]

orangesum-space-validation.json:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

orangesum-space-test.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7864 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/611 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/543 [00:00<?, ? examples/s]

In [None]:
# Normalise whitespace in every article, then keep only the articles whose
# cleaned text is shorter than 2900 characters
dataset = (
    dataset
    .map(lambda example: {"text": preprocess_text(example["text"])})
    .filter(lambda example: len(example["text"]) < 2900)
)

print("✅ Preprocessing complete. Total articles:", len(dataset["train"]))

Map:   0%|          | 0/7864 [00:00<?, ? examples/s]

Map:   0%|          | 0/611 [00:00<?, ? examples/s]

Map:   0%|          | 0/543 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7864 [00:00<?, ? examples/s]

Filter:   0%|          | 0/611 [00:00<?, ? examples/s]

Filter:   0%|          | 0/543 [00:00<?, ? examples/s]

✅ Preprocessing complete. Total articles: 5409


In [None]:
# Build a pandas DataFrame from the train split and preview its first rows
dataset_df = pd.DataFrame(dataset['train'])
dataset_df.head()

Unnamed: 0,summary,text
0,L'information n'a pas été confirmée par l'inté...,Thierry Mariani sur la liste du Rassemblement ...
1,"Les médecins jugés ""gros prescripteurs d'arrêt...",Dans une interview accord e au Figaro mercredi...
2,Il aura fallu mobiliser 90 gendarmes pour cett...,Le pr judice est estim 2 millions d'euros. ...
3,Alors que la saison 7 de Danse avec les stars ...,"Apr s une septi me dition impressionnante, Da..."
4,Le ministre de l'Action et des Comptes publics...,"Il revendique sa proximit avec le peuple, ses..."


In [None]:
# Inner join on the article text: only articles present in both dataset_df and
# filtered_df survive, with the columns of both frames side by side
merged_df = pd.merge(dataset_df, filtered_df, on="text", how="inner")

# Write the joined frame to disk, leaving out the index column
merged_df.to_csv("filtered_merged_dataset.csv", index=False)

print(f"✅ Merged dataset saved with {len(merged_df)} rows.")


✅ Merged dataset saved with 5000 rows.


In [None]:
!pip install evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=ab856f8b84824249e9d8c6155473b8198097fda7bf76fcb4eeccf34d89b42d1b
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.3 rouge_score-0.1.2


In [None]:
import evaluate

# Load the ROUGE metric through the `evaluate` library
rouge = evaluate.load("rouge")

# Generated summaries are the predictions; the dataset's own summaries are the references
predicted_summaries = merged_df["generated_summary"].tolist()
reference_summaries = merged_df["summary"].tolist()

# NOTE(review): the metric's default tokenization is used here — confirm it
# treats accented French characters as intended before relying on these numbers.
results = rouge.compute(predictions=predicted_summaries, references=reference_summaries)

print(f"ROUGE Scores: {results}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Scores: {'rouge1': 0.30994770207178257, 'rouge2': 0.1111529237748566, 'rougeL': 0.20289960786347733, 'rougeLsum': 0.20288352794459724}


In [None]:
!pip install bert-score



In [None]:
from bert_score import score

# Compute BERTScore for the generated summaries against the reference summaries.
# lang selects the language setting: "fr" for French, "en" for English.
candidate_summaries = merged_df["generated_summary"].tolist()
reference_texts = merged_df["summary"].tolist()
P, R, F1 = score(candidate_summaries, reference_texts, lang="fr")

# Report the mean precision, recall and F1 over all pairs
print(f"BERTScore P: {P.mean().item():.4f}, R: {R.mean().item():.4f}, F1: {F1.mean().item():.4f}")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

BERTScore P: 0.7086, R: 0.7325, F1: 0.7200
