# Mercedes F1 Infringement Profile - TextRank Summarization

This notebook implements TextRank algorithm for extractive summarization of Mercedes F1 infringement documents.

## Objective:
- Process all `no_footer_` files from the preprocessed dataset
- Apply TextRank algorithm for extractive summarization
- Maintain temporal analysis (year-by-year summaries)
- Generate "Version One A" of the infringement profile

## Input:
- `pre_proc_op/` folder containing `no_footer_*.txt` files organized by year

## Output:
- Console summaries for each year
- Overall consolidated summary


In [8]:
# Import required libraries
import os
import re
from pathlib import Path
import pandas as pd
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# TextRank implementation
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

print("Libraries imported successfully")


Libraries imported successfully


In [9]:
# Configuration
processed_base_path = Path("pre_proc_op")
years = ["2020", "2021", "2022", "2023", "2024"]

# TextRank parameters
SENTENCE_COUNT = 5  # Number of top sentences to extract
SIMILARITY_THRESHOLD = 0.1  # Minimum similarity threshold

print("Configuration:")
print(f"Processed base path: {processed_base_path}")
print(f"Years to analyze: {years}")
print(f"Sentences per summary: {SENTENCE_COUNT}")
print(f"Similarity threshold: {SIMILARITY_THRESHOLD}")


Configuration:
Processed base path: pre_proc_op
Years to analyze: ['2020', '2021', '2022', '2023', '2024']
Sentences per summary: 5
Similarity threshold: 0.1


In [10]:
# TextRank implementation
class TextRankSummarizer:
    def __init__(self, sentence_count=5, similarity_threshold=0.1):
        self.sentence_count = sentence_count
        self.similarity_threshold = similarity_threshold
        self.stemmer = PorterStemmer()
        
        # Get stopwords
        try:
            self.stop_words = set(stopwords.words('english'))
        except LookupError:
            nltk.download('stopwords')
            self.stop_words = set(stopwords.words('english'))
    
    def preprocess_sentence(self, sentence):
        """Preprocess sentence for similarity calculation"""
        # Convert to lowercase
        sentence = sentence.lower()
        
        # Remove special characters and digits
        sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
        
        # Tokenize and remove stopwords
        words = word_tokenize(sentence)
        words = [self.stemmer.stem(word) for word in words if word not in self.stop_words]
        
        return ' '.join(words)
    
    def calculate_similarity(self, sentence1, sentence2):
        """Calculate similarity between two sentences"""
        # Preprocess sentences
        processed1 = self.preprocess_sentence(sentence1)
        processed2 = self.preprocess_sentence(sentence2)
        
        # Create TF-IDF vectors
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([processed1, processed2])
        
        # Calculate cosine similarity
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        
        return similarity
    
    def build_similarity_matrix(self, sentences):
        """Build similarity matrix for all sentence pairs"""
        n = len(sentences)
        similarity_matrix = np.zeros((n, n))
        
        for i in range(n):
            for j in range(n):
                if i != j:
                    similarity = self.calculate_similarity(sentences[i], sentences[j])
                    similarity_matrix[i][j] = similarity
        
        return similarity_matrix
    
    def apply_pagerank(self, similarity_matrix):
        """Apply PageRank algorithm to similarity matrix"""
        # Create graph from similarity matrix
        graph = nx.from_numpy_array(similarity_matrix)
        
        # Apply PageRank
        scores = nx.pagerank(graph, alpha=0.85)
        
        return scores
    
    def summarize(self, text):
        """Generate summary using TextRank algorithm"""
        if not text or len(text.strip()) < 100:
            return "Insufficient text for summarization."
        
        # Tokenize into sentences
        sentences = sent_tokenize(text)
        
        if len(sentences) < 2:
            return text
        
        # Build similarity matrix
        similarity_matrix = self.build_similarity_matrix(sentences)
        
        # Apply PageRank
        scores = self.apply_pagerank(similarity_matrix)
        
        # Get top sentences
        ranked_sentences = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        
        # Select top sentences
        top_sentences = ranked_sentences[:self.sentence_count]
        
        # Sort by original order
        top_sentences.sort(key=lambda x: x[0])
        
        # Extract sentences
        summary_sentences = [sentences[idx] for idx, score in top_sentences]
        
        return ' '.join(summary_sentences)

print("TextRankSummarizer class defined successfully")


TextRankSummarizer class defined successfully


In [11]:
# Initialize TextRank summarizer
summarizer = TextRankSummarizer(sentence_count=SENTENCE_COUNT, similarity_threshold=SIMILARITY_THRESHOLD)

print("TextRank summarizer initialized")
print(f"Parameters: {SENTENCE_COUNT} sentences, {SIMILARITY_THRESHOLD} similarity threshold")


TextRank summarizer initialized
Parameters: 5 sentences, 0.1 similarity threshold


In [12]:
# Process documents year by year
print("MERCEDES F1 INFRINGEMENT PROFILE - TEXTRANK SUMMARIZATION")

yearly_summaries = {}
yearly_stats = {}

for year in years:
    print(f"\nPROCESSING {year} - MERCEDES F1 INFRINGEMENTS")
    
    year_path = processed_base_path / year
    
    if not year_path.exists():
        print(f"Folder {year_path} does not exist")
        continue
    
    # Get all no_footer_ files
    no_footer_files = list(year_path.glob("no_footer_*.txt"))
    
    if not no_footer_files:
        print(f"No no_footer_ files found in {year}")
        continue
    
    print(f"Found {len(no_footer_files)} processed documents in {year}")
    
    # Read and combine all documents for the year
    combined_text = ""
    total_chars = 0
    processed_files = 0
    
    for file_path in no_footer_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
                if content:
                    combined_text += content + " "
                    total_chars += len(content)
                    processed_files += 1
        except Exception as e:
            print(f"Error reading {file_path.name}: {e}")
    
    if not combined_text.strip():
        print(f"No valid content found in {year}")
        continue
    
    print(f"Processed {processed_files} files, {total_chars:,} total characters")
    
    # Generate summary using TextRank
    print(f"\nGenerating TextRank summary for {year}...")
    summary = summarizer.summarize(combined_text)
    
    # Store results
    yearly_summaries[year] = summary
    yearly_stats[year] = {
        'files_processed': processed_files,
        'total_chars': total_chars,
        'summary_length': len(summary)
    }
    
    # Display summary
    print(f"\nTEXTRANK SUMMARY FOR {year}:")
    print("-" * 50)
    print(summary)
    print("-" * 50)
    print(f"Summary length: {len(summary):,} characters")
    print(f"Compression ratio: {(len(summary)/total_chars)*100:.1f}%")

print(f"\nProcessed {len(yearly_summaries)} years successfully")


MERCEDES F1 INFRINGEMENT PROFILE - TEXTRANK SUMMARIZATION

PROCESSING 2020 - MERCEDES F1 INFRINGEMENTS
Found 11 processed documents in 2020
Processed 11 files, 10,087 total characters

Generating TextRank summary for 2020...

TEXTRANK SUMMARY FOR 2020:
--------------------------------------------------
Reason Stewards heard from driver of Car 44 (Lewis Hamilton) and team representative and have reviewed video and telemetry evidences. Stewards heard from driver of Car 44 (Lewis Hamilton) and team representative and have reviewed new video evidence and telemetry evidences. Decision Deletion of lap time (1:03.061) in accordance with Article 12.3.1.e of FIA International Sporting Code and Art 31.4 of FIA Formula One Sporting Regulations. Reason Stewards heard from driver of Car 44 (Lewis Hamilton) and team representative and have reviewed video, evidence. Reason Stewards heard from driver of Car 44 (Lewis Hamilton) and team representative and have reviewed video evidence.
-----------------

In [15]:
# Generate overall consolidated summary
print("\nOVERALL CONSOLIDATED SUMMARY - ALL YEARS")

if yearly_summaries:
    # Combine all yearly summaries
    all_summaries_text = " "
    for year, summary in yearly_summaries.items():
        all_summaries_text += f"{year} Summary: {summary} "
    
    # Generate overall summary
    print("Generating overall consolidated summary...")
    overall_summary = summarizer.summarize(all_summaries_text)
    
    print("\nOVERALL MERCEDES F1 INFRINGEMENT PROFILE (TEXTRANK):")
    
    print(overall_summary)
    print("....." * 70)
    
    # Statistics
    total_files = sum(stats['files_processed'] for stats in yearly_stats.values())
    total_chars = sum(stats['total_chars'] for stats in yearly_stats.values())
    
    print(f"\nOVERALL STATISTICS:")
    print(f"Total documents processed: {total_files}")
    print(f"Total characters analyzed: {total_chars:,}")
    print(f"Overall summary length: {len(overall_summary):,} characters")
    print(f"Overall compression ratio: {(len(overall_summary)/total_chars)*100:.1f}%")
    
    print(f"\nYEARLY BREAKDOWN:")
    for year, stats in yearly_stats.items():
        compression = (stats['summary_length']/stats['total_chars'])*100 if stats['total_chars'] > 0 else 0
        print(f"{year}: {stats['files_processed']} docs, {stats['total_chars']:,} chars -> {stats['summary_length']:,} chars ({compression:.1f}%)")
    
else:
    print("No summaries generated. Please check the input files.")



OVERALL CONSOLIDATED SUMMARY - ALL YEARS
Generating overall consolidated summary...

OVERALL MERCEDES F1 INFRINGEMENT PROFILE (TEXTRANK):
 2020 Summary: Reason Stewards heard from driver of Car 44 (Lewis Hamilton) and team representative and have reviewed video and telemetry evidences. Reason Stewards heard from driver of Car 44 (Lewis Hamilton) and team representative and have reviewed video, evidence. Reason Stewards heard from driver of Car 44 (Lewis Hamilton) and team representative and have reviewed video evidence. Reason Stewards heard from driver of Car ​​44 ( Lewis Hamilton), team representative and reviewed video evidence. Reason Stewards heard from driver of Car 63 (George Russell), driver of Car 44 (Lewis Hamilton), team representatives and reviewed positioning/marshalling system data, video, team radio and in-car video evidence.
..................................................................................................................................................

In [16]:
# Save results to files
print("\nSAVING RESULTS")

# Create output directory
output_dir = Path("textrank_results")
output_dir.mkdir(exist_ok=True)

# Save yearly summaries
for year, summary in yearly_summaries.items():
    output_file = output_dir / f"textrank_summary_{year}.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"Mercedes F1 Infringement Summary - {year}\n")
        f.write("="*50 + "\n\n")
        f.write(summary)
    print(f"Saved {year} summary to {output_file}")

# Save overall summary
if yearly_summaries:
    overall_file = output_dir / "textrank_overall_summary.txt"
    with open(overall_file, 'w', encoding='utf-8') as f:
        f.write("Mercedes F1 Infringement Profile - Overall Summary (TextRank)\n")
        f.write("="*70 + "\n\n")
        f.write(overall_summary)
        f.write("\n\n" + "="*70 + "\n")
        f.write("STATISTICS:\n")
        f.write(f"Total documents processed: {total_files}\n")
        f.write(f"Total characters analyzed: {total_chars:,}\n")
        f.write(f"Overall summary length: {len(overall_summary):,} characters\n")
        f.write(f"Overall compression ratio: {(len(overall_summary)/total_chars)*100:.1f}%\n")
    print(f"Saved overall summary to {overall_file}")

print(f"\nTextRank summarization completed successfully!")
print(f"Results saved in: {output_dir}")



SAVING RESULTS
Saved 2020 summary to textrank_results\textrank_summary_2020.txt
Saved 2021 summary to textrank_results\textrank_summary_2021.txt
Saved 2022 summary to textrank_results\textrank_summary_2022.txt
Saved 2023 summary to textrank_results\textrank_summary_2023.txt
Saved 2024 summary to textrank_results\textrank_summary_2024.txt
Saved overall summary to textrank_results\textrank_overall_summary.txt

TextRank summarization completed successfully!
Results saved in: textrank_results
