In [1]:
!pip install bert-extractive-summarizer transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import os
import re
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from summarizer import Summarizer
from arabert.preprocess import ArabertPreprocessor
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import nltk
from nltk import word_tokenize, pos_tag
# Fetch the NLTK data needed by word_tokenize() and pos_tag().
# NLTK >= 3.9 looks up 'punkt_tab' and 'averaged_perceptron_tagger_eng' instead of the
# legacy pickled resources, so both generations are requested here.
# NOTE(review): nltk.download() is expected to return False rather than raise for an id
# it cannot fetch — confirm against the installed NLTK version.
for _nltk_resource in (
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab',
):
    nltk.download(_nltk_resource, quiet=True)

True

In [3]:
# Location of the fine-tuned seq2seq checkpoint and of the CSV that is read here and
# rewritten later with the generated summaries. The defaults are the original values;
# set T5_MODEL_PATH / TEST_DATA_PATH to run elsewhere without editing the notebook.
model_path = os.environ.get("T5_MODEL_PATH", r"C:\Users\USER\Desktop\Bahaa_GP\t5_weights\3")
test_data_path = os.environ.get("TEST_DATA_PATH", 'cleaned_final_sum_test_separate_columns.csv')

# Load the model and tokenizer for inference
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, local_files_only=True)
test_data = pd.read_csv(test_data_path)
# Reuse model_path instead of repeating the path literal in a second spelling.
# NOTE(review): a filesystem path is presumably not one of the model names that
# ArabertPreprocessor recognises, in which case it falls back to its default
# preprocessing rules — confirm that this is the intended configuration.
arabert_prep = ArabertPreprocessor(model_name=model_path)
extractive_model = Summarizer()

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [4]:
# Single-character folds applied before two words are compared. Every entry maps one
# character to one character and no target is itself a key, so a single translate()
# pass gives the same result as applying the replacements one after another.
_ARABIC_NORMALIZATION_TABLE = str.maketrans({
    'ة': 'ه',  # teh marbuta -> heh
    'أ': 'ا',  # alef with hamza above -> bare alef
    'إ': 'ا',  # alef with hamza below -> bare alef
    'آ': 'ا',  # alef with madda -> bare alef
    'ى': 'ي',  # alef maksura -> yeh (previously mapped to itself, i.e. a no-op)
    'ؤ': 'و',  # waw with hamza -> waw
    'ئ': 'ي',  # yeh with hamza -> yeh
})


def normalize_arabic_word(word):
    """Fold common Arabic spelling variants so equivalent words compare equal.

    Args:
        word: The text to normalise (a word or any longer string).

    Returns:
        A copy of ``word`` with teh marbuta, the hamza-carrying alef/waw/yeh forms and
        alef maksura replaced by their base letters. All other characters are unchanged.
    """
    return word.translate(_ARABIC_NORMALIZATION_TABLE)

In [5]:
def remove_duplicates(summary):
    """Drop repeated clauses and repeated words from a summary.

    The summary is split on the Arabic comma followed by a space. A clause is kept only
    the first time its normalised form is seen. Inside each kept clause a word is kept
    only the first time its normalised form occurs in that clause, whether or not the
    repeats are adjacent. Comparison goes through normalize_arabic_word; the returned
    text keeps the original spelling of whatever survives.

    Args:
        summary: The text to de-duplicate.

    Returns:
        The surviving clauses joined with the same Arabic-comma separator.
    """
    separator = '، '
    clause_keys = set()
    kept_clauses = []
    for clause in summary.split(separator):
        tokens = clause.split()
        clause_key = ' '.join(normalize_arabic_word(token) for token in tokens)
        if clause_key in clause_keys:
            continue
        clause_keys.add(clause_key)

        # Word-level pass, scoped to this clause only.
        word_keys = set()
        kept_tokens = []
        for token in tokens:
            word_key = normalize_arabic_word(token)
            if word_key in word_keys:
                continue
            word_keys.add(word_key)
            kept_tokens.append(token)
        kept_clauses.append(' '.join(kept_tokens))
    return separator.join(kept_clauses)

In [6]:
def remove_unwanted_patterns(summary):
    """Delete known boilerplate phrases from a summary and tidy its whitespace.

    Args:
        summary: The text to clean.

    Returns:
        ``summary`` with every occurrence of the listed phrases removed, runs of
        whitespace collapsed to single spaces and leading/trailing whitespace dropped.
    """
    # NOTE(review): the last alternative reads like a placeholder ("any other unwanted
    # pattern") rather than a real phrase — confirm whether it should stay.
    boilerplate = r'(بو ابه اخبار الالكترونيه|اليوم السابع|أي نمط آخر غير مرغوب)'
    stripped = re.sub(boilerplate, '', summary)
    # split() with no argument discards empty pieces, so re-joining normalises spacing.
    return ' '.join(stripped.split())

def clean_summary(summary):
    """Post-process a generated summary.

    Duplicates are removed first (remove_duplicates), then boilerplate phrases are
    stripped and whitespace is normalised (remove_unwanted_patterns).
    """
    return remove_unwanted_patterns(remove_duplicates(summary))

In [7]:
def extract_keywords(text, n_keywords=10):
    """Return up to ``n_keywords`` keywords for ``text`` in a reproducible order.

    Two candidate lists are built: the highest-weighted TF-IDF terms (the vectorizer is
    fitted on ``text`` alone) and every token that NLTK tags as a noun or adjective.
    They are concatenated with the TF-IDF terms first, de-duplicated keeping the first
    occurrence, and truncated. The previous ``list(set(...))`` merge truncated in set
    iteration order, which for strings varies between interpreter runs.

    Args:
        text: The document to analyse.
        n_keywords: Maximum number of keywords to return.

    Returns:
        A list of at most ``n_keywords`` strings; an empty list when ``text`` has fewer
        than three whitespace-separated words or the vectorizer raises ``ValueError``.
    """
    # Fewer than three words (this also covers empty/blank input): nothing to rank.
    if len(text.split()) < 3:
        return []

    vectorizer = TfidfVectorizer(max_features=n_keywords, stop_words=None)
    try:
        weights = vectorizer.fit_transform([text]).todense().tolist()[0]
        vocabulary = vectorizer.get_feature_names_out()
    except ValueError as e:
        print(f"Error extracting keywords: {e}")
        return []

    # Highest weight first; sorted() is stable, so ties keep vocabulary order.
    ranked = sorted(range(len(weights)), key=lambda i: weights[i], reverse=True)
    tfidf_keywords = [vocabulary[i] for i in ranked[:n_keywords]]

    # NOTE(review): pos_tag() is called without a language argument, so NLTK's default
    # tagger is used — confirm that its noun/adjective tags are meaningful for this text.
    wanted_tags = ('NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS')
    pos_keywords = [word for word, tag in pos_tag(word_tokenize(text)) if tag in wanted_tags]

    # dict.fromkeys() de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(tfidf_keywords + pos_keywords))[:n_keywords]

In [8]:
def calculate_target_length(original_text):
    """Derive the (min, max) length bounds for a summary of ``original_text``.

    The source length is its number of whitespace-separated words. The lower bound is
    30% of that length but never less than 40; the upper bound is 60% of it but never
    less than 100. Both fractions are truncated to integers.

    Returns:
        A ``(target_min_length, target_max_length)`` tuple of ints.
    """
    word_count = len(original_text.split())
    lower_bound = int(word_count * 0.3)
    upper_bound = int(word_count * 0.6)
    return max(40, lower_bound), max(100, upper_bound)

In [9]:
def _summarize_text(text):
    """Run preprocessing, extractive summarisation and seq2seq generation on one document."""
    preprocessed_text = arabert_prep.preprocess(text)
    extractive_summary = extractive_model(preprocessed_text, min_length=60, max_length=200)

    # Fall back to the whole preprocessed text when the extractive step returns nothing.
    if not extractive_summary.strip():
        extractive_summary = preprocessed_text

    # Keywords are appended to the prompt after the extractive summary.
    keywords_prompt = " ".join(extract_keywords(extractive_summary, n_keywords=10))
    inputs = tokenizer.encode(
        f"summarize: {extractive_summary} {keywords_prompt}",
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    # Mask is True wherever the token is not the pad token.
    attention_mask = inputs != tokenizer.pad_token_id
    # NOTE(review): these bounds are computed from a word count but are passed to
    # generate() as min_length/max_length — confirm the units match.
    target_min_length, target_max_length = calculate_target_length(preprocessed_text)
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=target_max_length,
        min_length=target_min_length,
        length_penalty=1.5,
        num_beams=7,
        no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        early_stopping=True
    )
    return clean_summary(tokenizer.decode(outputs[0], skip_special_tokens=True))


def _save_checkpoint():
    """Write test_data to test_data_path without leaving a half-written file behind."""
    # Write next to the target, then swap it in. An interruption during to_csv() only
    # damages the temporary file, never the dataset itself.
    temp_path = f"{test_data_path}.tmp"
    test_data.to_csv(temp_path, index=False)
    os.replace(temp_path, test_data_path)


def generate_and_save_summary(row):
    """Generate the summary for one row if it has none yet, and checkpoint the result.

    Rows whose 'GeneratedSummary' is already filled are returned untouched, which makes
    an interrupted run resumable. Rows whose 'Original' is missing, not a string, or
    blank are skipped with a message instead of aborting the whole batch.

    Side effects: sets 'GeneratedSummary' in the module-level ``test_data`` for every
    row whose 'Original' equals this row's text, rewrites the CSV at ``test_data_path``
    and prints a progress line.

    Args:
        row: A row of ``test_data`` with 'Original' and 'GeneratedSummary' entries.

    Returns:
        The same ``row``, with 'GeneratedSummary' set when a summary was generated.
    """
    existing_summary = row['GeneratedSummary']
    if not (pd.isna(existing_summary) or existing_summary == ''):
        return row

    text = row['Original']
    if not isinstance(text, str) or not text.strip():
        print(f"Skipping row {row.name!r}: 'Original' text is missing or empty.")
        return row

    summary = _summarize_text(text)
    row['GeneratedSummary'] = summary
    test_data.loc[test_data['Original'] == text, 'GeneratedSummary'] = summary
    _save_checkpoint()
    print(f"Summary generated and saved for text: {text[:50]}...")
    return row

In [None]:
# Create the output column on a first run; on later runs the existing column is reused
# so rows that already have a summary are skipped.
if 'GeneratedSummary' not in test_data.columns:
    test_data['GeneratedSummary'] = ''
# A column read back from CSV with no values at all is float64 (all NaN). Cast it to
# object so string summaries can be assigned without an incompatible-dtype warning/error.
test_data['GeneratedSummary'] = test_data['GeneratedSummary'].astype(object)
# apply() is used for its side effects only: each call updates test_data and the CSV.
test_data.apply(generate_and_save_summary, axis=1)
print("Summaries generated and saved successfully.")
