In [5]:
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW

# Load pre-trained BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

def transform(source, target):
    source_encodings = tokenizer(source, padding=True, truncation=True, return_tensors='pt', max_length=1024, add_special_tokens=True)
    target_encodings = tokenizer(target, padding=True, truncation=True, return_tensors='pt', max_length=150, add_special_tokens=True)  # Adjust max_length as needed
    return source_encodings, target_encodings

def train(source_encodings, target_encodings):
    optimizer.zero_grad()
    input_ids = source_encodings['input_ids']
    attention_mask = source_encodings['attention_mask']
    labels = target_encodings['input_ids']

    loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
    loss.backward()
    optimizer.step()

# Training loop
for epoch in range(50):
    for source_batch, target_batch in zip(epoch_sources, epoch_targets):
        source_encodings, target_encodings = transform(source_batch, target_batch)
        train(source_encodings, target_encodings)

print("Training complete.")

Training complete.


In [7]:
model.save_pretrained("C:/Users/annar/OneDrive/Desktop/Visual Studio/GitHub/Projects2/Summarizer/Model2")
tokenizer.save_pretrained("C:/Users/annar/OneDrive/Desktop/Visual Studio/GitHub/Projects2/Summarizer/Model2")

('C:/Users/annar/OneDrive/Desktop/Visual Studio/GitHub/Projects2/Summarizer/Model2\\tokenizer_config.json',
 'C:/Users/annar/OneDrive/Desktop/Visual Studio/GitHub/Projects2/Summarizer/Model2\\special_tokens_map.json',
 'C:/Users/annar/OneDrive/Desktop/Visual Studio/GitHub/Projects2/Summarizer/Model2\\vocab.json',
 'C:/Users/annar/OneDrive/Desktop/Visual Studio/GitHub/Projects2/Summarizer/Model2\\merges.txt',
 'C:/Users/annar/OneDrive/Desktop/Visual Studio/GitHub/Projects2/Summarizer/Model2\\added_tokens.json')

In [8]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load pre-trained model and tokenizer
model_name = 'C:/Users/annar/OneDrive/Desktop/Visual Studio/GitHub/Projects2/Summarizer/Model2'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

In [12]:
import pandas as pd

data = pd.read_csv('./Data/summarized-data.csv')
# Sample input text
data.head()

Unnamed: 0,id,article,highlights,Summary
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin..."
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...


In [13]:
output_text=[]
for i in data['Summary']:
    # Tokenize the input text
    input_ids = tokenizer.encode(i, return_tensors='pt')
    # Generate summary
    output_ids = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
    # Decode the generated summary
    output_text.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

data['Transformed_summary'] = output_text
data.head()

Unnamed: 0,id,article,highlights,Summary,Transformed_summary
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...,U.S consumer advisory group says minimum space...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin...","Rahul Kumar, 17, ran towards animals shouting ..."
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...,Neto joined Firoentina Brazilian outfit Atleti...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...,comes amid continuing speculation transition w...


In [14]:
data = data.drop(columns = ["id", "article"])
data.head()

Unnamed: 0,highlights,Summary,Transformed_summary
0,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...,U.S consumer advisory group says minimum space...
1,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin...","Rahul Kumar, 17, ran towards animals shouting ..."
2,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...,Nottingham Forest are close to extending Dougi...
3,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...,Neto joined Firoentina Brazilian outfit Atleti...
4,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...,comes amid continuing speculation transition w...


In [15]:
data.to_csv('Transformed-data.csv', index = False)

In [17]:
#calculating the rouge score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

def calculate_rouge_scores(generated_summaries, reference_summaries):
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []

    for generated, reference in zip(generated_summaries, reference_summaries):
        # Tokenize the generated and reference summaries
        gen_tokens = word_tokenize(generated)
        ref_tokens = word_tokenize(reference)
        
        # Calculate ROUGE-N scores
        rouge_1_scores.append(sentence_bleu([ref_tokens], gen_tokens, weights=(1, 0, 0), smoothing_function=SmoothingFunction().method1))
        rouge_2_scores.append(sentence_bleu([ref_tokens], gen_tokens, weights=(0.5, 0.5, 0), smoothing_function=SmoothingFunction().method1))
        
        # Calculate ROUGE-L score
        rouge_l_scores.append(sentence_bleu([ref_tokens], gen_tokens, weights=(0, 1, 0), smoothing_function=SmoothingFunction().method1))

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    return avg_rouge_1, avg_rouge_2, avg_rouge_l

# Example usage
generated_summaries = data['Transformed_summary']
reference_summaries = data['highlights']

rouge_1, rouge_2, rouge_l = calculate_rouge_scores(generated_summaries, reference_summaries)
print("ROUGE-1 Score:", rouge_1)
print("ROUGE-2 Score:", rouge_2)
print("ROUGE-L Score:", rouge_l)

ROUGE-1 Score: 0.4290773828791932
ROUGE-2 Score: 0.32274262876432097
ROUGE-L Score: 0.24486682387567116
