In [None]:
!pip install torch pandas numpy scikit-learn
!pip install transformers datasets nltk
!pip install pdfplumber rouge_score arxiv
!pip install bs4 requests

In [None]:
# Import necessary libraries
import torch
import pandas as pd
import random
import numpy as np
from transformers import (
    AutoTokenizer, BartForConditionalGeneration, BartTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    DataCollatorForSeq2Seq
)
from datasets import Dataset, DatasetDict
import pdfplumber
import re
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import nltk
from rouge_score import rouge_scorer
import arxiv
from bs4 import BeautifulSoup
import requests
import json
from tqdm import tqdm
import matplotlib.pyplot as plt

# Fetch the NLTK data packages this notebook asks for, one after the other
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

In [None]:
# Checkpoint name shared by the model weights and the tokenizer
model_name = "facebook/bart-base"

# Load the pretrained sequence-to-sequence model, then its matching tokenizer
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)


In [None]:
import pandas as pd
import json

# Load the JSON file. The encoding is given explicitly so that reading does
# not depend on the platform's default text encoding.
with open('/content/combined.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert JSON data to DataFrame
if isinstance(data, list):  # JSON is a list of dictionaries
    df = pd.DataFrame(data)
elif isinstance(data, dict):  # JSON is a dictionary
    df = pd.DataFrame([data])  # Convert dict to DataFrame with one row
else:
    raise ValueError("The JSON file contains an unsupported structure.")

# Fail fast with a clear message when the columns that tokenize_bart reads
# ('text' and 'summary') are absent, instead of a KeyError during tokenization.
missing_columns = {'text', 'summary'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"The JSON data is missing required column(s): {sorted(missing_columns)}"
    )

# Display the DataFrame
print(df)

# Optionally, save the DataFrame to a CSV file
df.to_csv('data.csv', index=False)


# The loaded frame is used as-is under the name `synthetic_data` by later cells
print("Generating synthetic dataset...")
synthetic_data = df

In [None]:
def tokenize_bart(example, tok=None, max_input_length=1024, max_target_length=512):
    """Tokenize one record for BART fine-tuning.

    Args:
        example: Mapping with a 'text' entry (model input) and a 'summary'
            entry (target). It is updated in place and returned.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
        max_input_length: Length the input is padded/truncated to.
        max_target_length: Length the target is padded/truncated to.

    Returns:
        The same ``example`` with 'input_ids', 'attention_mask' and 'labels'
        added as plain Python lists.
    """
    # Fall back to the notebook-level tokenizer only when none is supplied.
    tok = tokenizer if tok is None else tok

    inputs = tok(example['text'], max_length=max_input_length,
                 padding='max_length', truncation=True)
    targets = tok(example['summary'], max_length=max_target_length,
                  padding='max_length', truncation=True)

    example['input_ids'] = inputs['input_ids']
    example['attention_mask'] = inputs['attention_mask']

    # Padding positions in the labels are set to -100 so that the loss ignores
    # them; otherwise the model is trained to predict pad tokens.
    pad_id = tok.pad_token_id
    example['labels'] = [
        -100 if token_id == pad_id else token_id
        for token_id in targets['input_ids']
    ]
    return example

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_data, test_data = train_test_split(
    synthetic_data,
    random_state=42,
    test_size=0.2,
)

# The training portion is also exposed under the name bart_train
bart_train = train_data

# Report how many rows ended up in each partition
print(f"Training set size: {len(bart_train)}")
print(f"Test set size: {len(test_data)}")

In [None]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face Dataset and tokenize it row by row
# with tokenize_bart in a single chained expression.
bart_train = Dataset.from_pandas(train_data).map(tokenize_bart)
bart_test = Dataset.from_pandas(test_data).map(tokenize_bart)

In [None]:
# Define fine-tuning function
def fine_tune_model(model, train_dataset, test_dataset, model_name,
                    num_train_epochs=3, learning_rate=5e-5, batch_size=2,
                    weight_decay=0.01):
    """Fine-tune ``model`` with the Hugging Face ``Trainer`` and return the trainer.

    Args:
        model: Model instance to train.
        train_dataset: Dataset used for training.
        test_dataset: Dataset evaluated at the end of every epoch.
        model_name: Label used to build the output directory
            (``./results_<model_name>``) and log directory (``./logs_<model_name>``).
        num_train_epochs: Number of training epochs.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        batch_size: Per-device batch size for both training and evaluation.
        weight_decay: Weight decay passed to ``TrainingArguments``.

    Returns:
        The ``Trainer`` after ``train()`` has completed.
    """
    import inspect

    # Newer Transformers releases name the evaluation-schedule option
    # `eval_strategy`, older ones `evaluation_strategy`. Pick whichever the
    # installed version accepts so the call works on both.
    accepted = inspect.signature(TrainingArguments).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=f'./results_{model_name}',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=f'./logs_{model_name}',
        save_total_limit=2,
        **{strategy_key: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    trainer.train()
    return trainer

# Run fine-tuning on the tokenized splits and keep the resulting trainer
bart_trainer = fine_tune_model(
    model=model,
    train_dataset=bart_train,
    test_dataset=bart_test,
    model_name='bart-base',
)

In [None]:
!pip install evaluate


In [None]:
import torch
import evaluate

# Load the ROUGE metric through the `evaluate` library. The resulting `rouge`
# object is what evaluate_model calls as rouge.compute(predictions=..., references=...).
rouge = evaluate.load("rouge")

# Evaluation Function with ROUGE Score
def evaluate_model(trainer, test_dataset, tokenizer, metric=None):
    """Generate a summary for every record and score the results.

    Args:
        trainer: Object whose ``model`` attribute is used for generation.
        test_dataset: Iterable of records with 'input_ids', 'summary' and,
            optionally, 'attention_mask'.
        tokenizer: Tokenizer used to decode the generated ids.
        metric: Object with ``compute(predictions=..., references=...)``.
            Defaults to the notebook-level ``rouge`` metric.

    Returns:
        Whatever ``metric.compute`` returns for the generated summaries and
        the reference summaries.
    """
    # Fall back to the notebook-level ROUGE metric only when none is supplied.
    scorer = rouge if metric is None else metric

    predictions = []
    references = []

    # Ensure model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trainer.model.to(device)
    # Switch to eval mode so dropout does not perturb generation.
    trainer.model.eval()

    for data in test_dataset:
        # Move input_ids to the same device as the model
        input_ids = torch.tensor(data["input_ids"]).unsqueeze(0).to(device)

        # Pass the attention mask when the record has one, so padded
        # positions are not attended to during generation.
        extra_inputs = {}
        if "attention_mask" in data and data["attention_mask"] is not None:
            extra_inputs["attention_mask"] = (
                torch.tensor(data["attention_mask"]).unsqueeze(0).to(device)
            )

        # Generate summary
        summary_ids = trainer.model.generate(
            input_ids,
            max_length=512,
            min_length=10,
            num_beams=4,
            early_stopping=True,
            **extra_inputs,
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Append prediction and reference
        predictions.append(summary)
        references.append(data["summary"])

    # Calculate scores
    results = scorer.compute(predictions=predictions, references=references)
    return results

# Score the fine-tuned model (held by bart_trainer) on the tokenized test split
results = evaluate_model(trainer=bart_trainer, test_dataset=bart_test, tokenizer=tokenizer)

# Header line followed by the scores on the next line
print("ROUGE Scores:", results, sep="\n")


In [None]:
def evaluate_model(trainer, test_dataset, tokenizer, metric=None):
    """Generate a summary for every record and score the results.

    NOTE: this redefines ``evaluate_model`` from the earlier cell. Once this
    cell runs, this version is the one that gets called. Unlike the earlier
    definition, it passes no beam-search arguments to ``generate``.

    Args:
        trainer: Object whose ``model`` attribute is used for generation.
        test_dataset: Iterable of records with 'input_ids', 'summary' and,
            optionally, 'attention_mask'.
        tokenizer: Tokenizer used to decode the generated ids.
        metric: Object with ``compute(predictions=..., references=...)``.
            Defaults to the notebook-level ``rouge`` metric.

    Returns:
        Whatever ``metric.compute`` returns for the generated summaries and
        the reference summaries.
    """
    # Fall back to the notebook-level ROUGE metric only when none is supplied.
    scorer = rouge if metric is None else metric

    predictions = []
    references = []

    # Ensure model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trainer.model.to(device)
    # Switch to eval mode so dropout does not perturb generation.
    trainer.model.eval()

    for data in test_dataset:
        # Move input_ids to the same device as the model
        input_ids = torch.tensor(data["input_ids"]).unsqueeze(0).to(device)

        # Pass the attention mask when the record has one, so padded
        # positions are not attended to during generation.
        extra_inputs = {}
        if "attention_mask" in data and data["attention_mask"] is not None:
            extra_inputs["attention_mask"] = (
                torch.tensor(data["attention_mask"]).unsqueeze(0).to(device)
            )

        # Generate summary
        summary_ids = trainer.model.generate(
            input_ids, max_length=512, min_length=10, **extra_inputs
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Append prediction and reference
        predictions.append(summary)
        references.append(data["summary"])

    # Calculate scores
    results = scorer.compute(predictions=predictions, references=references)
    return results

In [None]:
save_directory = "/content/Enviro_bart_model"

# Save the BART model together with the tokenizer already in use by this
# notebook. Re-downloading the tokenizer here would add a network round-trip
# and rebind the notebook-level `tokenizer` name.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
import shutil

# Pack the contents of the saved model directory into "<save_directory>.zip"
shutil.make_archive(base_name=save_directory, format='zip', root_dir=save_directory)

# Report the path the archive is expected at (existence is not checked here)
print("Zip file created at:", f"{save_directory}.zip")

In [None]:
from google.colab import files

# Download the zip file: hand "<save_directory>.zip" to the Colab `files`
# helper imported above.
files.download(f"{save_directory}.zip")
