## Developing and Testing to Achieve a Functional Training Model (Temp)

In [None]:
# Install required packages
# !pip install datasets transformers[torch] evaluate accelerate rouge_score


In [None]:
# Import necessary libraries
import torch
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import AutoTokenizer
import string
from tqdm import tqdm  # For progress tracking
import os


# 'punkt': tokenizer models, presumably for word_tokenize (imported above).
nltk.download('punkt')
# 'stopwords': corpus read via stopwords.words('english') during preprocessing.
nltk.download('stopwords')


In [None]:
# Pick the compute device: CUDA when torch reports one, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')


In [None]:
# Fetch the 'multi_news' dataset through the Hugging Face `datasets` loader
ds = load_dataset('multi_news', trust_remote_code=True)

# Materialise the train and test splits as pandas DataFrames
ds_train, ds_test = (pd.DataFrame(ds[split]) for split in ('train', 'test'))

# Preview the leading rows of each split
print(ds_train.head())
print(ds_test.head())


In [None]:
# English stop words from NLTK, held in a set for fast membership checks
stop_words = {*stopwords.words('english')}

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


# Preprocessing Function
def preprocess_text(text, stopword_set=None):
    """Lowercase *text*, drop stop words, and strip ASCII punctuation.

    Args:
        text: The raw string to clean. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as an empty document.
        stopword_set: Optional collection of lowercase stop words. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The remaining tokens joined by single spaces ('' when nothing is left).
    """
    # Missing values would otherwise crash on .lower()
    if text is None or (isinstance(text, float) and text != text):
        return ''

    active_stop_words = stop_words if stopword_set is None else stopword_set

    kept = []
    for raw_token in text.lower().split():
        # Look the token up with only its surrounding punctuation trimmed, so
        # stop words that contain an apostrophe (e.g. "don't") still match.
        # Deleting punctuation first would turn them into "dont" and they
        # would slip past the filter.
        if raw_token.strip(string.punctuation) in active_stop_words:
            continue
        cleaned = raw_token.translate(_PUNCT_TABLE)
        # Skip tokens that were pure punctuation, and tokens that only become
        # stop words once inner punctuation is removed.
        if cleaned and cleaned not in active_stop_words:
            kept.append(cleaned)
    return ' '.join(kept)

# Function to preprocess in batches and save periodically
def preprocess_and_save(dataset, save_path, batch_size=100):
    """Clean the 'document' column of *dataset* batch by batch and write a CSV.

    Rows are processed ``batch_size`` at a time and appended to a temporary
    ``<save_path>.partial`` file. That file is renamed to ``save_path`` only
    after the last batch has been written, so ``save_path`` never holds a
    truncated result. Callers that skip preprocessing when ``save_path``
    exists therefore cannot mistake an interrupted run for a finished one.

    Args:
        dataset: pandas DataFrame with a 'document' column of raw text.
        save_path: Destination CSV path.
        batch_size: Number of rows per batch; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total_rows = len(dataset)
    partial_path = f"{save_path}.partial"

    # Start from a header-only file. This overwrites any stale partial output
    # and guarantees a readable CSV even when the dataset has no rows.
    dataset.iloc[0:0].to_csv(partial_path, index=False)

    for start in tqdm(range(0, total_rows, batch_size), desc="Processing batches"):
        end = min(start + batch_size, total_rows)
        batch = dataset.iloc[start:end].copy()  # .copy() avoids SettingWithCopyWarning

        # Preprocess the current batch
        batch['document'] = batch['document'].map(preprocess_text)

        # Append the processed batch beneath the header written above
        batch.to_csv(partial_path, mode='a', header=False, index=False)

        print(f"Processed and saved rows {start} to {end}")

    # Publish the finished file in a single rename
    os.replace(partial_path, save_path)

# Output locations for the cleaned splits
save_path_train = 'preprocessed_train.csv'
save_path_test = 'preprocessed_test.csv'

# Preprocess each split (train first, then test) unless its CSV already exists
for _frame, _target in ((ds_train, save_path_train), (ds_test, save_path_test)):
    if os.path.exists(_target):
        continue
    preprocess_and_save(_frame, _target)

print("Preprocessing completed.")


In [None]:
# Load the preprocessed datasets.
# keep_default_na=False keeps empty CSV fields as '' instead of NaN. A document
# that became empty during preprocessing would otherwise reach the tokenizer
# as a null value rather than a string.
new_ds = DatasetDict({
    'train': Dataset.from_pandas(pd.read_csv(save_path_train, keep_default_na=False)),
    'test': Dataset.from_pandas(pd.read_csv(save_path_test, keep_default_na=False)),
})


In [None]:
# Fixed sequence length (in tokens) used when encoding
max_length = 512

# Tokenizer for the 'facebook/bart-base' checkpoint
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')

# Tokenization function
def tokenize_function(examples):
    """Encode documents and, when present, their summaries as training labels.

    Args:
        examples: A single example or a batch (mapping of column name to
            value or list of values) containing a 'document' entry. If a
            'summary' entry is present it is tokenized as the target text.
            NOTE(review): assumes 'summary' holds the reference summaries,
            as in Multi-News; confirm against the dataset schema.

    Returns:
        The tokenizer output padded/truncated to ``max_length``. When a
        'summary' entry is present, it also carries a 'labels' entry whose
        padding positions are set to -100.
    """
    # Without labels the model returns no loss and Trainer cannot train, so
    # pass the summaries as the target text whenever they are available.
    target_kwargs = {}
    if 'summary' in examples:
        target_kwargs['text_target'] = examples['summary']

    model_inputs = tokenizer(
        examples['document'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        **target_kwargs,
    )

    if 'labels' in model_inputs:
        pad_id = tokenizer.pad_token_id
        labels = model_inputs['labels']
        # -100 is the index the loss ignores, so padding does not contribute
        # to the training objective.
        if labels and isinstance(labels[0], int):
            # Single (non-batched) example: labels is a flat list of ids
            model_inputs['labels'] = [
                tok if tok != pad_id else -100 for tok in labels
            ]
        else:
            model_inputs['labels'] = [
                [tok if tok != pad_id else -100 for tok in seq]
                for seq in labels
            ]

    return model_inputs

# Run the tokenizer over both splits in batched mode (train first, then test)
tokenized_ds_train, tokenized_ds_test = (
    new_ds[split].map(tokenize_function, batched=True)
    for split in ('train', 'test')
)

# Turn the tokenized splits into pandas DataFrames
train_df = pd.DataFrame(tokenized_ds_train)
test_df = pd.DataFrame(tokenized_ds_test)

# Persist both frames as CSV, without the index column
train_df.to_csv('tokenized_ds_train.csv', index=False)
test_df.to_csv('tokenized_ds_test.csv', index=False)


In [None]:
# Load and train BART model
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments

# Instantiate the pretrained BART checkpoint, then move it to the chosen device
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
model = model.to(device)

# Hyperparameters and bookkeeping for the run
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

# Wire the model, arguments and tokenized splits into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_ds_test,
    train_dataset=tokenized_ds_train,
)

# Kick off training
trainer.train()
