In [1]:
import os
import csv
import numpy as np
import pandas as pd
import gc
import bitsandbytes as bnb
import random
import string
import inflect
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
#from transformers import T5ForConditionalGeneration, T5Tokenizer
#from bert_score import BERTScorer
from rouge import Rouge
from transformers import Trainer, TrainingArguments, AdamW, T5ForConditionalGeneration, T5Tokenizer
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import warnings
import pickle
import sys
from torch.cuda.amp import GradScaler, autocast
from tqdm.notebook import tqdm as notebook_tqdm

In [6]:
# Raise the interpreter's recursion ceiling for the rest of the session.
RECURSION_LIMIT = 4000
sys.setrecursionlimit(RECURSION_LIMIT)

In [7]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

In [8]:
warnings.filterwarnings("ignore")

In [9]:
# Ask the Hugging Face hub client not to warn about missing symlink support.
# NOTE(review): this runs after `transformers` has already been imported; if the
# hub library reads the variable at import time it will have no effect — confirm.
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "1"})

In [10]:
# NLTK resources fetched for tokenisation, stop-word removal and lemmatisation.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4')
download_results = [nltk.download(resource) for resource in NLTK_RESOURCES]
# Show the status returned by the final download, as the cell did before.
download_results[-1]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jelly\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jelly\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jelly\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jelly\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jelly\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
def lowercase_text(text):
    """Return `text` converted to lower case via its `.lower()` method."""
    lowered = text.lower()
    return lowered

def tokenize_text(text):
    """Tokenise `text` with NLTK's `word_tokenize` and return its result."""
    tokens = word_tokenize(text)
    return tokens

def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

def remove_stopwords(text):
    """Drop English stop words from `text`.

    The text is tokenised with `word_tokenize`; tokens whose lower-cased form
    appears in NLTK's English stop-word list are discarded and the remaining
    tokens are joined with single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def lemmatize_text(text):
    """Lemmatise each token of `text` and return the tokens joined by spaces.

    Tokens come from `word_tokenize`; each one is passed to
    `WordNetLemmatizer.lemmatize` with its default arguments.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return ' '.join(map(wordnet_lemmatizer.lemmatize, word_tokenize(text)))

# Shared inflect engine; convert_numbers_in_text() calls p.number_to_words() on it.
p = inflect.engine()

# Spell out digit-only tokens, tolerating numbers the inflect engine cannot handle
def convert_numbers_in_text(text):
    """Replace each all-digit token of `text` with its spelled-out form.

    The text is split on whitespace. Tokens for which `str.isdigit()` is true
    are passed to `p.number_to_words`; if that call raises, the token is kept
    as it was. All other tokens pass through untouched. The result is the
    tokens joined with single spaces.
    """
    def _spell_out(token):
        if not token.isdigit():
            return token
        try:
            return p.number_to_words(token)
        except Exception:  # deliberately broad: any failure leaves the digits in place
            return token

    return ' '.join(_spell_out(token) for token in text.split())

# Full text-cleaning pipeline
def preprocess_text(text):
    """Run `text` through the cleaning steps in a fixed order.

    Order: lower-casing, punctuation removal, stop-word removal,
    lemmatisation, then digit-to-word conversion.

    NOTE(review): number conversion runs last, so any punctuation or
    connecting words produced when numbers are spelled out are not filtered
    by the earlier steps — confirm this ordering is intended.
    """
    pipeline = (
        lowercase_text,
        remove_punctuation,
        remove_stopwords,
        lemmatize_text,
        convert_numbers_in_text,
    )
    for step in pipeline:
        text = step(text)
    return text

# Read a judgement file and its summary file and clean both
def access_and_preprocess_text(doc_file_path, summary_file_path):
    """Load a document/summary pair from disk and preprocess both.

    Each file is opened as UTF-8 text, read in full and passed through
    `preprocess_text`. The document is handled before the summary.

    Returns:
        (processed_document_text, processed_summary_text)
    """
    def _load_and_clean(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return preprocess_text(handle.read())

    return _load_and_clean(doc_file_path), _load_and_clean(summary_file_path)

In [12]:
# Training split: preprocessed documents (X_train) and their summaries (y_train).
X_train, y_train = [], []

# Test split: preprocessed documents (X_test) and their summaries (y_test).
X_test, y_test = [], []

In [13]:
# Directories for training and testing documents and summaries.
# The corpus root can be redirected by setting SUMMARIZATION_DATA_ROOT before
# the kernel starts; when it is unset the original local path is used, so the
# four folder variables keep exactly the values they had before.
DEFAULT_DATA_ROOT = r"C:/Users/jelly/Desktop/Sem-4/SML/Research Paper"
DATA_ROOT = os.environ.get("SUMMARIZATION_DATA_ROOT", DEFAULT_DATA_ROOT).rstrip("/\\")

train_doc_folder = f"{DATA_ROOT}/train_data/judgement"
train_summary_folder = f"{DATA_ROOT}/train_data/summary"
test_doc_folder = f"{DATA_ROOT}/test_data/judgement"
test_summary_folder = f"{DATA_ROOT}/test_data/summary"

In [14]:
# Number of training and testing documents to load for this run.
# (A previously disabled configuration used 7030 training / 100 testing documents.)
num_train_documents, num_test_documents = 500, 100

In [15]:
# Get list of training and testing document files.
# os.listdir() returns entries in arbitrary order, while the loading loops
# pair the i-th document with the i-th summary. Sorting every listing makes
# that pairing deterministic.
# NOTE(review): assumes a judgement and its summary sort to the same position
# (e.g. they share a file name) — confirm against the data folders.
train_doc_files = sorted(os.listdir(train_doc_folder))
train_summary_files = sorted(os.listdir(train_summary_folder))
test_doc_files = sorted(os.listdir(test_doc_folder))
test_summary_files = sorted(os.listdir(test_summary_folder))

In [16]:
# Load and preprocess the first `num_train_documents` document/summary pairs,
# appending the results to X_train and y_train.
for doc_idx in range(num_train_documents):
    doc_path = os.path.join(train_doc_folder, train_doc_files[doc_idx])
    summary_path = os.path.join(train_summary_folder, train_summary_files[doc_idx])
    cleaned_doc, cleaned_summary = access_and_preprocess_text(doc_path, summary_path)
    X_train.append(cleaned_doc)
    y_train.append(cleaned_summary)

In [17]:
# Load and preprocess the first `num_test_documents` document/summary pairs,
# appending the results to X_test and y_test.
for doc_idx in range(num_test_documents):
    doc_path = os.path.join(test_doc_folder, test_doc_files[doc_idx])
    summary_path = os.path.join(test_summary_folder, test_summary_files[doc_idx])
    cleaned_doc, cleaned_summary = access_and_preprocess_text(doc_path, summary_path)
    X_test.append(cleaned_doc)
    y_test.append(cleaned_summary)

In [18]:
# Hold out 25% of the loaded training pairs as a validation set (fixed seed).
# Note that X_train / y_train are rebound to the reduced training portion.
split_result = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
X_train, X_val, y_train, y_val = split_result

In [19]:
# Report the size of every split, documents first and then summaries.
split_report = (
    ("Training Documents", X_train),
    ("Validation Documents", X_val),
    ("Testing Documents", X_test),
    ("Training Summaries", y_train),
    ("Validation Summaries", y_val),
    ("Testing Summaries", y_test),
)
for label, items in split_report:
    print(f"Total {label}: {len(items)}")

Total Training Documents: 375
Total Validation Documents: 125
Total Testing Documents: 100
Total Training Summaries: 375
Total Validation Summaries: 125
Total Testing Summaries: 100


In [20]:
# TF-IDF settings shared by both vectorizers.
TFIDF_PARAMS = dict(max_features=5000, stop_words='english', ngram_range=(1, 2))

# Documents and summaries each get their own vectorizer. Previously one
# vectorizer was fitted on the documents and then re-fitted on the summaries,
# which discarded the vocabulary that X_*_tfidf had been built with.
doc_tfidf = TfidfVectorizer(**TFIDF_PARAMS)
X_train_tfidf = doc_tfidf.fit_transform(X_train)
X_val_tfidf = doc_tfidf.transform(X_val)
X_test_tfidf = doc_tfidf.transform(X_test)

# `tfidf` ends up fitted on the training summaries, exactly as before, so code
# that later calls tfidf.transform(...) on summaries sees the same model.
tfidf = TfidfVectorizer(**TFIDF_PARAMS)
y_train_tfidf = tfidf.fit_transform(y_train)
y_val_tfidf = tfidf.transform(y_val)
y_test_tfidf = tfidf.transform(y_test)

In [21]:
"""
# Load the BART model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')


# Load the T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-large')
tokenizer = T5Tokenizer.from_pretrained('t5-large')
"""
# CUDA allocator and debugging switches.
# NOTE(review): these are set after torch has been imported and queried for
# CUDA availability — confirm they still take effect at this point.
os.environ.update({
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
    'CUDA_LAUNCH_BLOCKING': "1",
    'TORCH_USE_CUDA_DSA': "1",
})

# Load the pretrained T5 checkpoint and its tokenizer
MODEL_NAME = 't5-large'
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# Place the model on the GPU when available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
# Scan every model parameter for NaN values and report each offending tensor.
# "No NaN" is printed only when nothing was found; the earlier `for ... else`
# form printed it unconditionally because the loop never used `break`.
nan_parameter_names = [
    name for name, param in model.named_parameters() if param.data.isnan().any()
]
for name in nan_parameter_names:
    print(f"NaNs detected in parameter: {name}")
if not nan_parameter_names:
    print("No NaN")

No NaN


In [23]:
# Scan every populated gradient for NaN values and report each offending tensor.
# Parameters without a gradient (param.grad is None) are skipped.
# "No NaN" is printed only when nothing was found; the earlier `for ... else`
# form printed it unconditionally because the loop never used `break`.
nan_gradient_names = [
    name
    for name, param in model.named_parameters()
    if param.grad is not None and param.grad.isnan().any()
]
for name in nan_gradient_names:
    print(f"NaNs detected in gradient: {name}")
if not nan_gradient_names:
    print("No NaN")

No NaN


In [24]:
# Run a garbage-collection pass; the cell displays the value gc.collect() returns.
gc.collect()

0

In [25]:
# Release cached blocks, then report GPU memory usage in MB.
BYTES_PER_MB = 1024 ** 2

torch.cuda.empty_cache()
allocated_memory = torch.cuda.memory_allocated()
# Reserved memory is what the caching allocator holds, including overhead
reserved_memory = torch.cuda.memory_reserved()

print(f"Allocated memory: {allocated_memory / BYTES_PER_MB:.2f} MB")
print(f"Reserved memory: {reserved_memory / BYTES_PER_MB:.2f} MB")

# Free memory is estimated as device total minus what PyTorch has reserved
total_memory = torch.cuda.get_device_properties(0).total_memory
free_memory = total_memory - reserved_memory
print(f"Free memory: {free_memory / BYTES_PER_MB:.2f} MB")

Allocated memory: 2814.48 MB
Reserved memory: 2816.00 MB
Free memory: 5371.50 MB


In [26]:
# Function to clear GPU memory
def clear_gpu_memory():
    """Free unreachable Python objects and release cached GPU memory.

    Safe to call on a machine without CUDA: garbage collection still runs and
    the CUDA calls are skipped.
    """
    # Collect first so tensors that have just become unreachable are freed
    # before the caching allocator is asked to give back its unused blocks.
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()  # ensure all queued GPU operations are finished
    torch.cuda.empty_cache()

In [27]:
def preprocess_for_training(doc_texts, summaries):
    """Tokenise documents and summaries into model inputs and labels.

    Documents are truncated/padded to 512 tokens, summaries to 150 tokens,
    using the module-level `tokenizer`. Padding positions in the summary ids
    are overwritten with -100.

    Returns:
        (inputs, labels): the tokenizer output for the documents and the
        label id tensor for the summaries.
    """
    shared_kwargs = dict(truncation=True, padding='max_length', return_tensors='pt')
    inputs = tokenizer(doc_texts, max_length=512, return_attention_mask=True, **shared_kwargs)
    target_ids = tokenizer(summaries, max_length=150, **shared_kwargs).input_ids

    pad_positions = target_ids == tokenizer.pad_token_id
    target_ids[pad_positions] = -100
    return inputs, target_ids

# Tokenise the training split first, then the validation split
(train_encodings, train_labels), (val_encodings, val_labels) = (
    preprocess_for_training(documents, references)
    for documents, references in ((X_train, y_train), (X_val, y_val))
)

# Dataset wrapper around pre-tokenised inputs and labels
class SummaryDataset(Dataset):
    """Index-addressable view over tokenizer output and the matching labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable batch of values
        # labels: indexable batch of label values, one per example
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at `idx` plus a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Create datasets
train_dataset = SummaryDataset(train_encodings, train_labels)
val_dataset = SummaryDataset(val_encodings, val_labels)

# Create DataLoaders with batch_size=1 for reduced memory usage
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)

# Define optimizer.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the defaults of the transformers implementation
# so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Define gradient accumulation steps to simulate larger batch sizes
accumulation_steps = 2  # Adjust this as needed based on memory

# NOTE(review): the recorded run of this cell stopped with a CUDA out-of-memory
# error on the first batch (8 GiB GPU, ~2.8 GB already allocated after loading
# the model). The changes below do not reduce memory use; a smaller checkpoint
# or a lower-memory fine-tuning setup is probably needed — confirm.

# Training loop with tqdm
num_epochs = 3
batches_per_epoch = len(train_loader)

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()  # Reset gradients at the start of the epoch
    progress_bar = notebook_tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)

    for i, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / accumulation_steps  # Normalize loss for gradient accumulation

        if torch.isnan(loss) or torch.isinf(loss):
            raise ValueError(f"NaN or Inf detected in loss: {loss.item()}")

        loss.backward()

        # Step every `accumulation_steps` batches, and also on the final batch
        # of the epoch so a trailing partial group is not thrown away by the
        # zero_grad() at the start of the next epoch. That trailing group is
        # still scaled by 1/accumulation_steps, i.e. slightly under-weighted.
        is_last_batch = (i + 1) == batches_per_epoch
        if (i + 1) % accumulation_steps == 0 or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()  # Clear gradients after step

        # Show the true per-batch loss rather than the value divided for accumulation
        progress_bar.set_postfix(loss=outputs.loss.item())

        del input_ids
        del attention_mask
        del labels
        del outputs
        del loss

        # Clear GPU memory
        torch.cuda.empty_cache()

    # Print GPU memory usage to monitor consumption
    print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
    print(f"Reserved memory: {torch.cuda.memory_reserved() / 1024 ** 2:.2f} MB")

    print(f'Epoch {epoch + 1} completed')


# Save the fine-tuned T5 model and tokenizer
model.save_pretrained('t5-finetuned')
tokenizer.save_pretrained('t5-finetuned')

Epoch 1/3:   0%|          | 0/375 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


OutOfMemoryError: CUDA out of memory. Tried to allocate 126.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 7.12 GiB is allocated by PyTorch, and 77.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Release cached blocks, then report GPU memory usage in MB.
BYTES_PER_MB = 1024 ** 2

torch.cuda.empty_cache()
allocated_memory = torch.cuda.memory_allocated()
# Reserved memory is what the caching allocator holds, including overhead
reserved_memory = torch.cuda.memory_reserved()

print(f"Allocated memory: {allocated_memory / BYTES_PER_MB:.2f} MB")
print(f"Reserved memory: {reserved_memory / BYTES_PER_MB:.2f} MB")

# Free memory is estimated as device total minus what PyTorch has reserved
total_memory = torch.cuda.get_device_properties(0).total_memory
free_memory = total_memory - reserved_memory
print(f"Free memory: {free_memory / BYTES_PER_MB:.2f} MB")

In [None]:
csv_file = "train_evaluation.csv"

# NOTE(review): `rouge_scores` is defined in a later cell and `summarize_chunks`
# is not defined anywhere in the visible notebook; both must already exist in
# the kernel when this cell runs.

# Open the CSV file and write the headers
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["document", "actual_summary", "predicted_summary", "cosine_similarity", "rouge_1", "rouge_2", "rouge_L"])  # Add headers

    # Iterate over 30 documents
    for i in tqdm(range(30), desc="Processing Documents", leave=False):
        print(f"\n### Document: {i+1} ###\n")

        # Generate summary for the document
        summary = summarize_chunks(X_train[i])

        # Compute cosine similarity between the reference and generated
        # summaries in the fitted `tfidf` space; [0][0] extracts the scalar.
        cosine_sim = cosine_similarity(tfidf.transform([y_train[i]]), tfidf.transform([summary]))[0][0]

        # Compute ROUGE-1 / ROUGE-2 / ROUGE-L F-scores
        rouge_1, rouge_2, rouge_L = rouge_scores(y_train[i], summary)

        # Print metrics for inspection
        print(f"\nCosine similarity: {cosine_sim}")
        print(f"ROUGE-1: {rouge_1}")
        print(f"ROUGE-2: {rouge_2}")
        print(f"ROUGE-L: {rouge_L}\n")

        # Write the data to CSV file
        writer.writerow([X_train[i], y_train[i], summary, cosine_sim, rouge_1, rouge_2, rouge_L])

        # `summary` is used as text everywhere above (TF-IDF, ROUGE, CSV), so
        # there is no tensor to move off the GPU; the former `summary.cpu()`
        # call would raise AttributeError on a string and has been removed.
        del summary, cosine_sim, rouge_1, rouge_2, rouge_L

        # Explicitly clear GPU memory and trigger garbage collection
        clear_gpu_memory()

print("Processing and evaluation completed.")

In [None]:
def rouge_scores(actual_summary, predicted_summary):
    """Return the ROUGE-1, ROUGE-2 and ROUGE-L F-scores of a predicted summary.

    Args:
        actual_summary: reference summary text.
        predicted_summary: generated summary text (the hypothesis).

    Returns:
        (rouge_1_f, rouge_2_f, rouge_l_f) as a tuple of floats. When either
        text has no scorable content (empty, whitespace only, or only full
        stops) the result is (0.0, 0.0, 0.0) instead of letting the scorer
        raise on an empty hypothesis or reference.
    """
    def _has_content(text):
        # The scorer splits on "." and drops empty pieces, so a string made
        # only of dots and whitespace counts as empty.
        return bool(text) and bool(text.replace(".", "").strip())

    if not (_has_content(actual_summary) and _has_content(predicted_summary)):
        return 0.0, 0.0, 0.0

    rouge = Rouge()
    first_pair = rouge.get_scores(predicted_summary, actual_summary)[0]
    return first_pair['rouge-1']['f'], first_pair['rouge-2']['f'], first_pair['rouge-l']['f']

In [None]:
# Function to clear GPU memory (re-definition of the helper from an earlier cell)
def clear_gpu_memory():
    """Free unreachable Python objects and release cached GPU memory.

    Safe to call on a machine without CUDA: garbage collection still runs and
    the CUDA calls are skipped.
    """
    # Collect first so tensors that have just become unreachable are freed
    # before the caching allocator is asked to give back its unused blocks.
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()  # ensure all queued GPU operations are finished
    torch.cuda.empty_cache()

In [None]:
# Free cached GPU memory before starting the evaluation runs below.
clear_gpu_memory()

In [None]:
# NOTE(review): this writes to the same train_evaluation.csv as the 30-document
# evaluation cell above and therefore replaces its contents.
# `summarize_chunks` is not defined anywhere in the visible notebook.
csv_file = "train_evaluation.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["document", "actual_summary", "predicted_summary", "cosine_similarity", "rouge_1", "rouge_2", "rouge_L"]) # Add headers
    for i in range(50):
        print(f"\n###Document: {i+1}###\n")

        # Generate summary
        summary = summarize_chunks(X_train[i])

        # Compute cosine similarity. cosine_similarity() returns a 1x1 matrix;
        # [0][0] extracts the scalar so the CSV stores a plain number instead
        # of a bracketed "[[...]]" string, matching the other evaluation cells.
        cosine_sim = cosine_similarity(tfidf.transform([y_train[i]]), tfidf.transform([summary]))[0][0]

        # Compute ROUGE scores
        rouge_1, rouge_2, rouge_L = rouge_scores(y_train[i], summary)

        # Print metrics
        print(f"\ncosine similarity: {cosine_sim}")
        print(f"rouge_1: {rouge_1}")
        print(f"rouge_2: {rouge_2}")
        print(f"rouge_L: {rouge_L}\n")

        # Write to CSV
        writer.writerow([X_train[i], y_train[i], summary, cosine_sim, rouge_1, rouge_2, rouge_L])

        # Clean up
        del summary, cosine_sim, rouge_1, rouge_2, rouge_L

        # Clear GPU memory
        clear_gpu_memory()

In [None]:
csv_file = "val_evaluation.csv"
csv_header = ["document", "actual_summary", "predicted_summary", "cosine_similarity", "rouge_1", "rouge_2", "rouge_L"]

# Evaluate the first 20 validation documents, writing one CSV row per document
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(csv_header)

    for i in tqdm(range(20), desc="Processing Documents", leave=False):
        print(f"\n### Document: {i+1} ###\n")

        # Generate the summary for this validation document
        summary = summarize_chunks(X_val[i])

        # Cosine similarity of reference vs. generated summary in the fitted `tfidf` space
        reference_vector = tfidf.transform([y_val[i]])
        predicted_vector = tfidf.transform([summary])
        cosine_sim = cosine_similarity(reference_vector, predicted_vector)[0][0]

        # ROUGE-1 / ROUGE-2 / ROUGE-L F-scores
        rouge_1, rouge_2, rouge_L = rouge_scores(y_val[i], summary)

        # Echo the metrics for inspection
        print(f"\nCosine similarity: {cosine_sim}")
        print(f"ROUGE-1: {rouge_1}")
        print(f"ROUGE-2: {rouge_2}")
        print(f"ROUGE-L: {rouge_L}\n")

        # Persist the row
        row = [X_val[i], y_val[i], summary, cosine_sim, rouge_1, rouge_2, rouge_L]
        writer.writerow(row)

        # Drop per-document values before the next iteration
        del row, reference_vector, predicted_vector
        del summary, cosine_sim, rouge_1, rouge_2, rouge_L

        # Release GPU memory and run garbage collection
        clear_gpu_memory()

print("Processing and evaluation completed.")

In [None]:
# Load the per-document training-set evaluation metrics written to train_evaluation.csv.
# NOTE(review): the `train_bart` name looks historical — the model loaded in this
# notebook is t5-large, not BART.
train_bart = pd.read_csv("train_evaluation.csv")
train_bart

In [None]:
def remove_brackets(x):
    """Return the string `x` with every '[' and ']' character removed."""
    return x.replace("[", "").replace("]", "")


# Add a numeric copy of the cosine-similarity column: render each value as
# text, strip any square brackets, then parse the result as float64.
train_bart["cosine_similarity_"] = (
    train_bart["cosine_similarity"]
    .astype(str)
    .apply(remove_brackets)
    .astype("float64")
)
train_bart
train_bart

In [None]:
# 2x2 grid of histograms (with KDE) for cosine similarity and the three ROUGE scores
cosine_similarity_scores = train_bart["cosine_similarity_"]
rouge_1_scores = train_bart["rouge_1"]
rouge_2_scores = train_bart["rouge_2"]
rouge_L_scores = train_bart["rouge_L"]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# One entry per panel: (axis, data, title, x-axis label)
panels = [
    (axes[0, 0], cosine_similarity_scores, 'Cosine Similarity Scores', 'Cosine Similarity'),
    (axes[0, 1], rouge_1_scores, 'ROUGE-1 Scores', 'ROUGE-1'),
    (axes[1, 0], rouge_2_scores, 'ROUGE-2 Scores', 'ROUGE-2'),
    (axes[1, 1], rouge_L_scores, 'ROUGE-L Scores', 'ROUGE-L'),
]

for panel_ax, panel_scores, panel_title, panel_xlabel in panels:
    sns.histplot(panel_scores, kde=True, bins=10, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Frequency')

plt.suptitle('Training Data')
plt.tight_layout()
plt.show()

In [None]:
# Mean ROUGE F-scores over the evaluated training documents
avg_rouge_1, avg_rouge_2, avg_rouge_L = (
    train_bart[column].mean() for column in ("rouge_1", "rouge_2", "rouge_L")
)

# One-row summary table.
# NOTE(review): the row label says "BART" although this notebook loads t5-large —
# confirm whether the label should be updated.
rouge_df = pd.DataFrame(
    {"ROUGE-1": [avg_rouge_1], "ROUGE-2": [avg_rouge_2], "ROUGE-L": [avg_rouge_L]},
    index=["BART"],
)
rouge_df