In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries**

In [None]:
import re
import string
import spacy
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
try:
    import contractions
except ImportError:
    !pip install contractions
    import contractions

from contractions import fix  # Ensure contractions library is installed
from contractions import fix  # Assuming you have a function to expand contractions
# Load English language model for lemmatization
nlp = spacy.load("en_core_web_sm")
!pip install rouge
!pip install bert_score 

# **Loading Dataset**

In [None]:
data = pd.read_csv('/kaggle/input/100row/YT_Summary_100_ROW.csv')  # data contains Validate Dataset
data.shape

In [None]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
data.shape

In [None]:
data.info()

# **Preprocessing**

In [None]:
# Work on an independent copy so that cleaning `df` never mutates the raw
# `data` frame (a plain `df = data` would only create a second name for it).
df = data.copy()
df.shape

In [None]:
df.isnull().sum()  # count the number of missing values (NaNs) in each column of a DataFrame df.

In [None]:
# Drop every row that has at least one missing value. dropna() is a no-op when
# nothing is missing, so no guard is needed. Rebinding (instead of inplace=True)
# keeps the cell idempotent, and the index is reset so row labels stay 0..n-1,
# matching the fresh 0..n-1 index of the summary frames built later.
df = df.dropna().reset_index(drop=True)

In [None]:
df.nunique()  #used to count the number of unique values in each column of a DataFrame df.

In [None]:
df.info()  #used to get a concise summary of a DataFrame

In [None]:
df.shape

In [None]:
# df2 is a one-column frame that carries only the 'chunks' text of df
# (same rows, same index).
df2 = df[['chunks']].copy()
df2.shape

In [None]:
df2.head(2)

In [None]:
df2.info()

# **Parsing**

## **1. Shallow Parsing**

### **1.1 POS Tagging**

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


from nltk.tokenize import sent_tokenize, word_tokenize

def shallow_parsing(chunk):
    """POS-tag a text chunk sentence by sentence.

    Args:
        chunk: The text to tag. Anything that is not a ``str`` is ignored.

    Returns:
        A list with one entry per sentence found by ``sent_tokenize``; each
        entry is whatever ``nltk.pos_tag`` returns for the word tokens of that
        sentence. An empty list is returned for non-string input.
    """
    if isinstance(chunk, str):
        return [
            nltk.pos_tag(word_tokenize(sent))
            for sent in sent_tokenize(chunk)
        ]
    # Non-string values carry no text to tag.
    return []


In [None]:
df2['pos_tagged_chunks']= df2['chunks'].apply(shallow_parsing)

In [None]:
df2.head(5)

## **2. Deep Parsing**

### **Chunk Dependency Visualization**

Here, we showcase dependency parsing using SpaCy on a selected chunk from DataFrame df2. Dependency parsing unveils grammatical relationships between words, visualized through displaCy's arrows, offering insight into the text's structural organization.

In [None]:
from spacy import displacy

# (Re)load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Text of the first row of df2's 'chunks' column.
sentence = df2['chunks'].iat[0]

# Run the pipeline on that text.
doc = nlp(sentence)

# Draw the dependency arcs inline with displaCy's compact layout.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={'compact': True},
)

### **Dependency and POS Tagging - Based Summarization of Chunked Text**

In [None]:
# Function to extract key phrases using dependency parsing
def extract_key_phrases(chunk, nlp_model=None):
    """Collect the content-bearing tokens of a text chunk.

    A token is kept when its coarse POS tag is NOUN, VERB, ADJ or ADV, or when
    its dependency label is nsubj, dobj, attr, advmod or acomp. Each token is
    kept at most once, in document order, even when it satisfies both tests
    (previously such a token was appended twice, duplicating words in the
    resulting summary).

    Args:
        chunk: Text to analyse. Non-string values yield an empty list,
            mirroring ``shallow_parsing``.
        nlp_model: Optional callable that turns text into an iterable of
            tokens exposing ``pos_``, ``dep_`` and ``text``. Defaults to the
            notebook-level spaCy pipeline ``nlp``.

    Returns:
        list[str]: Texts of the selected tokens in document order.
    """
    if not isinstance(chunk, str):
        return []
    pipeline = nlp if nlp_model is None else nlp_model
    key_pos = {"NOUN", "VERB", "ADJ", "ADV"}
    key_deps = {"nsubj", "dobj", "attr", "advmod", "acomp"}
    return [
        token.text
        for token in pipeline(chunk)
        if token.pos_ in key_pos or token.dep_ in key_deps
    ]

# Function to generate summary using dependency parsing
def generate_dependency_summary_batch(texts):
    """Build one key-phrase summary string per input text.

    Args:
        texts: Iterable of text chunks.

    Returns:
        list[str]: For each text, the output of ``extract_key_phrases`` joined
        with single spaces, in input order.
    """
    return [" ".join(extract_key_phrases(item)) for item in texts]

In [None]:
# Summarise every chunk in df2['chunks'] with the dependency/POS key-phrase
# extractor and store the results in df3. The frame is built in one step:
# growing it with pd.concat inside a loop re-copies all earlier rows on every
# iteration, and the extractor handles texts one by one, so batching added nothing.
df3 = pd.DataFrame(
    {'dependency_summary': generate_dependency_summary_batch(df2['chunks'].tolist())}
)


In [None]:
print("Original Text: ")
print(df2["chunks"].iloc[0])
print("\n\n")
print("Summary Generated via dependecy parsing: ")
print(df3["dependency_summary"].iloc[0])

### **Comparing Parsing output to the Original summary in df frame**

In [None]:
# Compute ROUGE scores of the dependency-based summaries against the reference summaries.
from rouge import Rouge
rouge = Rouge()
# Plain lists make the hypothesis/reference pairing purely positional, so it
# does not depend on the (differing) indexes of df3 and df.
rouge_scores = rouge.get_scores(
    df3['dependency_summary'].tolist(),
    df['summary'].tolist(),
    avg=True,
)

print("ROUGE Scores:")
# The loop variable is deliberately not called `score`: other cells bind that
# name to bert_score.score, and a same-named loop variable would overwrite it.
for metric, value in rouge_scores.items():
    print(f"{metric}: {value}")

In [None]:
# Compute BERTScore of the dependency-based summaries against the references.
from bert_score import score

# Collapse every run of whitespace in each summary to a single space.
reference_summaries = df['summary'].map(lambda text: ' '.join(text.split())).tolist()
generated_summaries = df3['dependency_summary'].map(lambda text: ' '.join(text.split())).tolist()

P, R, F1 = score(
    generated_summaries,
    reference_summaries,
    lang='en',
    verbose=True,
)
# Report the mean of each returned tensor.
print("BERTScore Precision:", P.mean().item())
print("BERTScore Recall:", R.mean().item())
print("BERTScore F1:", F1.mean().item())

# **BART Model With Output Given by Dependency Parsing**

1. The `dependency_summary` column of `df3` holds each chunk after dependency-parsing-based key-phrase extraction.
2. We experimented with several summarization models from Hugging Face; BART was the top performer, with the highest ROUGE score (ROUGE-1 F-score = 0.54).
3. We now evaluate whether feeding BART the dependency-parsing output (`df3['dependency_summary']`) improves its performance.

In [None]:
!pip install --upgrade transformers torch
!pip install rouge
!pip install bert_score

import torch


In [None]:
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

_BART_CHECKPOINT = "facebook/bart-large-cnn"
# Holds the tokenizer and model after the first load so repeated calls reuse them.
_bart_cache = {}


def _load_bart():
    """Return the BART (tokenizer, model) pair, loading the checkpoint only once."""
    if not _bart_cache:
        _bart_cache["tokenizer"] = BartTokenizer.from_pretrained(_BART_CHECKPOINT)
        _bart_cache["model"] = BartForConditionalGeneration.from_pretrained(_BART_CHECKPOINT)
    # Moving the model is repeated on every call so it always follows the
    # notebook-level `device`.
    return _bart_cache["tokenizer"], _bart_cache["model"].to(device)


def generate_bart_summary_batch(texts):
    """Summarise each text with facebook/bart-large-cnn.

    The tokenizer and model are loaded on first use and reused afterwards;
    previously both were re-created from the checkpoint on every call.

    Args:
        texts: Iterable of input strings. Each one is truncated to at most
            1024 tokens and summarised independently.

    Returns:
        list[str]: One decoded summary per input text, in input order. An
        empty input returns an empty list without loading the model.
    """
    texts = list(texts)
    if not texts:
        return []

    tokenizer, model = _load_bart()
    summaries = []
    for text in texts:
        # Tokenize a single text and move its tensors to the model's device.
        inputs = tokenizer(
            text, return_tensors="pt", max_length=1024, truncation=True, padding="longest"
        ).to(device)

        # Beam-search generation with the same settings as before; the
        # attention mask is passed explicitly instead of being inferred.
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=100,
            num_beams=5,
            early_stopping=True,
            min_length=30,
        )

        # Decode the best beam, dropping special tokens.
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return summaries



In [None]:
# Feed the dependency-parsing summaries (df3['dependency_summary']) to BART and
# store the generated summaries in df4. The summariser processes texts one at a
# time, so it is called once over all rows and df4 is built in a single step
# instead of being grown with pd.concat inside a loop.
df4 = pd.DataFrame(
    {'bart_summary': generate_bart_summary_batch(df3['dependency_summary'].tolist())}
)

df4.head(2)

In [None]:
# Compute ROUGE scores of the BART summaries against the reference summaries.
from rouge import Rouge

rouge = Rouge()
# Plain lists make the hypothesis/reference pairing purely positional, so it
# does not depend on the (differing) indexes of df4 and df.
rouge_scores = rouge.get_scores(
    df4['bart_summary'].tolist(),
    df['summary'].tolist(),
    avg=True,
)

print("ROUGE Scores:")
# The loop variable is deliberately not called `score`: an earlier cell binds
# that name to bert_score.score, and reusing it here would overwrite the function.
for metric, value in rouge_scores.items():
    print(f"{metric}: {value}")


In [None]:
# Compute BERTScore of the BART summaries against the references.
from bert_score import score

# Collapse every run of whitespace in each summary to a single space.
reference_summaries = df['summary'].map(lambda text: ' '.join(text.split())).tolist()
generated_summaries = df4['bart_summary'].map(lambda text: ' '.join(text.split())).tolist()

P, R, F1 = score(
    generated_summaries,
    reference_summaries,
    lang='en',
    verbose=True,
)
# Report the mean of each returned tensor.
print("BERTScore Precision:", P.mean().item())
print("BERTScore Recall:", R.mean().item())
print("BERTScore F1:", F1.mean().item())