In [None]:
# 2_Summary_Generation_and_Mood_Classification.ipynb

# 1. Import necessary libraries
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [None]:
# Pick the compute device: "cuda" when PyTorch reports a usable GPU, otherwise "cpu"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device: " + device)

Using device: cpu


In [None]:
# Mount Google Drive under /content/gdrive; the dataset path used by later cells
# lives below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# NOTE(review): `touch` creates an empty file when the path does not exist, in which
# case the pd.read_csv call on this same path would find no data to parse.
# Presumably the file already exists on Drive -- confirm this line is still needed.
!touch "/content/gdrive/My Drive/Colab Notebooks/Matcher/processed_books.csv"


In [None]:
import pandas as pd

# Location of the processed books CSV on the mounted Drive
file_path = "/content/gdrive/My Drive/Colab Notebooks/Matcher/processed_books.csv"
df = pd.read_csv(file_path)

# Report the row count, then preview the first rows as a sanity check of the load
print("Loaded dataset with {} rows".format(len(df)))
print(df.head())


Loaded dataset with 4400 rows
   Unnamed: 0           isbn                           title  \
0           0  9780689830594                   Summer Story    
1           1  9780375704970            The Lake of Darkness   
2           2  9780345446671  Beyond the Blue Event Horizon    
3           3  9780446403016               St. Peter's Fair    
4           4  9780425198773                       Twice Shy   

                    series_title series_release_number        authors  \
0                  Brambly Hedge                     2   Jill Barklem   
1                            NaN                   NaN   Ruth Rendell   
2                   Heechee Saga                     2  Frederik Pohl   
3  Chronicles of Brother Cadfael                     4   Ellis Peters   
4                            NaN                   NaN   Dick Francis   

                    publisher language  \
0                    Atheneum  English   
1  Vintage Crime/Black Lizard  English   
2            Ballant

In [None]:
class SummaryGenerator:
    """Summarise book descriptions with a Hugging Face summarization pipeline.

    Values that are not strings, or that contain fewer than ``min_length``
    whitespace-separated words, are never sent to the model; ``PLACEHOLDER``
    is returned for them instead.
    """

    # Returned in place of a summary when a description cannot be summarised.
    PLACEHOLDER = "No valid description available"

    def __init__(self, model_name="facebook/bart-large-cnn"):
        """Create the summarization pipeline.

        Args:
            model_name: Model identifier handed to ``pipeline``.
        """
        # device=0 is used when CUDA is available, -1 otherwise
        self.summarizer = pipeline(
            "summarization",
            model=model_name,
            device=0 if torch.cuda.is_available() else -1,
        )

    def generate_summary(self, text, min_length=40, max_length_cap=150):
        """Generate a concise summary of the book description.

        Args:
            text: Description to summarise. Anything that is not a ``str`` is
                treated as missing.
            min_length: Minimum number of whitespace-separated words ``text``
                must contain; also forwarded as the pipeline's ``min_length``.
            max_length_cap: Upper bound for the ``max_length`` forwarded to
                the pipeline.

        Returns:
            The ``summary_text`` of the first pipeline result, or
            ``PLACEHOLDER`` when ``text`` is missing or too short.
        """
        if not isinstance(text, str):
            return self.PLACEHOLDER

        input_length = len(text.split())
        if input_length < min_length:
            return self.PLACEHOLDER

        # 70% of the input word count, capped at max_length_cap and never
        # allowed to drop below min_length.
        max_length = max(min(int(input_length * 0.7), max_length_cap), min_length)

        summary = self.summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            # Ask the pipeline to cut inputs that exceed the model's maximum
            # input length rather than passing them through unshortened.
            truncation=True,
        )
        return summary[0]['summary_text']

    def batch_generate(self, texts, batch_size=8):
        """Generate summaries for a list of texts.

        The result always has exactly one entry per input, in input order:
        texts that cannot be summarised yield ``PLACEHOLDER`` at their own
        position, so the output can be assigned straight back onto the rows
        the texts came from.

        Args:
            texts: Sequence of descriptions (non-string entries are allowed).
            batch_size: Number of texts handled per progress-bar step.

        Returns:
            A list of summaries with ``len(result) == len(texts)``.
        """
        summaries = []

        # Use tqdm for progress tracking, one tick per batch
        for i in tqdm(range(0, len(texts), batch_size), desc="Generating summaries"):
            # generate_summary applies the validity rule itself, so invalid
            # entries keep their slot instead of being filtered out here.
            for text in texts[i:i + batch_size]:
                summaries.append(self.generate_summary(text))

        return summaries

# Create the summarizer used by the rest of the notebook
summary_generator = SummaryGenerator()

# Try the generator on the first few descriptions before scaling up
sample_size = 10
sample_descriptions = df['description'].head(sample_size).tolist()

print("\nGenerating sample summaries...")
sample_summaries = []

for idx, desc in enumerate(sample_descriptions, start=1):
    # Show at most 300 characters of a string description; other values are printed unchanged
    if isinstance(desc, str) and len(desc) > 300:
        preview = desc[:300] + "..."
    else:
        preview = desc
    print(f"\nOriginal Description {idx}:")
    print(preview)

    summary = summary_generator.generate_summary(desc)
    sample_summaries.append(summary)

    print(f"\nGenerated Summary {idx}:")
    print(summary)

Device set to use cpu



Generating sample summaries...

Original Description 1:
It was such a hot summer. The sky was deep blue and the sun never faltered.All along Brambly Hedge, the mice did their best to keep cool. Poppy Eyebright sought refuge in the mossy shadows of the mill wheel; Dusty Dogwood took to walking by the banks of the cooling stream. Dusty and Poppy spent more...

Generated Summary 1:
Poppy Eyebright and Dusty Dogwood got engaged. They decided on a very unusual setting for the wedding ceremony. Even they didn't realize just how unusual it would prove to be.

Original Description 2:
Martin Urban is a quiet bachelor with a comfortable life, free of worry and distractions. When he unexpectedly comes into a small fortune, he decides to use his newfound wealth to help out those in need. Finn also leads a quiet life, and comes into a little money of his own. Normally, their paths wo...

Generated Summary 2:
Ruth Rendell takes the old adage that no good deed goes unpunished to a startling, hauntin

In [None]:
# 4. Mood Analysis using NLTK's Sentiment Analyzer

# Initialize the sentiment analyzer
# Download the 'vader_lexicon' NLTK resource, then create the module-level
# analyzer instance (`sia`) that analyze_sentiment() calls.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Sentiment scoring helper
def analyze_sentiment(text):
    """Return polarity scores for ``text`` from the module-level analyzer ``sia``.

    Non-string values and the "No valid description available" placeholder
    are not scored; an all-zero dictionary with the keys ``compound``,
    ``neg``, ``neu`` and ``pos`` is returned for them.
    """
    has_content = isinstance(text, str) and text != "No valid description available"
    if has_content:
        return sia.polarity_scores(text)

    # Nothing to score: report zero for every component
    return {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}

# Print the four polarity scores of every sample summary
score_labels = (("Positive", 'pos'), ("Negative", 'neg'), ("Neutral", 'neu'), ("Compound", 'compound'))
for position, summary in enumerate(sample_summaries, start=1):
    sentiment = analyze_sentiment(summary)
    print(f"\nSummary {position} Sentiment:")
    for label, key in score_labels:
        print(f"{label}: {sentiment[key]:.3f}")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...



Summary 1 Sentiment:
Positive: 0.088
Negative: 0.000
Neutral: 0.912
Compound: 0.402

Summary 2 Sentiment:
Positive: 0.257
Negative: 0.142
Neutral: 0.601
Compound: 0.818

Summary 3 Sentiment:
Positive: 0.027
Negative: 0.075
Neutral: 0.898
Compound: -0.459

Summary 4 Sentiment:
Positive: 0.000
Negative: 0.000
Neutral: 0.000
Compound: 0.000

Summary 5 Sentiment:
Positive: 0.000
Negative: 0.000
Neutral: 0.000
Compound: 0.000

Summary 6 Sentiment:
Positive: 0.073
Negative: 0.070
Neutral: 0.857
Compound: 0.026

Summary 7 Sentiment:
Positive: 0.210
Negative: 0.000
Neutral: 0.790
Compound: 0.710

Summary 8 Sentiment:
Positive: 0.096
Negative: 0.000
Neutral: 0.904
Compound: 0.494

Summary 9 Sentiment:
Positive: 0.202
Negative: 0.041
Neutral: 0.757
Compound: 0.852

Summary 10 Sentiment:
Positive: 0.000
Negative: 0.000
Neutral: 0.000
Compound: 0.000


In [None]:
# 5. Mood Classification System

# Mood vocabulary: each mood name maps to the keywords that count as evidence for it
mood_categories = dict(
    happy=[
        "happy", "joy", "funny", "humor", "comedy", "light", "uplifting", "delightful",
    ],
    sad=[
        "sad", "sorrow", "grief", "melancholy", "tragic", "heart-breaking", "emotional",
    ],
    motivational=[
        "inspire", "motivate", "achieve", "success", "overcome", "empower", "growth",
    ],
    adventurous=[
        "adventure", "exciting", "thrill", "journey", "quest", "action", "danger",
    ],
    relaxing=[
        "calm", "relax", "peaceful", "gentle", "soothing", "comfort", "serene",
    ],
    intellectual=[
        "thought-provoking", "philosophical", "complex", "profound", "intelligent", "academic",
    ],
    escapist=[
        "fantasy", "magical", "imaginary", "enchanting", "fairy-tale", "otherworldly",
    ],
)

# Function to match text to moods based on keywords
def classify_mood_by_keywords(text, mood_dict, threshold=0):
    """Classify text into moods based on keyword matching.

    A keyword counts once for its mood when it occurs in ``text`` at the
    start of a word, compared case-insensitively. Anchoring at the word start
    keeps inflected forms ("thrill" -> "thrilling") while rejecting
    accidental hits inside unrelated words ("light" in "slight", "joy" in
    "enjoyed", "action" in "transaction").

    Args:
        text: Text to classify; any non-string value yields an empty result.
        mood_dict: Mapping of mood name to an iterable of keyword strings.
        threshold: A mood is reported only when its number of matching
            keywords is strictly greater than this value.

    Returns:
        A list of ``(mood, score)`` tuples sorted by score, highest first.
        Moods with equal scores keep the order they have in ``mood_dict``.
    """
    if not isinstance(text, str):
        return []

    text = text.lower()
    mood_scores = {}

    for mood, keywords in mood_dict.items():
        score = sum(
            1
            for keyword in keywords
            # (?<!\w): the keyword must not be preceded by a word character
            if re.search(r"(?<!\w)" + re.escape(keyword.lower()), text)
        )
        if score > threshold:
            mood_scores[mood] = score

    # Sort by score in descending order (the sort is stable, so ties keep dict order)
    return sorted(mood_scores.items(), key=lambda item: item[1], reverse=True)

# Combine sentiment scores with keyword matches into one mood ranking
def determine_book_mood(description, summary):
    """Rank the moods of a book from the sentiment and keywords of its summary.

    Args:
        description: Accepted for the caller's convenience; not read here.
        summary: Text passed to ``analyze_sentiment`` and to
            ``classify_mood_by_keywords`` (with ``mood_categories``).

    Returns:
        A tuple ``(ranked_moods, sentiment)`` where ``ranked_moods`` is a list
        of ``(mood, score)`` pairs sorted by score, highest first, and
        ``sentiment`` is the dictionary returned by ``analyze_sentiment``.
    """
    sentiment = analyze_sentiment(summary)
    keyword_moods = classify_mood_by_keywords(summary, mood_categories)

    # Seed the scores from the compound sentiment: clearly positive text
    # favours happy/motivational, clearly negative text favours sad.
    compound = sentiment['compound']
    if compound > 0.5:
        mood_scores = {'happy': 2, 'motivational': 1}
    elif compound < -0.5:
        mood_scores = {'sad': 2}
    else:
        mood_scores = {}

    # Keyword hits are added on top of any sentiment seed
    for mood, score in keyword_moods:
        mood_scores[mood] = mood_scores.get(mood, 0) + score

    ranked = sorted(mood_scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked, sentiment

# Try the mood classifier on the sample descriptions and their summaries
for book_number, (desc, summary) in enumerate(zip(sample_descriptions, sample_summaries), start=1):
    print(f"\nBook {book_number}:")
    print(f"Summary: {summary[:100]}...")

    moods, sentiment = determine_book_mood(desc, summary)

    print("Detected moods:")
    for mood_name, mood_score in moods:
        print(f"  {mood_name}: {mood_score}")

    compound, positive, negative = sentiment['compound'], sentiment['pos'], sentiment['neg']
    print(f"Sentiment: {compound:.3f} (positive: {positive:.2f}, negative: {negative:.2f})")


Book 1:
Summary: Poppy Eyebright and Dusty Dogwood got engaged. They decided on a very unusual setting for the weddin...
Detected moods:
Sentiment: 0.402 (positive: 0.09, negative: 0.00)

Book 2:
Summary: Ruth Rendell takes the old adage that no good deed goes unpunished to a startling, haunting conclusi...
Detected moods:
  happy: 2
  motivational: 1
  adventurous: 1
  relaxing: 1
Sentiment: 0.818 (positive: 0.26, negative: 0.14)

Book 3:
Summary: Robinette Broadhead is on his way to making a fortune by bankrolling an expedition to the Food Facto...
Detected moods:
Sentiment: -0.459 (positive: 0.03, negative: 0.07)

Book 4:
Summary: No valid description available...
Detected moods:
Sentiment: 0.000 (positive: 0.00, negative: 0.00)

Book 5:
Summary: No valid description available...
Detected moods:
Sentiment: 0.000 (positive: 0.00, negative: 0.00)

Book 6:
Summary: Master storyteller Robin McKinley spins two new fairy tales and retells two cherished classics. All ...
Detected moods:
S

In [None]:
# 6. Scale up to process more books

# Set how many books to process (adjust based on your compute resources)
num_books_to_process = 100  # Start with a small number and increase as needed

# Select books to process. Take an explicit copy: the columns added below would
# otherwise be written onto a slice of df, which triggers pandas'
# SettingWithCopyWarning.
books_to_process = df.head(num_books_to_process).copy()

# Generate summaries in batches
# Report the number of rows actually selected (fewer than requested if df is shorter)
print(f"\nGenerating summaries for {len(books_to_process)} books...")
batch_descriptions = books_to_process['description'].tolist()
batch_summaries = summary_generator.batch_generate(batch_descriptions, batch_size=8)

# Each row needs exactly one summary; fail with a clear message if they do not line up
if len(batch_summaries) != len(books_to_process):
    raise ValueError(
        f"Expected {len(books_to_process)} summaries but got {len(batch_summaries)}; "
        "summaries would be misaligned with their books"
    )

# Store the summaries
books_to_process['summary'] = batch_summaries

# Analyze moods
print("\nClassifying book moods...")
mood_results = []
sentiment_results = []

# Only two columns are needed, so iterate over them directly instead of building row objects
book_texts = zip(books_to_process['description'], books_to_process['summary'])
for description, summary in tqdm(book_texts, total=len(books_to_process), desc="Analyzing moods"):
    moods, sentiment = determine_book_mood(description, summary)
    # The top-ranked mood is the primary one; "unknown" when nothing was detected
    primary_mood = moods[0][0] if moods else "unknown"
    mood_results.append(primary_mood)
    sentiment_results.append(sentiment['compound'])

# Add results to the dataframe
books_to_process['primary_mood'] = mood_results
books_to_process['sentiment_score'] = sentiment_results