In [1]:
# 2_Summary_Generation_and_Mood_Classification.ipynb

# 1. Import necessary libraries
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [2]:
# Pick the compute device: the GPU when PyTorch reports CUDA, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


In [3]:
# 2. Load the processed dataset
# Read the processed book table from disk and report how many rows it holds.
df = pd.read_csv(filepath_or_buffer='processed_books.csv')
print(f"Loaded dataset with {df.shape[0]} books")

Loaded dataset with 4400 books


In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from tqdm.auto import tqdm

# Select the compute device; SummaryGenerator below reads this module-level name.
device = "cpu" if not torch.cuda.is_available() else "cuda"
print(f"Device set to use {device}")

# 3. Summary Generation with BART

class SummaryGenerator:
    """Summarize book descriptions with a Hugging Face seq2seq checkpoint.

    The class wraps a ``summarization`` pipeline. Inputs that are not strings,
    or that contain fewer whitespace-separated words than ``min_length``, are
    never sent to the model; ``PLACEHOLDER`` is returned for them instead.

    Relies on the module-level ``device`` string ("cuda" or "cpu").
    """

    # Returned in place of a summary when the input cannot be summarized.
    PLACEHOLDER = "No valid description available"

    def __init__(self, model_name="facebook/bart-large-cnn"):
        """Load the tokenizer and model for ``model_name`` and build the pipeline."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        self.summarizer = pipeline(
            "summarization",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if device == "cuda" else -1,
        )

    @staticmethod
    def _is_summarizable(text, min_length):
        """Return True when ``text`` is a string with at least ``min_length`` words."""
        return isinstance(text, str) and len(text.split()) >= min_length

    def generate_summary(self, text, max_length=150, min_length=40):
        """Generate a concise summary of the book description.

        Args:
            text: Description to summarize; anything that is not a string is
                treated as missing.
            max_length: Upper bound on the generated summary length.
            min_length: Lower bound on the generated summary length, also used
                as the minimum number of words ``text`` must contain.

        Returns:
            The summary text, or ``PLACEHOLDER`` when ``text`` is unusable.
        """
        if not self._is_summarizable(text, min_length):
            return self.PLACEHOLDER

        # Do not ask for a summary longer than the input (word count is a rough
        # proxy for token count), but never let max_length drop below
        # min_length, which would make the length constraints unsatisfiable.
        max_length = max(min_length, min(max_length, len(text.split())))

        summary = self.summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,  # cut inputs that exceed the model's input limit
        )
        return summary[0]['summary_text']

    def batch_generate(self, texts, batch_size=8, max_length=150, min_length=40):
        """Generate summaries for a list of texts.

        The result always has exactly one entry per input, in input order:
        unusable inputs map to ``PLACEHOLDER`` so the list can be assigned
        straight back to the DataFrame column the texts came from.

        Args:
            texts: Sequence of descriptions (strings or missing values).
            batch_size: Number of texts handed to the pipeline per call.
            max_length: Upper bound on each generated summary length.
            min_length: Lower bound on each generated summary length, also the
                minimum word count for a text to be summarized.

        Returns:
            List of summary strings with ``len(result) == len(texts)``.
        """
        summaries = []

        # Use tqdm for progress tracking
        for start in tqdm(range(0, len(texts), batch_size), desc="Generating summaries"):
            batch = texts[start:start + batch_size]

            # Remember where each usable text sits so its summary can be put
            # back in the same slot; every other slot keeps the placeholder.
            valid_positions = [
                position for position, text in enumerate(batch)
                if self._is_summarizable(text, min_length)
            ]
            batch_results = [self.PLACEHOLDER] * len(batch)

            if valid_positions:
                outputs = self.summarizer(
                    [batch[position] for position in valid_positions],
                    max_length=max_length,
                    min_length=min_length,
                    do_sample=False,
                    truncation=True,  # cut inputs that exceed the model's input limit
                )
                for position, output in zip(valid_positions, outputs):
                    batch_results[position] = output['summary_text']

            summaries.extend(batch_results)

        return summaries

# Build the summarizer once; loading the checkpoint is the expensive part.
summary_generator = SummaryGenerator()

# Try the generator on a handful of rows before committing to a full run.
sample_size = 10
sample_descriptions = df['description'].head(sample_size).tolist()  # Use 'description' instead of 'Description'

print("\nGenerating sample summaries...")
sample_summaries = []

for position, description in enumerate(sample_descriptions, start=1):
    print(f"\nOriginal Description {position}:")
    # Show only the first 300 characters of long descriptions; anything else
    # (short text or a missing value) is printed as-is.
    if isinstance(description, str) and len(description) > 300:
        print(description[:300] + "...")
    else:
        print(description)

    generated = summary_generator.generate_summary(description)
    sample_summaries.append(generated)

    print(f"\nGenerated Summary {position}:")
    print(generated)

Device set to use cpu


Device set to use cpu



Generating sample summaries...

Original Description 1:
It was such a hot summer. The sky was deep blue and the sun never faltered.All along Brambly Hedge, the mice did their best to keep cool. Poppy Eyebright sought refuge in the mossy shadows of the mill wheel; Dusty Dogwood took to walking by the banks of the cooling stream. Dusty and Poppy spent more...

Generated Summary 1:
Poppy Eyebright and Dusty Dogwood got engaged. They decided on a very unusual setting for the wedding ceremony. Even they didn't realize just how unusual it would prove to be.

Original Description 2:
Martin Urban is a quiet bachelor with a comfortable life, free of worry and distractions. When he unexpectedly comes into a small fortune, he decides to use his newfound wealth to help out those in need. Finn also leads a quiet life, and comes into a little money of his own. Normally, their paths wo...

Generated Summary 2:
Ruth Rendell takes the old adage that no good deed goes unpunished to a startling, hauntin

In [25]:
# 4. Mood Analysis using NLTK's Sentiment Analyzer

# Initialize the sentiment analyzer
# VADER reads its lexicon from the local NLTK data directory. Only contact the
# download server when the lexicon is actually missing, so re-running this cell
# needs no network access and prints no download log.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Function to analyze sentiment
def analyze_sentiment(text):
    """Return VADER polarity scores for ``text``.

    Non-string input and the "No valid description available" placeholder are
    not scored; they yield a dict with all four scores set to 0. Otherwise the
    result of ``sia.polarity_scores(text)`` is returned unchanged.
    """
    has_real_text = isinstance(text, str) and text != "No valid description available"
    if has_real_text:
        return sia.polarity_scores(text)

    # Nothing to score: report all-zero values under the same four keys.
    return dict(compound=0, neg=0, neu=0, pos=0)

# Print the four VADER scores for every sample summary.
for number, summary_text in enumerate(sample_summaries, start=1):
    scores = analyze_sentiment(summary_text)
    print(f"\nSummary {number} Sentiment:")
    print(f"Positive: {scores['pos']:.3f}")
    print(f"Negative: {scores['neg']:.3f}")
    print(f"Neutral: {scores['neu']:.3f}")
    print(f"Compound: {scores['compound']:.3f}")


Summary 1 Sentiment:
Positive: 0.088
Negative: 0.000
Neutral: 0.912
Compound: 0.402

Summary 2 Sentiment:
Positive: 0.257
Negative: 0.142
Neutral: 0.601
Compound: 0.818

Summary 3 Sentiment:
Positive: 0.027
Negative: 0.075
Neutral: 0.898
Compound: -0.459

Summary 4 Sentiment:
Positive: 0.000
Negative: 0.000
Neutral: 0.000
Compound: 0.000

Summary 5 Sentiment:
Positive: 0.000
Negative: 0.000
Neutral: 0.000
Compound: 0.000

Summary 6 Sentiment:
Positive: 0.073
Negative: 0.070
Neutral: 0.857
Compound: 0.026

Summary 7 Sentiment:
Positive: 0.147
Negative: 0.000
Neutral: 0.853
Compound: 0.710

Summary 8 Sentiment:
Positive: 0.096
Negative: 0.000
Neutral: 0.904
Compound: 0.494

Summary 9 Sentiment:
Positive: 0.202
Negative: 0.041
Neutral: 0.757
Compound: 0.852

Summary 10 Sentiment:
Positive: 0.000
Negative: 0.000
Neutral: 0.000
Compound: 0.000


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\arpit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# 5. Mood Classification System

# Mood label -> keywords whose presence in a text counts as evidence for that
# mood. Insertion order matters: ties between moods are resolved in this order.
mood_categories = dict(
    happy=[
        "happy", "joy", "funny", "humor",
        "comedy", "light", "uplifting", "delightful",
    ],
    sad=[
        "sad", "sorrow", "grief", "melancholy",
        "tragic", "heart-breaking", "emotional",
    ],
    motivational=[
        "inspire", "motivate", "achieve", "success",
        "overcome", "empower", "growth",
    ],
    adventurous=[
        "adventure", "exciting", "thrill", "journey",
        "quest", "action", "danger",
    ],
    relaxing=[
        "calm", "relax", "peaceful", "gentle",
        "soothing", "comfort", "serene",
    ],
    intellectual=[
        "thought-provoking", "philosophical", "complex",
        "profound", "intelligent", "academic",
    ],
    escapist=[
        "fantasy", "magical", "imaginary",
        "enchanting", "fairy-tale", "otherworldly",
    ],
)

# Function to match text to moods based on keywords
def classify_mood_by_keywords(text, mood_dict, threshold=0):
    """Classify text into moods based on keyword matching.

    A keyword counts once per mood when it occurs in ``text`` at the start of
    a word, compared case-insensitively. Anchoring at the word start keeps
    inflected forms ("inspire" still matches "inspired") while rejecting
    matches buried inside unrelated words ("sad" in "crusade", "light" in
    "flight", "comfort" in "uncomfortable"). Words that merely begin with a
    keyword (e.g. "question" for "quest") are still counted.

    Args:
        text: Text to classify; anything that is not a string yields ``[]``.
        mood_dict: Mapping of mood name to an iterable of keyword strings.
        threshold: A mood is reported only when its number of matching
            keywords is strictly greater than this value.

    Returns:
        List of ``(mood, score)`` tuples sorted by score, highest first; moods
        with equal scores keep their ``mood_dict`` order.
    """
    if not isinstance(text, str):
        return []

    text = text.lower()
    mood_scores = {}

    for mood, keywords in mood_dict.items():
        score = sum(
            1 for keyword in keywords
            # (?<!\w) = not preceded by a word character, i.e. a word start
            if re.search(r"(?<!\w)" + re.escape(keyword.lower()), text)
        )
        if score > threshold:
            mood_scores[mood] = score

    # Sort by score in descending order (stable, so ties keep dict order)
    return sorted(mood_scores.items(), key=lambda item: item[1], reverse=True)

# Function to combine sentiment and keyword analysis
def determine_book_mood(description, summary):
    """Determine the mood of a book using both sentiment and keyword analysis.

    The summary is analysed when one exists. When ``summary`` is missing or is
    the "No valid description available" placeholder, ``description`` is
    analysed instead, so books without a generated summary can still receive
    a mood rather than always coming back empty.

    Scoring:
        * compound sentiment > 0.5  -> "happy" +2 and "motivational" +1
        * compound sentiment < -0.5 -> "sad" +2
        * each mood additionally receives its keyword-match score from
          ``classify_mood_by_keywords`` against ``mood_categories``.

    Args:
        description: Original book description (string or missing value).
        summary: Generated summary, or the placeholder string.

    Returns:
        ``(sorted_moods, sentiment)`` where ``sorted_moods`` is a list of
        ``(mood, score)`` tuples ordered by score, highest first, and
        ``sentiment`` is the dict returned by ``analyze_sentiment`` for the
        analysed text.
    """
    summary_usable = isinstance(summary, str) and summary != "No valid description available"
    text = summary if summary_usable else description

    # Get sentiment scores
    sentiment = analyze_sentiment(text)

    # Get keyword-based mood classification
    keyword_moods = classify_mood_by_keywords(text, mood_categories)

    # Combine the approaches
    mood_scores = {}

    # Strongly positive sentiment boosts happy and motivational;
    # strongly negative sentiment boosts sad.
    if sentiment['compound'] > 0.5:
        mood_scores['happy'] = 2
        mood_scores['motivational'] = 1
    elif sentiment['compound'] < -0.5:
        mood_scores['sad'] = 2

    # Add scores from keyword matching
    for mood, score in keyword_moods:
        mood_scores[mood] = mood_scores.get(mood, 0) + score

    # Sort by final score
    sorted_moods = sorted(mood_scores.items(), key=lambda item: item[1], reverse=True)

    return sorted_moods, sentiment

# Run the combined mood classifier over the sample and print what it found.
sample_pairs = zip(sample_descriptions, sample_summaries)
for book_number, (description, summary_text) in enumerate(sample_pairs, start=1):
    print(f"\nBook {book_number}:")
    print(f"Summary: {summary_text[:100]}...")

    ranked_moods, scores = determine_book_mood(description, summary_text)

    print("Detected moods:")
    for mood_name, mood_score in ranked_moods:
        print(f"  {mood_name}: {mood_score}")

    print(f"Sentiment: {scores['compound']:.3f} (positive: {scores['pos']:.2f}, negative: {scores['neg']:.2f})")

In [None]:
def adjust_max_length(input_length):
    """Return a summary length budget of 35% of ``input_length``, at least 20.

    The product is truncated to an integer before the floor of 20 is applied.
    """
    return max(int(input_length * 0.35), 20)

# 6. Generate summaries and primary moods for the books to process

# No earlier cell in this notebook defines `books_to_process`. Default to the
# whole dataset so a fresh-kernel "Run All" works; assign a smaller frame
# (e.g. df.head(200).copy()) beforehand for a faster run.
if 'books_to_process' not in globals():
    books_to_process = df.copy()

batch_descriptions = books_to_process['description'].tolist()

# Smallest max_length we ever request. It equals generate_summary's default
# min_length, so the length constraints passed to the model stay satisfiable.
MIN_SUMMARY_LENGTH = 40

# One summary per description, in row order. generate_summary is called per
# row because batch_generate takes a batch size, not per-text length limits
# (passing the list of lengths there raised a TypeError).
batch_summaries = []
for desc in tqdm(batch_descriptions, desc="Generating summaries"):
    # Size the summary from the word count (a rough proxy for tokens), not the
    # character count; missing descriptions are floats (NaN) and have no len().
    word_count = len(desc.split()) if isinstance(desc, str) else 0
    max_length = max(adjust_max_length(word_count), MIN_SUMMARY_LENGTH)
    batch_summaries.append(
        summary_generator.generate_summary(desc, max_length=max_length)
    )

# Store the summaries
books_to_process['summary'] = batch_summaries

# The plotting, saving and recommendation cells below all read a
# 'primary_mood' column: use the top-ranked mood of each book, or "unknown"
# when neither sentiment nor keywords produced any mood.
primary_moods = []
for desc, summary in zip(batch_descriptions, batch_summaries):
    moods, _ = determine_book_mood(desc, summary)
    primary_moods.append(moods[0][0] if moods else "unknown")
books_to_process['primary_mood'] = primary_moods


In [None]:
# 7. Visualize mood distribution
# Horizontal bar chart: number of books per primary mood.
mood_fig, mood_ax = plt.subplots(figsize=(12, 6))
sns.countplot(y=books_to_process['primary_mood'], ax=mood_ax)
mood_ax.set_title('Distribution of Primary Book Moods')
mood_ax.set_xlabel('Count')
mood_ax.set_ylabel('Mood')
mood_fig.tight_layout()
plt.show()

# Relationship between mood and rating
# Box plot of rating per mood; x labels are tilted so they do not overlap.
rating_fig, rating_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='primary_mood', y='rating_score', data=books_to_process, ax=rating_ax)  # Use 'rating_score' instead of 'Rating'
rating_ax.set_title('Book Ratings by Primary Mood')
rating_ax.set_xlabel('Mood')
rating_ax.set_ylabel('Rating')
rating_ax.tick_params(axis='x', labelrotation=45)
rating_fig.tight_layout()
plt.show()

In [None]:
# 8. Save processed data
# Write the enriched table to CSV without the DataFrame index column.
books_to_process.to_csv(path_or_buf='books_with_summaries_and_moods.csv', index=False)
print("\nProcessed book data saved to 'books_with_summaries_and_moods.csv'")

In [None]:
def recommend_books_by_mood(books_df, user_mood, top_n=5):
    """Recommend books based on user's current mood.

    Every mood in ``mood_categories`` is scored against ``user_mood`` with
    ``classify_mood_by_keywords``; the mood's own name counts as one more
    keyword. The highest-scoring mood wins, ties go to the mood listed first
    in ``mood_categories``, and "happy" is used when nothing matches or when
    ``user_mood`` is not a string.

    Args:
        books_df: DataFrame with 'primary_mood' and 'rating_score' columns.
        user_mood: Free-text description of how the user feels.
        top_n: Maximum number of books to return.

    Returns:
        ``(recommendations, matching_mood)``: the ``top_n`` highest-rated rows
        of ``books_df`` whose 'primary_mood' equals the chosen mood, and the
        chosen mood name.
    """
    # Treat each mood's own name as a search term alongside its keywords;
    # dict.fromkeys drops the duplicate when the name is already a keyword.
    search_terms = {
        mood: list(dict.fromkeys([mood, *keywords]))
        for mood, keywords in mood_categories.items()
    }

    # Ranked best-first; empty when nothing matched or the input is not text.
    ranked_moods = classify_mood_by_keywords(user_mood, search_terms)

    # Default to "happy" if no match is found
    matching_mood = ranked_moods[0][0] if ranked_moods else "happy"

    print(f"Based on your input, we'll show books with a '{matching_mood}' mood.")

    # Filter books by the matching mood
    matching_books = books_df[books_df['primary_mood'] == matching_mood]

    # Sort by rating and return top N recommendations
    recommendations = matching_books.sort_values('rating_score', ascending=False).head(top_n)

    return recommendations, matching_mood


# Try the recommender with a free-text mood description.
test_mood = "I'm feeling like I need some adventure today"
recommendations, matched_mood = recommend_books_by_mood(books_to_process, test_mood)

# Print a numbered list: title and author, rating, and the first 150
# characters of the summary.
print(f"\nTop book recommendations for '{test_mood}':")
rank = 0
for _, book in recommendations.iterrows():
    rank += 1
    print(f"\n{rank}. {book['title']} by {book['authors']}")
    print(f"   Rating: {book['rating_score']}")
    print(f"   Summary: {book['summary'][:150]}...")