In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key that OpenAIEmbeddings()
# needs in a later cell — confirm the expected variable name.
load_dotenv()

In [None]:
import pandas as pd

# Load the book table. Later cells read its "isbn13", "description",
# "categories" and "tagged_description" columns.
books = pd.read_csv("books_cleaned.csv")

In [None]:
#books["tagged_description"]

In [None]:
# Dump the tagged descriptions to a plain-text file, one record per row,
# with neither the index nor a header line.
# NOTE(review): CSV quoting may wrap some rows in double quotes, which is
# presumably why the ISBN parsing further down strips a leading '"'.
books["tagged_description"].to_csv(
    "tagged_description.txt",
    sep="\n",
    index=False,
    header=False,
)

In [None]:
# Read the exported descriptions back in. The encoding is pinned to UTF-8
# (what pandas writes by default) so the load does not depend on the
# platform's default text encoding.
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()

# Split on every newline so each description line becomes its own document.
# chunk_size=1 is deliberately tiny so that no two lines are merged into
# one chunk.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

In [None]:
#documents[0]

In [None]:
# Embed every description document and index it in a Chroma vector store.
db_books = Chroma.from_documents(documents, embedding=OpenAIEmbeddings())

In [None]:
# Example query: fetch the 10 stored descriptions closest to the query text.
query = "A book to teach about space"
docs = db_books.similarity_search(query, k = 10)

In [None]:
# Inspect the documents retrieved for the example query.
docs

In [None]:
# Map the best-matching document back to its row in `books`.
# The first whitespace-delimited token of the document text is parsed as the
# ISBN-13; a leading CSV quote character is stripped first (as the
# recommendation helper below does) so int() does not fail on quoted rows.
top_isbn = int(docs[0].page_content.strip('"').split()[0])
books[books["isbn13"] == top_isbn]

In [None]:
def retrieve_recommendations(query: str, top_k: int = 10, initial_top_k: int = 50) -> pd.DataFrame:
    """Return the books that best match ``query``, best match first.

    Args:
        query: Free-text description of the kind of book wanted.
        top_k: Maximum number of rows to return.
        initial_top_k: Number of candidate documents requested from the
            vector store before they are matched back to ``books``.

    Returns:
        Rows of the module-level ``books`` frame whose ``isbn13`` appears
        among the retrieved documents, in the order the vector store
        returned them, truncated to ``top_k`` rows.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # The first whitespace-delimited token of each document is parsed as the
    # ISBN-13 (after dropping a leading CSV quote). Remember the first
    # position at which each ISBN was returned.
    rank_by_isbn = {}
    for position, rec in enumerate(recs):
        isbn = int(rec.page_content.strip('"').split()[0])
        rank_by_isbn.setdefault(isbn, position)

    matches = books[books["isbn13"].isin(list(rank_by_isbn))]

    # A boolean mask yields rows in `books` order, which would throw away
    # the retrieval ranking; restore it before truncating to top_k.
    ranked = matches.sort_values(
        by="isbn13", key=lambda col: col.map(rank_by_isbn), kind="stable"
    )
    return ranked.head(top_k)

In [None]:
# Try the helper on the same example query used for the raw similarity search.
retrieve_recommendations("A book to teach about space")

In [None]:
# List the categories that occur more than 50 times.
# The columns are named explicitly because value_counts() labels its result
# differently across pandas major versions; relying on an implicit "count"
# column makes the query fail on pandas < 2.0.
(
    books["categories"]
    .value_counts()
    .rename_axis("categories")
    .reset_index(name="count")
    .query("count > 50")
)

In [None]:
# Collapse the most frequent raw categories into a handful of coarse buckets.
category_mapping = {
    "Fiction": "Fiction",
    "Juvenile Fiction": "Children's Fiction",
    "Biography & Autobiography": "Nonfiction",
    "History": "Nonfiction",
    "Literary Criticism": "Nonfiction",
    "Philosophy": "Nonfiction",
    "Religion": "Nonfiction",
    "Comics & Graphic Novels": "Fiction",
    "Drama": "Fiction",
    "Juvenile Nonfiction": "Children's Nonfiction",
    "Science": "Nonfiction",
    "Poetry": "Fiction",
}

# Categories absent from the mapping become NaN and are filled in later.
books["simple_category"] = books["categories"].map(category_mapping)

In [None]:
# Show only the books that already received a simple category.
books[books["simple_category"].notna()]

# The transformers library makes it easy to work with LLMs, including models hosted on Hugging Face.

In [None]:
from transformers import pipeline
import torch

# Candidate labels for the zero-shot classifier.
fiction_categories = ["Fiction", "Nonfiction"]

# Prefer Apple's MPS backend when it is available, then CUDA, then CPU, so
# the notebook also runs on machines without an Apple GPU.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    pipe_device = "mps"
elif torch.cuda.is_available():
    pipe_device = "cuda"
else:
    pipe_device = "cpu"

pipe = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=pipe_device,
)


In [None]:
# Take the first description among the books already labelled "Fiction".
fiction_descriptions = books.loc[books["simple_category"].eq("Fiction"), "description"]
sequence = fiction_descriptions.reset_index(drop=True).loc[0]

In [None]:
# Classify the sample description against the two candidate labels.
pipe(sequence, fiction_categories)

In [None]:
import numpy as np

# Run the classifier once and reuse its output; calling it separately for
# the scores and for the labels repeats the same inference.
prediction = pipe(sequence, fiction_categories)
max_index = np.argmax(prediction["scores"])
max_label = prediction["labels"][max_index]
max_label

In [None]:
def generate_predictions(sequence, fiction_categories):
    """Return the candidate label that the classifier scores highest.

    Args:
        sequence: Text to classify.
        fiction_categories: Candidate labels passed to the module-level
            ``pipe`` classifier.

    Returns:
        The entry of the result's ``"labels"`` list located at the position
        of the largest value in its ``"scores"`` list.
    """
    result = pipe(sequence, fiction_categories)
    best_position = np.argmax(result["scores"])
    return result["labels"][best_position]

In [None]:
from tqdm import tqdm

# Ground-truth and predicted labels for the evaluation sample.
# ("predicated_cats" keeps its original spelling because later cells read it.)
actual_cats = []
predicated_cats = []

# Filter once instead of on every iteration, and cap at the first 300 rows
# so a smaller dataset does not raise a KeyError mid-loop.
fiction_sample = books.loc[books["simple_category"] == "Fiction", "description"].head(300)

for sequence in tqdm(fiction_sample):
    predicated_cats.append(generate_predictions(sequence, fiction_categories))
    actual_cats.append("Fiction")

In [None]:
# Filter once instead of on every iteration, and cap at the first 300 rows
# so a smaller dataset does not raise a KeyError mid-loop.
nonfiction_sample = books.loc[books["simple_category"] == "Nonfiction", "description"].head(300)

# NOTE: this cell appends to the lists created in the previous cell, so
# re-running it on its own adds the Nonfiction sample a second time.
for sequence in tqdm(nonfiction_sample):
    predicated_cats.append(generate_predictions(sequence, fiction_categories))
    actual_cats.append("Nonfiction")

In [None]:
# Put the known and predicted labels side by side for comparison.
predictions_df = pd.DataFrame(
    {
        "actual_categories": actual_cats,
        "predicted_categories": predicated_cats,
    }
)

In [None]:
# Display the actual-vs-predicted table.
predictions_df

In [None]:
# 1 where the predicted label agrees with the known category, otherwise 0.
labels_match = predictions_df["actual_categories"].eq(predictions_df["predicted_categories"])
predictions_df["correct_prediction"] = np.where(labels_match, 1, 0)

In [None]:
# Accuracy on the evaluation sample: the share of rows flagged as correct.
predictions_df["correct_prediction"].mean()

In [None]:
 # Books whose simple_category is still missing, re-indexed from 0 so the
 # loop in the next cell can address them by position.
 missing_cats = (
     books.loc[books["simple_category"].isna(), ["isbn13", "description"]]
     .reset_index(drop=True)
 )

 # Accumulators filled by the prediction loop in the next cell.
 isbns, predicted_cats = [], []

In [None]:
# Predict a label for every uncategorised book and record its ISBN alongside.
for row in tqdm(range(len(missing_cats))):
    description = missing_cats.at[row, "description"]
    predicted_cats.append(generate_predictions(description, fiction_categories))
    isbns.append(missing_cats.at[row, "isbn13"])

In [None]:
# One row per previously uncategorised book: its ISBN and predicted label.
missing_predictions_df = pd.DataFrame(
    {
        "isbn13": isbns,
        "predicted_categories": predicted_cats,
    }
)

In [None]:
# Display the predicted labels for the previously uncategorised books.
missing_predictions_df

In [None]:
# Attach the predictions by ISBN, use them only where simple_category is
# missing, then discard the helper column.
books = books.merge(missing_predictions_df, on="isbn13", how="left")
books["simple_category"] = books["simple_category"].fillna(books["predicted_categories"])
books = books.drop(columns="predicted_categories")

In [None]:
# Persist the table, now including the filled-in simple_category column,
# without writing the index.
books.to_csv("books_with_categories.csv", index=False)

In [None]:
from transformers import pipeline
import torch

# Prefer Apple's MPS backend when it is available, then CUDA, then CPU, so
# the notebook also runs on machines without an Apple GPU.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    classifier_device = "mps"
elif torch.cuda.is_available():
    classifier_device = "cuda"
else:
    classifier_device = "cpu"

# NOTE(review): top_k=None is expected to return a score for every emotion
# label rather than only the best one — confirm against the pipeline docs.
classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    device=classifier_device,
)

# Quick sanity check on a short sentence.
classifier("I love this!")

In [None]:
import numpy as np

# Emotion names used as score columns. "neutral" was previously misspelled
# here, which disagreed with the list used by the scoring loop in a later cell.
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

# Accumulators: one ISBN per book and, per emotion, one score per book.
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calc_max_emotion_score(predictions):
    """Return, for each emotion, the highest score seen across all sentences.

    Args:
        predictions: One entry per sentence; each entry is a list of
            ``{"label": ..., "score": ...}`` dicts, one per emotion label.

    Returns:
        Dict mapping each name in the module-level ``emotion_labels`` to the
        maximum score that label received over ``predictions``.

    Raises:
        KeyError: If a sentence's prediction lacks one of ``emotion_labels``.
    """
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        # Look scores up by label name. Pairing an alphabetically sorted
        # prediction with emotion_labels by position only works when
        # emotion_labels is itself alphabetical; it is not ("neutral" sorts
        # before "sadness"), so positional pairing stores scores under the
        # wrong emotion.
        score_by_label = {entry["label"]: entry["score"] for entry in prediction}
        for label in emotion_labels:
            per_emotion_scores[label].append(score_by_label[label])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [None]:
from tqdm import tqdm

# Reset the accumulators so this cell starts from a clean slate each run.
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

# For every book: score each "."-separated piece of its description and keep
# the per-emotion maximum.
for row in tqdm(range(len(books))):
    isbn.append(books.at[row, "isbn13"])
    sentences = books.at[row, "description"].split(".")
    best_scores = calc_max_emotion_score(classifier(sentences))
    for label, collected in emotion_scores.items():
        collected.append(best_scores[label])

In [None]:
# One column per emotion plus the ISBN, so the scores can be joined to books.
emotions_df = pd.DataFrame(emotion_scores).assign(isbn13=isbn)

In [None]:
# Display the per-book emotion scores.
emotions_df

In [None]:
# Attach the emotion score columns to each book by ISBN.
books = books.merge(emotions_df, on="isbn13")

In [None]:
# Persist the table, now including the emotion score columns, without
# writing the index.
books.to_csv("books_with_emotions.csv", index=False)